diff --git a/Makefile b/Makefile index 4084615e..151cca5f 100644 --- a/Makefile +++ b/Makefile @@ -1,2 +1 @@ -obj-y += uds/ obj-y += vdo/ diff --git a/dm-vdo.rst b/dm-vdo.rst new file mode 100644 index 00000000..ebe19a5a --- /dev/null +++ b/dm-vdo.rst @@ -0,0 +1,377 @@ +dm-vdo +====== + +The dm-vdo device mapper target provides block-level deduplication, +compression, and thin provisioning. As a device mapper target, it can add these +features to the storage stack, compatible with any filesystem. VDO does not +protect against data corruption, and relies on the data integrity of the +storage below it, for instance using a RAID device. + +Userspace component +=================== + +Formatting a VDO volume requires the use of the 'vdoformat' tool, available at: + +https://github.com/dm-vdo/vdo/ + +Other userspace tools are also available for inspecting VDO volumes. + +It is strongly recommended that lvm be used to manage VDO volumes. See +lvm-vdo(7). + +Target interface +================ + +Table line +---------- + +:: + + <offset> <logical device size> vdo V4 <storage device> + <storage device size> <minimum IO size> <block map cache size> + <block map era length> [optional arguments] + + +Required parameters: + + offset: + The offset, in sectors, at which the VDO volume's + logical space begins. + + logical device size: + The size of the device which the VDO volume will + service, in sectors. Must match the current logical size + of the VDO volume. + + storage device: + The device holding the VDO volume's data and metadata. + + storage device size: + The size of the device to use for the VDO volume, as a + number of 4096-byte blocks. Must match the current size + of the VDO volume. + + minimum IO size: + The minimum IO size for this VDO volume to accept, in + bytes. Valid values are 512 or 4096. The recommended + value is 4096. + + block map cache size: + The size of the block map cache, as a number of 4096-byte + blocks. The minimum and recommended value is 32768 + blocks. If the logical thread count is non-zero, the + cache size must be at least 4096 blocks per logical + thread. + + block map era length: + The speed with which the block map cache writes out + modified block map pages. A smaller era length is + likely to reduce the amount of time spent rebuilding, at + the cost of increased block map writes during normal + operation. The maximum and recommended value is 16380; + the minimum value is 1. + +Optional parameters: +-------------------- +Some or all of these parameters may be specified as <key> <value> pairs. + +Thread related parameters: + +VDO assigns different categories of work to separate thread groups, and the +number of threads in each group can be configured separately. Most groups can +use up to 100 threads. + +If <hash>, <logical>, and <physical> are all set to 0, the work handled by all +three thread types will be handled by a single thread. If any of these values +are non-zero, all of them must be non-zero. + + ack: + The number of threads used to complete bios. Since + completing a bio calls an arbitrary completion function + outside the VDO volume, threads of this type allow the + VDO volume to continue processing requests even when bio + completion is slow. The default is 1. + + bio: + The number of threads used to issue bios to the + underlying storage. Threads of this type allow the VDO + volume to continue processing requests even when bio + submission is slow. The default is 4. + + bioRotationInterval: + The number of bios to enqueue on each bio thread before + switching to the next thread. The value must be greater + than 0 and not more than 1024; the default is 64.
+ + cpu: + The number of threads used to do CPU-intensive work, + such as hashing and compression. The default is 1. + + hash: + The number of threads used to manage data comparisons for + deduplication based on the hash value of data blocks. + The default is 1. + + logical: + The number of threads used to manage caching and locking based + on the logical address of incoming bios. The default is 0; the + maximum is 60. + + physical: + The number of threads used to manage administration of + the underlying storage device. At format time, a slab + size for the VDO is chosen; the VDO storage device must + be large enough to have at least 1 slab per physical + thread. The default is 0; the maximum is 16. + +Miscellaneous parameters: + + maxDiscard: + The maximum size of discard bio accepted, in 4096-byte + blocks. I/O requests to a VDO volume are normally split + into 4096-byte blocks, and processed up to 2048 at a + time. However, discard requests to a VDO volume can be + automatically split to a larger size, up to + <maxDiscard> 4096-byte blocks in a single bio, and are + limited to 1500 at a time. Increasing this value may + provide better overall performance, at the cost of + increased latency for the individual discard requests. + The default and minimum is 1; the maximum is + UINT_MAX / 4096. + + deduplication: + Whether deduplication should be started. The default is 'on'; + the acceptable values are 'on' and 'off'. + +Device modification +------------------- + +A modified table may be loaded into a running, non-suspended VDO volume. The +modifications will take effect when the device is next resumed. The modifiable +parameters are <logical device size>, <physical device size>, <maxDiscard>, +<compression>, and <deduplication>. + +If the logical device size or physical device size is changed, upon successful +resume VDO will store the new values and require them on future startups. These +two parameters may not be decreased. The logical device size may not exceed 4 +PB. The physical device size must increase by at least 32832 4096-byte blocks +if at all, and must not exceed the size of the underlying storage device. +Additionally, when formatting the VDO device, a slab size is chosen: the +physical device size may never increase above the size which provides 8192 +slabs, and each increase must be large enough to add at least one new slab. + + +Examples: + +Start a previously-formatted VDO volume with 1G logical space and 1G physical +space, storing to /dev/dm-1 which has more than 1G space. + +:: + + dmsetup create vdo0 --table \ + "0 2097152 vdo V4 /dev/dm-1 262144 4096 32768 16380" + +Grow the logical size to 4G. + +:: + + dmsetup reload vdo0 --table \ + "0 8388608 vdo V4 /dev/dm-1 262144 4096 32768 16380" + dmsetup resume vdo0 + +Grow the physical size to 2G. + +:: + + dmsetup reload vdo0 --table \ + "0 8388608 vdo V4 /dev/dm-1 524288 4096 32768 16380" + dmsetup resume vdo0 + +Grow the physical size by 1G more and increase the maximum discard size. + +:: + + dmsetup reload vdo0 --table \ + "0 10485760 vdo V4 /dev/dm-1 786432 4096 32768 16380 maxDiscard 8" + dmsetup resume vdo0 + +Stop the VDO volume. + +:: + + dmsetup remove vdo0 + +Start the VDO volume again. Note that the logical and physical device sizes +must still match, but other parameters can change. + +:: + + dmsetup create vdo1 --table \ + "0 10485760 vdo V4 /dev/dm-1 786432 512 65550 5000 hash 1 logical 3 physical 2" + +Messages +-------- +VDO devices accept several messages for adjusting various parameters on the +fly.
All of them may be sent in the form + +:: + dmsetup message vdo1 0 <message-name> + +Possible messages are: + + stats: Outputs the current view of the VDO statistics. Mostly used by + the vdostats userspace program to interpret the output buffer. + + dump: Dumps many internal structures to the system log. This is not + always safe to run, so it should only be used to debug a + hung VDO. Optional params to specify structures to dump are: + + viopool: The pool of structures used for user IO inside VDO + pools: A synonym of 'viopool' + vdo: Most of the structures managing on-disk data + queues: Basic information about each thread VDO is using + threads: A synonym of 'queues' + default: Equivalent to 'queues vdo' + all: All of the above. + + dump-on-shutdown: Perform a default dump next time VDO shuts down. + + compression: Can be used to change whether compression is enabled + without shutting VDO down. Must have either "on" or "off" + specified. + + index-create: Reformat the deduplication index belonging to this VDO. + + index-close: Turn off and save the deduplication index belonging to + this VDO. + + index-enable: Enable deduplication. + index-disable: Disable deduplication. + + +Status +------ + +:: + + <device> <operating mode> <in recovery> <index state> + <compression state> <used physical blocks> <total physical blocks> + + device: + The name of the VDO volume. + + operating mode: + The current operating mode of the VDO volume; values + may be 'normal', 'recovering' (the volume has + detected an issue with its metadata and is attempting + to repair itself), and 'read-only' (an error has + occurred that forces the VDO volume to only support + read operations and not writes). + + in recovery: + Whether the VDO volume is currently in recovery + mode; values may be 'recovering' or '-', which + indicates it is not recovering. + + index state: + The current state of the deduplication index in the + VDO volume; values may be 'closed', 'closing', + 'error', 'offline', 'online', 'opening', and + 'unknown'. + + compression state: + The current state of compression in the VDO volume; + values may be 'offline' and 'online'. + + used physical blocks: + The number of physical blocks in use by the VDO + volume. + + total physical blocks: + The total number of physical blocks the VDO volume + may use; the difference between this value and the + <used physical blocks> is the number of blocks the + VDO volume has left before being full. + +Runtime Use of VDO +================== + +There are a couple of crucial VDO behaviors to keep in mind when running +workloads on VDO: + +- When blocks are no longer in use, sending a discard request for those blocks + lets VDO potentially mark that space as free. Future read requests to that + block can then instantly return after reading the block map, instead of + performing an IO request to read the outdated block. Additionally, if the + VDO is thin provisioned, discarding unused blocks is absolutely necessary to + prevent VDO from running out of space. For a filesystem, mounting with the + -o discard option or running fstrim regularly will perform the necessary discards. + +- VDO is resilient against crashes if the underlying storage properly honors + flush requests, and VDO itself properly honors flushes. Specifically, when VDO + receives a write request, it will allocate space for that request and then + complete the request; there is no guarantee that the request will be + reflected on disk until a subsequent flush request to VDO is finished. If + a filesystem is in use on the VDO device, files should be fsync()d after + being written in order for their data to be reliably persisted.
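+ + As a purely illustrative sketch (the mount point and file name here are placeholders, not taken from the examples above), copying data onto a filesystem mounted on a VDO device and forcing it to stable storage might look like: + + :: + + dd if=/path/to/data of=/mnt/vdo-fs/file bs=4096 conv=fsync + + The conv=fsync flag makes dd call fsync() on the output file before exiting, which issues the flush that ensures the written blocks are persistent on the VDO volume.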
+ +- VDO has high throughput at high IO depths -- it can process up to 2048 IO + requests in parallel. While much of this processing happens after incoming + requests are finished, for optimal performance a moderate IO depth will + greatly outperform a low IO depth. + +Tuning +====== + +The VDO device has many options, and it can be difficult to make optimal +choices without perfect knowledge of the workload. Additionally, most +configuration options must be set when VDO is started, and cannot be changed +without shutting the VDO down completely, so the configuration cannot be +easily changed while the workload is active. Therefore, before using VDO in +production, the optimal values for your hardware should be chosen by testing +with a simulated workload. + +The most important value to adjust is the block map cache size. VDO maintains +a table of mappings from logical block addresses to physical block addresses +in its block map, and VDO must look up the relevant mapping in the cache when +accessing any particular block. By default, VDO allocates 128 MB of metadata +cache in RAM to support efficient access to 100 GB of logical space at a time. + +Working sets larger than the size that can fit in the configured block map +cache size will require additional I/O to service requests, thus reducing +performance. If the working set is larger than 100 GB, the block map cache size +should be scaled accordingly. + +The logical and physical thread counts should also be adjusted. A logical +thread controls a disjoint section of the block map, so additional logical +threads increase parallelism and can increase throughput. Physical threads +control a disjoint section of the data blocks, so additional physical threads +can also increase throughput. However, excess threads can waste resources and +increase contention. + +Bio submission threads control the parallelism involved in sending IOs to the +underlying storage; fewer threads mean there is more opportunity to reorder +IO requests for performance benefit, but also that each IO request has to wait +longer before being submitted. + +Bio acknowledgement threads control parallelism in finishing IO requests when +VDO is ready to mark them as done. Usually one is sufficient, but occasionally, +especially if the bio has a CPU-heavy callback function, multiple are needed. + +CPU threads are used for hashing and for compression; in workloads with +compression enabled, more threads may result in higher throughput. + +Hash threads are used to sort active requests by hash and determine whether +they should deduplicate; the most CPU-intensive action done by these threads +is the comparison of 4096-byte data blocks. + +The optimal thread counts vary with your precise hardware and configuration, +and the default values are reasonable choices. For one high-end, many-CPU +system with NVMe storage, the best throughput has been seen with 4 logical, 3 +physical, 4 cpu, 2 hash, 8 bio, and 2 ack threads. + + + .. + Version History + =============== + TODO diff --git a/kvdo.spec b/kvdo.spec index 7ea21cb6..9666cbb7 100644 --- a/kvdo.spec +++ b/kvdo.spec @@ -1,6 +1,6 @@ %define spec_release 1 %define kmod_name kvdo -%define kmod_driver_version 8.1.1.371 +%define kmod_driver_version 8.2.0.2 %define kmod_rpm_release %{spec_release} %define kmod_kernel_version 3.10.0-693.el7 @@ -27,7 +27,7 @@ BuildRequires: glibc %if 0%{?rhel} && 0%{?rhel} < 9 # Fedora doesn't have abi whitelists, # And RHEL9 doesn't have it yet.
-BuildRequires: kernel-abi-whitelists +BuildRequires: kernel-abi-whitelists %endif BuildRequires: libuuid-devel BuildRequires: redhat-rpm-config @@ -53,9 +53,9 @@ set -x /usr/sbin/dkms --rpm_safe_upgrade install -m %{kmod_name} -v %{version} %preun -# Check whether kvdo or uds is loaded, and if so attempt to remove it. A -# failure here means there is still something using the module, which should be -# cleared up before attempting to remove again. +# Check whether kvdo is loaded, and if so attempt to remove it. A +# failure here means there is still something using the module, which +# should be cleared up before attempting to remove again. for module in kvdo uds; do if grep -q "^${module}" /proc/modules; then modprobe -r ${module} @@ -78,15 +78,12 @@ PACKAGE_NAME="kvdo" PACKAGE_VERSION="%{version}" AUTOINSTALL="yes" -BUILT_MODULE_NAME[0]="uds" -BUILT_MODULE_LOCATION[0]="uds" +BUILT_MODULE_NAME[0]="kvdo" +BUILT_MODULE_LOCATION[0]="vdo" DEST_MODULE_LOCATION[0]="/kernel/drivers/block/" +BUILD_DEPENDS[0]=LZ4_COMPRESS +BUILD_DEPENDS[0]=LZ4_DECOMPRESS STRIP[0]="no" - -BUILT_MODULE_NAME[1]="kvdo" -BUILT_MODULE_LOCATION[1]="vdo" -DEST_MODULE_LOCATION[1]="/kernel/drivers/block/" -STRIP[1]="no" EOF %clean @@ -97,5 +94,5 @@ rm -rf $RPM_BUILD_ROOT %{_usr}/src/%{kmod_name}-%{version} %changelog -* Thu Mar 03 2022 - Red Hat VDO Team - 8.1.1.371-1 +* Sun Jul 17 2022 - Red Hat VDO Team - 8.2.0.2-1 - See https://github.com/dm-vdo/kvdo.git diff --git a/uds/Makefile b/uds/Makefile deleted file mode 100644 index 7b66444e..00000000 --- a/uds/Makefile +++ /dev/null @@ -1,16 +0,0 @@ -UDS_VERSION = 8.1.1.70 - -SOURCES = $(notdir $(wildcard $(src)/*.c)) murmur/MurmurHash3.c -SOURCES += $(addprefix util/,$(notdir $(wildcard $(src)/util/*.c))) -OBJECTS = $(SOURCES:%.c=%.o) -INCLUDES = -I$(src) - -EXTRA_CFLAGS = -fno-builtin-memset \ - $(if $(CONFIG_KASAN),,-Wframe-larger-than=400) \ - -Wno-declaration-after-statement \ - -DUDS_VERSION=\"$(UDS_VERSION)\" \ - $(INCLUDES) - -obj-m += uds.o - -uds-objs = $(OBJECTS) diff --git a/uds/bits.c b/uds/bits.c deleted file mode 100644 index 8ecd00f5..00000000 --- a/uds/bits.c +++ /dev/null @@ -1,198 +0,0 @@ -/* - * Copyright Red Hat - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/uds-releases/krusty/src/uds/bits.c#10 $ - */ - -#include "bits.h" - -#include "compiler.h" - -/** - * This is the largest field size supported by get_big_field & set_big_field. - * Any field that is larger is not guaranteed to fit in a single, byte - * aligned uint64_t. 
- **/ -enum { MAX_BIG_FIELD_BITS = (sizeof(uint64_t) - 1) * CHAR_BIT + 1 }; - -/** - * Get a big bit field from a bit stream - * - * @param memory The base memory byte address - * @param offset The bit offset into the memory for the start of the field - * @param size The number of bits in the field - * - * @return the bit field - **/ -static INLINE uint64_t get_big_field(const byte *memory, - uint64_t offset, - int size) -{ - const void *addr = memory + offset / CHAR_BIT; - return (get_unaligned_le64(addr) >> (offset % CHAR_BIT)) & - ((1UL << size) - 1); -} - -/** - * Set a big bit field in a bit stream - * - * @param value The value to put into the field - * @param memory The base memory byte address - * @param offset The bit offset into the memory for the start of the field - * @param size The number of bits in the field - **/ -static INLINE void -set_big_field(uint64_t value, byte *memory, uint64_t offset, int size) -{ - void *addr = memory + offset / CHAR_BIT; - int shift = offset % CHAR_BIT; - uint64_t data = get_unaligned_le64(addr); - data &= ~(((1UL << size) - 1) << shift); - data |= value << shift; - put_unaligned_le64(data, addr); -} - -/**********************************************************************/ -void get_bytes(const byte *memory, uint64_t offset, byte *destination, int size) -{ - const byte *addr = memory + offset / CHAR_BIT; - int shift = offset % CHAR_BIT; - while (--size >= 0) { - *destination++ = get_unaligned_le16(addr++) >> shift; - } -} - -/**********************************************************************/ -void set_bytes(byte *memory, uint64_t offset, const byte *source, int size) -{ - byte *addr = memory + offset / CHAR_BIT; - int shift = offset % CHAR_BIT; - uint16_t mask = ~((uint16_t) 0xFF << shift); - while (--size >= 0) { - uint16_t data = (get_unaligned_le16(addr) & mask) | - (*source++ << shift); - put_unaligned_le16(data, addr++); - } -} - -/**********************************************************************/ -void move_bits(const byte *s_memory, - uint64_t source, - byte *d_memory, - uint64_t destination, - int size) -{ - enum { UINT32_BIT = sizeof(uint32_t) * CHAR_BIT }; - if (size > MAX_BIG_FIELD_BITS) { - if (source > destination) { - const byte *src; - byte *dest; - int offset; - // This is a large move from a higher to a lower - // address. We move the lower addressed bits first. - // Start by moving one field that ends on a destination - // int boundary - int count = - MAX_BIG_FIELD_BITS - - (destination + MAX_BIG_FIELD_BITS) % UINT32_BIT; - uint64_t field = - get_big_field(s_memory, source, count); - set_big_field(field, d_memory, destination, count); - source += count; - destination += count; - size -= count; - // Now do the main loop to copy 32 bit chunks that are - // int-aligned at the destination. - offset = source % UINT32_BIT; - src = s_memory + (source - offset) / CHAR_BIT; - dest = d_memory + destination / CHAR_BIT; - while (size > MAX_BIG_FIELD_BITS) { - put_unaligned_le32(get_unaligned_le64(src) >> - offset, - dest); - src += sizeof(uint32_t); - dest += sizeof(uint32_t); - source += UINT32_BIT; - destination += UINT32_BIT; - size -= UINT32_BIT; - } - } else { - const byte *src; - byte *dest; - // This is a large move from a lower to a higher - // address. We move the higher addressed bits first. 
- // Start by moving one field that begins on a - // destination int boundary - int offset, count = (destination + size) % UINT32_BIT; - if (count > 0) { - uint64_t field; - size -= count; - field = get_big_field(s_memory, source + size, - count); - set_big_field(field, - d_memory, - destination + size, - count); - } - // Now do the main loop to copy 32 bit chunks that are - // int-aligned at the destination. - offset = (source + size) % UINT32_BIT; - src = s_memory + (source + size - offset) / CHAR_BIT; - dest = d_memory + (destination + size) / CHAR_BIT; - while (size > MAX_BIG_FIELD_BITS) { - src -= sizeof(uint32_t); - dest -= sizeof(uint32_t); - size -= UINT32_BIT; - put_unaligned_le32(get_unaligned_le64(src) >> - offset, - dest); - } - } - } - // Finish up by doing the last chunk, which can have any arbitrary - // alignment - if (size > 0) { - uint64_t field = get_big_field(s_memory, source, size); - set_big_field(field, d_memory, destination, size); - } -} - -/**********************************************************************/ -bool same_bits(const byte *mem1, - uint64_t offset1, - const byte *mem2, - uint64_t offset2, - int size) -{ - while (size >= MAX_FIELD_BITS) { - unsigned int field1 = get_field(mem1, offset1, MAX_FIELD_BITS); - unsigned int field2 = get_field(mem2, offset2, MAX_FIELD_BITS); - if (field1 != field2) - return false; - offset1 += MAX_FIELD_BITS; - offset2 += MAX_FIELD_BITS; - size -= MAX_FIELD_BITS; - } - if (size > 0) { - unsigned int field1 = get_field(mem1, offset1, size); - unsigned int field2 = get_field(mem2, offset2, size); - if (field1 != field2) - return false; - } - return true; -} diff --git a/uds/bits.h b/uds/bits.h deleted file mode 100644 index c3a28335..00000000 --- a/uds/bits.h +++ /dev/null @@ -1,204 +0,0 @@ -/* - * Copyright Red Hat - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/uds-releases/krusty/src/uds/bits.h#8 $ - */ - -#ifndef BITS_H -#define BITS_H 1 - -#include "compiler.h" -#include "numeric.h" -#include "typeDefs.h" - -/* - * These bit stream and bit field utility routines are used for the - * non-byte aligned delta indices. - * - * Bits and bytes are numbered in little endian order. For example: Within - * a byte, bit 0 is the least significant bit (0x1), and bit 7 is the most - * significant bit (0x80). Within a bit stream, bit 7 is the most - * signficant bit of byte 0, and bit 8 is the least significant bit of byte - * 1. Within a byte array, a byte's number corresponds to it's index in - * the array. - * - * The implementation assumes that the native machine is little endian, and - * that performance is very important. These assumptions match our current - * operating environment. - */ - -/** - * This is the largest field size supported by get_field & set_field. 
Any - * field that is larger is not guaranteed to fit in a single, byte aligned - * uint32_t. - **/ -enum { MAX_FIELD_BITS = (sizeof(uint32_t) - 1) * CHAR_BIT + 1 }; - -/** - * This is the number of guard bytes needed at the end of the memory byte - * array when using the bit utilities. 3 bytes are needed when get_field & - * set_field access a field, because they will access some "extra" bytes - * past the end of the field. And 7 bytes are needed when getBigField & - * set_big_field access a big field, for the same reason. Note that move_bits - * calls get_big_field & set_big_field. 7 is rewritten to make it clear how it - * is derived. - **/ -enum { POST_FIELD_GUARD_BYTES = sizeof(uint64_t) - 1 }; - -/** - * Get a bit field from a bit stream - * - * @param memory The base memory byte address - * @param offset The bit offset into the memory for the start of the field - * @param size The number of bits in the field - * - * @return the bit field - **/ -static INLINE unsigned int -get_field(const byte *memory, uint64_t offset, int size) -{ - const void *addr = memory + offset / CHAR_BIT; - return (get_unaligned_le32(addr) >> (offset % CHAR_BIT)) & - ((1 << size) - 1); -} - -/** - * Set a bit field in a bit stream - * - * @param value The value to put into the field - * @param memory The base memory byte address - * @param offset The bit offset into the memory for the start of the field - * @param size The number of bits in the field - **/ -static INLINE void -set_field(unsigned int value, byte *memory, uint64_t offset, int size) -{ - void *addr = memory + offset / CHAR_BIT; - int shift = offset % CHAR_BIT; - uint32_t data = get_unaligned_le32(addr); - data &= ~(((1 << size) - 1) << shift); - data |= value << shift; - put_unaligned_le32(data, addr); -} - -/** - * Set a bit field in a bit stream to all ones - * - * @param memory The base memory byte address - * @param offset The bit offset into the memory for the start of the field - * @param size The number of bits in the field - **/ -static INLINE void set_one(byte *memory, uint64_t offset, int size) -{ - if (size > 0) { - byte *addr = memory + offset / CHAR_BIT; - int shift = offset % CHAR_BIT; - int count = size + shift > CHAR_BIT ? CHAR_BIT - shift : size; - *addr++ |= ((1 << count) - 1) << shift; - for (size -= count; size > CHAR_BIT; size -= CHAR_BIT) { - *addr++ = 0xFF; - } - if (size) { - *addr |= ~(0xFF << size); - } - } -} - -/** - * Set a bit field in a bit stream to all zeros - * - * @param memory The base memory byte address - * @param offset The bit offset into the memory for the start of the field - * @param size The number of bits in the field - **/ -static INLINE void set_zero(byte *memory, uint64_t offset, int size) -{ - if (size > 0) { - byte *addr = memory + offset / CHAR_BIT; - int shift = offset % CHAR_BIT; - int count = size + shift > CHAR_BIT ? CHAR_BIT - shift : size; - *addr++ &= ~(((1 << count) - 1) << shift); - for (size -= count; size > CHAR_BIT; size -= CHAR_BIT) { - *addr++ = 0; - } - if (size) { - *addr &= 0xFF << size; - } - } -} - -/** - * Get a byte stream from a bit stream, reading a whole number of bytes - * from an arbitrary bit boundary. 
- * - * @param memory The base memory byte address for the bit stream - * @param offset The bit offset of the start of the bit stream - * @param destination Where to store the bytes - * @param size The number of bytes - **/ -void get_bytes(const byte *memory, - uint64_t offset, - byte *destination, - int size); - -/** - * Store a byte stream into a bit stream, writing a whole number of bytes - * to an arbitrary bit boundary. - * - * @param memory The base memory byte address for the bit stream - * @param offset The bit offset of the start of the bit stream - * @param source Where to read the bytes - * @param size The number of bytes - **/ -void set_bytes(byte *memory, uint64_t offset, const byte *source, int size); - -/** - * Move bits from one field to another. When the fields overlap, behave as - * if we first move all the bits from the source to a temporary value, and - * then move all the bits from the temporary value to the destination. - * - * @param s_memory The base source memory byte address - * @param source Bit offset into memory for the source start - * @param d_memory The base destination memory byte address - * @param destination Bit offset into memory for the destination start - * @param size The number of bits in the field - **/ -void move_bits(const byte *s_memory, - uint64_t source, - byte *d_memory, - uint64_t destination, - int size); - -/** - * Compare bits from one field to another, testing for sameness - * - * @param mem1 The base memory byte address (first field) - * @param offset1 Bit offset into the memory for the start (first field) - * @param mem2 The base memory byte address (second field) - * @param offset2 Bit offset into the memory for the start (second field) - * @param size The number of bits in the field - * - * @return true if fields are the same, false if different - **/ -bool __must_check same_bits(const byte *mem1, - uint64_t offset1, - const byte *mem2, - uint64_t offset2, - int size); - -#endif /* BITS_H */ diff --git a/uds/buffer.h b/uds/buffer.h deleted file mode 100644 index bfb5d5fb..00000000 --- a/uds/buffer.h +++ /dev/null @@ -1,471 +0,0 @@ -/* - * Copyright Red Hat - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/uds-releases/krusty/src/uds/buffer.h#9 $ - */ - -#ifndef BUFFER_H -#define BUFFER_H - -#include "common.h" - -struct buffer; - -/** - * Create a buffer which wraps an existing byte array. - * - * @param bytes The bytes to wrap - * @param length The length of the buffer - * @param content_length The length of the current contents of the buffer - * @param buffer_ptr A pointer to hold the buffer - * - * @return UDS_SUCCESS or an error code - **/ -int __must_check wrap_buffer(byte *bytes, - size_t length, - size_t content_length, - struct buffer **buffer_ptr); - -/** - * Create a new buffer and allocate its memory. 
- * - * @param length The length of the buffer - * @param buffer_ptr A pointer to hold the buffer - * - * @return UDS_SUCCESS or an error code - **/ -int __must_check make_buffer(size_t length, struct buffer **buffer_ptr); - -/** - * Release a buffer and, if not wrapped, free its memory. - * - * @param buffer The buffer to release - **/ -void free_buffer(struct buffer *buffer); - -/** - * Ensure that a buffer has a given amount of space available, compacting the - * buffer if necessary. - * - * @param buffer The buffer - * @param bytes The number of available bytes desired - * - * @return true if the requested number of bytes are now available - **/ -bool __must_check ensure_available_space(struct buffer *buffer, size_t bytes); - -/** - * Clear the buffer. The start position is set to zero and the end position - * is set to the buffer length. - **/ -void clear_buffer(struct buffer *buffer); - -/** - * Eliminate buffer contents which have been extracted. This function copies - * any data between the start and end pointers to the beginning of the buffer, - * moves the start pointer to the beginning, and the end pointer to the end - * of the copied data. - * - * @param buffer The buffer to compact - **/ -void compact_buffer(struct buffer *buffer); - -/** - * Skip forward the specified number of bytes in a buffer (advance the - * start pointer). - * - * @param buffer The buffer - * @param bytes_to_skip The number of bytes to skip - * - * @return UDS_SUCCESS or UDS_BUFFER_ERROR if the buffer is not long - * enough to skip forward the requested number of bytes - **/ -int __must_check skip_forward(struct buffer *buffer, size_t bytes_to_skip); - -/** - * Rewind the specified number of bytes in a buffer (back up the start - * pointer). - * - * @param buffer The buffer - * @param bytes_to_rewind The number of bytes to rewind - * - * @return UDS_SUCCESS or UDS_BUFFER_ERROR if the buffer is not long - * enough to rewind backward the requested number of bytes - **/ -int __must_check rewind_buffer(struct buffer *buffer, size_t bytes_to_rewind); - -/** - * Return the length of the buffer. - * - * @param buffer the buffer - * - * @return the buffer length - **/ -size_t buffer_length(struct buffer *buffer); - -/** - * Compute the amount of data current in the buffer. - * - * @param buffer The buffer to examine - * - * @return The number of bytes between the start and end pointers of the buffer - **/ -size_t content_length(struct buffer *buffer); - -/** - * Compute the amount of available space in this buffer. - * - * @param buffer The buffer to examine - * - * @return The number of bytes between the end pointer and the end of the buffer - **/ -size_t available_space(struct buffer *buffer); - -/** - * Amount of buffer that has already been processed. - * - * @param buffer the buffer to examine - * - * @return The number of bytes between the beginning of the buffer and the - * start pointer. - **/ -size_t uncompacted_amount(struct buffer *buffer); - -/** - * Return the amount of the buffer that is currently utilized. - * - * @param buffer the buffer to examine - * - * @return The number of bytes between the beginning of the buffer and - * the end pointer. - **/ -size_t buffer_used(struct buffer *buffer); - -/** - * Reset the end of buffer to a different position. 
- * - * @param buffer the buffer - * @param end the new end of the buffer - * - * @return UDS_SUCCESS unless the end is larger than can fit - **/ -int __must_check reset_buffer_end(struct buffer *buffer, size_t end); - -/** - * Check whether the start of the content of a buffer matches a specified - * array of bytes. - * - * @param buffer The buffer to check - * @param data The desired data - * @param length The length of the desired data - * - * @return true if the first length bytes of the buffer's - * contents match data - **/ -bool __must_check -has_same_bytes(struct buffer *buffer, const byte *data, size_t length); - -/** - * Check whether two buffers have the same contents. - * - * @param buffer1 The first buffer - * @param buffer2 The second buffer - * - * @return true if the contents of the two buffers are the - * same - **/ -bool equal_buffers(struct buffer *buffer1, struct buffer *buffer2); - -/** - * Get a single byte from a buffer and advance the start pointer. - * - * @param buffer The buffer - * @param byte_ptr A pointer to hold the byte - * - * @return UDS_SUCCESS or UDS_BUFFER_ERROR if there are no bytes to - * retrieve - **/ -int __must_check get_byte(struct buffer *buffer, byte *byte_ptr); - -/** - * Put a single byte into a buffer and advance the end pointer. - * - * @param buffer The buffer - * @param b The byte to put - * - * @return UDS_SUCCESS or UDS_BUFFER_ERROR if there is no space in the buffer - **/ -int __must_check put_byte(struct buffer *buffer, byte b); - -/** - * Get bytes out of a buffer and advance the start of the buffer past the - * copied data. - * - * @param buffer The buffer from which to copy - * @param length The number of bytes to copy - * @param destination A pointer to hold the data - * - * @return UDS_SUCCESS or an error code - **/ -int __must_check -get_bytes_from_buffer(struct buffer *buffer, size_t length, void *destination); - -/** - * Get a pointer to the current contents of the buffer. This will be a pointer - * to the actual memory managed by the buffer. It is the caller's responsibility - * to ensure that the buffer is not modified while this pointer is in use. - * - * @param buffer The buffer from which to get the contents - * - * @return a pointer to the current contents of the buffer - **/ -byte *get_buffer_contents(struct buffer *buffer); - -/** - * Copy bytes out of a buffer and advance the start of the buffer past the - * copied data. Memory will be allocated to hold the copy. - * - * @param buffer The buffer from which to copy - * @param length The number of bytes to copy - * @param destination_ptr A pointer to hold the copied data - * - * @return UDS_SUCCESS or an error code - **/ -int __must_check -copy_bytes(struct buffer *buffer, size_t length, byte **destination_ptr); - -/** - * Copy bytes into a buffer and advance the end of the buffer past the - * copied data. - * - * @param buffer The buffer to copy into - * @param length The length of the data to copy - * @param source The data to copy - * - * @return UDS_SUCCESS or UDS_BUFFER_ERROR if the buffer does not have - * length bytes available - **/ -int __must_check -put_bytes(struct buffer *buffer, size_t length, const void *source); - -/** - * Copy the contents of a source buffer into the target buffer. Advances the - * start of the source buffer and the end of the target buffer past the copied - * data. 
- * - * @param target The buffer to receive the copy of the data - * @param source The buffer containing the data to copy - * @param length The length of the data to copy - * - * @return UDS_SUCCESS or UDS_BUFFER_ERROR if the target buffer does not have - * length bytes available or if the source buffer does not have length - * bytes of content - **/ -int __must_check -put_buffer(struct buffer *target, struct buffer *source, size_t length); - -/** - * Zero bytes in a buffer starting at the start pointer, and advance the - * end of the buffer past the zeros. - * - * @param buffer The buffer to zero - * @param length The number of bytes to zero - * - * @return UDS_SUCCESS or UDS_BUFFER_ERROR if the buffer does not have - * length bytes available - **/ -int __must_check zero_bytes(struct buffer *buffer, size_t length); - -/** - * Get a boolean value from a buffer and advance the start pointer. - * - * @param buffer The buffer - * @param b A pointer to hold the boolean value - * - * @return UDS_SUCCESS or UDS_BUFFER_ERROR if there is not enough data - * in the buffer - **/ -int __must_check get_boolean(struct buffer *buffer, bool *b); - -/** - * Put a boolean value into a buffer and advance the end pointer. - * - * @param buffer The buffer - * @param b The boolean to put - * - * @return UDS_SUCCESS or UDS_BUFFER_ERROR if there is no space in the buffer - **/ -int __must_check put_boolean(struct buffer *buffer, bool b); - -/** - * Get a 2 byte, little endian encoded integer from a buffer and - * advance the start pointer past it. - * - * @param buffer The buffer - * @param ui A pointer to hold the integer - * - * @return UDS_SUCCESS or UDS_BUFFER_ERROR if there are fewer than 2 - * bytes available - **/ -int __must_check get_uint16_le_from_buffer(struct buffer *buffer, uint16_t *ui); - -/** - * Put a 2 byte, little endian encoded integer into a buffer and advance the - * end pointer past it. - * - * @param buffer The buffer - * @param ui The integer to put - * - * @return UDS_SUCCESS or UDS_BUFFER_ERROR if there are fewer than 2 - * bytes available - **/ -int __must_check put_uint16_le_into_buffer(struct buffer *buffer, uint16_t ui); - -/** - * Get a series of 2 byte, little endian encoded integer from a buffer - * and advance the start pointer past them. - * - * @param buffer The buffer - * @param count The number of integers to get - * @param ui A pointer to hold the integers - * - * @return UDS_SUCCESS or UDS_BUFFER_ERROR if there is not enough data - * in the buffer - **/ -int __must_check -get_uint16_les_from_buffer(struct buffer *buffer, size_t count, uint16_t *ui); - -/** - * Put a series of 2 byte, little endian encoded integers into a - * buffer and advance the end pointer past them. - * - * @param buffer The buffer - * @param count The number of integers to put - * @param ui A pointer to the integers - * - * @return UDS_SUCCESS or UDS_BUFFER_ERROR if there is not enough space - * in the buffer - **/ -int __must_check -put_uint16_les_into_buffer(struct buffer *buffer, - size_t count, - const uint16_t *ui); - -/** - * Get a 4 byte, little endian encoded integer from a buffer and advance the - * start pointer past it. - * - * @param buffer The buffer - * @param i A pointer to hold the integer - * - * @return UDS_SUCCESS or UDS_BUFFER_ERROR if there are fewer than 4 - * bytes available - **/ -int __must_check get_int32_le_from_buffer(struct buffer *buffer, int32_t *i); - -/** - * Get a 4 byte, little endian encoded integer from a buffer and advance the - * start pointer past it. 
- * - * @param buffer The buffer - * @param ui A pointer to hold the integer - * - * @return UDS_SUCCESS or UDS_BUFFER_ERROR if there are fewer than 4 - * bytes available - **/ -int __must_check get_uint32_le_from_buffer(struct buffer *buffer, uint32_t *ui); - -/** - * Put a 4 byte, little endian encoded integer into a buffer and advance the - * end pointer past it. - * - * @param buffer The buffer - * @param ui The integer to put - * - * @return UDS_SUCCESS or UDS_BUFFER_ERROR if there are fewer than 4 - * bytes available - **/ -int __must_check put_uint32_le_into_buffer(struct buffer *buffer, uint32_t ui); - -/** - * Get an 8 byte, little endian encoded, unsigned integer from a - * buffer and advance the start pointer past it. - * - * @param buffer The buffer - * @param ui A pointer to hold the integer - * - * @return UDS_SUCCESS or UDS_BUFFER_ERROR if there are fewer than 8 - * bytes available - **/ -int __must_check get_uint64_le_from_buffer(struct buffer *buffer, uint64_t *ui); - -/** - * Put an 8 byte, little endian encoded signed integer into a buffer - * and advance the end pointer past it. - * - * @param buffer The buffer - * @param i The integer to put - * - * @return UDS_SUCCESS or UDS_BUFFER_ERROR if there are fewer than 8 - * bytes available - **/ -int __must_check put_int64_le_into_buffer(struct buffer *buffer, int64_t i); - -/** - * Put an 8 byte, little endian encoded integer into a buffer and advance the - * end pointer past it. - * - * @param buffer The buffer - * @param ui The integer to put - * - * @return UDS_SUCCESS or UDS_BUFFER_ERROR if there are fewer than 8 - * bytes available - **/ -int __must_check put_uint64_le_into_buffer(struct buffer *buffer, uint64_t ui); - -/** - * Get a series of 8 byte, little endian encoded integer from a buffer - * and advance the start pointer past them. - * - * @param buffer The buffer - * @param count The number of integers to get - * @param ui A pointer to hold the integers - * - * @return UDS_SUCCESS or UDS_BUFFER_ERROR if there is not enough data - * in the buffer - **/ -int __must_check -get_uint64_les_from_buffer(struct buffer *buffer, size_t count, uint64_t *ui); - -/** - * Put a series of 8 byte, little endian encoded integers into a buffer and - * advance the end pointer past them. - * - * @param buffer The buffer - * @param count The number of integers to put - * @param ui A pointer to the integers - * - * @return UDS_SUCCESS or UDS_BUFFER_ERROR if there is not enough space - * in the buffer - **/ -int __must_check -put_uint64_les_into_buffer(struct buffer *buffer, - size_t count, - const uint64_t *ui); - -#endif /* BUFFER_H */ diff --git a/uds/bufferPrivate.h b/uds/bufferPrivate.h deleted file mode 100644 index bdf0edd9..00000000 --- a/uds/bufferPrivate.h +++ /dev/null @@ -1,35 +0,0 @@ -/* - * Copyright Red Hat - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. 
- * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/uds-releases/krusty/src/uds/bufferPrivate.h#3 $ - */ - -#ifndef BUFFER_PRIVATE_H -#define BUFFER_PRIVATE_H - -#include "common.h" - -struct buffer { - size_t start; - size_t end; - size_t length; - byte *data; - bool wrapped; -}; - -#endif /* BUFFER_PRIVATE_H */ diff --git a/uds/bufferedReader.c b/uds/bufferedReader.c deleted file mode 100644 index 80b860f0..00000000 --- a/uds/bufferedReader.c +++ /dev/null @@ -1,213 +0,0 @@ -/* - * Copyright Red Hat - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/uds-releases/krusty/src/uds/bufferedReader.c#17 $ - */ - -#include "bufferedReader.h" - -#include "compiler.h" -#include "ioFactory.h" -#include "logger.h" -#include "memoryAlloc.h" -#include "numeric.h" - - -struct buffered_reader { - // IO factory owning the block device - struct io_factory *br_factory; - // The dm_bufio_client to read from - struct dm_bufio_client *br_client; - // The current dm_buffer - struct dm_buffer *br_buffer; - // The number of blocks that can be read from - sector_t br_limit; - // Number of the current block - sector_t br_block_number; - // Start of the buffer - byte *br_start; - // End of the data read from the buffer - byte *br_pointer; -}; - -/**********************************************************************/ -static void read_ahead(struct buffered_reader *br, sector_t block_number) -{ - if (block_number < br->br_limit) { - enum { MAX_READ_AHEAD = 4 }; - sector_t read_ahead = min((sector_t) MAX_READ_AHEAD, - br->br_limit - block_number); - dm_bufio_prefetch(br->br_client, block_number, read_ahead); - } -} - -/**********************************************************************/ -int make_buffered_reader(struct io_factory *factory, - struct dm_bufio_client *client, - sector_t block_limit, - struct buffered_reader **reader_ptr) -{ - struct buffered_reader *reader = NULL; - int result = - UDS_ALLOCATE(1, struct buffered_reader, "buffered reader", - &reader); - if (result != UDS_SUCCESS) { - return result; - } - - *reader = (struct buffered_reader){ - .br_factory = factory, - .br_client = client, - .br_buffer = NULL, - .br_limit = block_limit, - .br_block_number = 0, - .br_start = NULL, - .br_pointer = NULL, - }; - - read_ahead(reader, 0); - get_uds_io_factory(factory); - *reader_ptr = reader; - return UDS_SUCCESS; -} - -/**********************************************************************/ -void free_buffered_reader(struct buffered_reader *br) -{ - if (br == NULL) { - return; - } - if (br->br_buffer != NULL) { - dm_bufio_release(br->br_buffer); - } - dm_bufio_client_destroy(br->br_client); - 
put_uds_io_factory(br->br_factory); - UDS_FREE(br); -} - -/**********************************************************************/ -static int -position_reader(struct buffered_reader *br, sector_t block_number, off_t offset) -{ - if ((br->br_pointer == NULL) || (block_number != br->br_block_number)) { - struct dm_buffer *buffer = NULL; - void *data; - if (block_number >= br->br_limit) { - return UDS_OUT_OF_RANGE; - } - if (br->br_buffer != NULL) { - dm_bufio_release(br->br_buffer); - br->br_buffer = NULL; - } - data = dm_bufio_read(br->br_client, block_number, &buffer); - if (IS_ERR(data)) { - return -PTR_ERR(data); - } - br->br_buffer = buffer; - br->br_start = data; - if (block_number == br->br_block_number + 1) { - read_ahead(br, block_number + 1); - } - } - br->br_block_number = block_number; - br->br_pointer = br->br_start + offset; - return UDS_SUCCESS; -} - -/**********************************************************************/ -static size_t bytes_remaining_in_read_buffer(struct buffered_reader *br) -{ - return (br->br_pointer == NULL ? - 0 : - br->br_start + UDS_BLOCK_SIZE - br->br_pointer); -} - -/**********************************************************************/ -int read_from_buffered_reader(struct buffered_reader *br, - void *data, - size_t length) -{ - byte *dp = data; - int result = UDS_SUCCESS; - size_t avail, chunk; - while (length > 0) { - if (bytes_remaining_in_read_buffer(br) == 0) { - sector_t block_number = br->br_block_number; - if (br->br_pointer != NULL) { - ++block_number; - } - result = position_reader(br, block_number, 0); - if (result != UDS_SUCCESS) { - break; - } - } - - avail = bytes_remaining_in_read_buffer(br); - chunk = min(length, avail); - memcpy(dp, br->br_pointer, chunk); - length -= chunk; - dp += chunk; - br->br_pointer += chunk; - } - - if (((result == UDS_OUT_OF_RANGE) || (result == UDS_END_OF_FILE)) && - (dp - (byte *) data > 0)) { - result = UDS_SHORT_READ; - } - return result; -} - -/**********************************************************************/ -int verify_buffered_data(struct buffered_reader *br, - const void *value, - size_t length) -{ - int result; - size_t avail,chunk; - const byte *vp = value; - sector_t starting_block_number = br->br_block_number; - int starting_offset = br->br_pointer - br->br_start; - while (length > 0) { - if (bytes_remaining_in_read_buffer(br) == 0) { - sector_t block_number = br->br_block_number; - if (br->br_pointer != NULL) { - ++block_number; - } - result = position_reader(br, block_number, 0); - if (result != UDS_SUCCESS) { - position_reader(br, - starting_block_number, - starting_offset); - return UDS_CORRUPT_FILE; - } - } - - avail = bytes_remaining_in_read_buffer(br); - chunk = min(length, avail); - if (memcmp(vp, br->br_pointer, chunk) != 0) { - position_reader( - br, starting_block_number, starting_offset); - return UDS_CORRUPT_FILE; - } - length -= chunk; - vp += chunk; - br->br_pointer += chunk; - } - - return UDS_SUCCESS; -} diff --git a/uds/bufferedReader.h b/uds/bufferedReader.h deleted file mode 100644 index 48b43ba5..00000000 --- a/uds/bufferedReader.h +++ /dev/null @@ -1,89 +0,0 @@ -/* - * Copyright Red Hat - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. 
- * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/uds-releases/krusty/src/uds/bufferedReader.h#9 $ - */ - -#ifndef BUFFERED_READER_H -#define BUFFERED_READER_H 1 - -#include "common.h" - -struct dm_bufio_client; -struct io_factory; - -/** - * The buffered reader allows efficient IO for IO regions, which may be - * file- or block-based. The internal buffer always reads aligned data - * from the underlying region. - **/ -struct buffered_reader; - -/** - * Make a new buffered reader. - * - * @param factory The IO factory creating the buffered reader. - * @param client The dm_bufio_client to read from. - * @param block_limit The number of blocks that may be read. - * @param reader_ptr The pointer to hold the newly allocated buffered reader - * - * @return UDS_SUCCESS or error code. - **/ -int __must_check make_buffered_reader(struct io_factory *factory, - struct dm_bufio_client *client, - sector_t block_limit, - struct buffered_reader **reader_ptr); - -/** - * Free a buffered reader. - * - * @param reader The buffered reader - **/ -void free_buffered_reader(struct buffered_reader *reader); - -/** - * Retrieve data from a buffered reader, reading from the region when needed. - * - * @param reader The buffered reader - * @param data The buffer to read data into - * @param length The length of the data to read - * - * @return UDS_SUCCESS or an error code. - **/ -int __must_check read_from_buffered_reader(struct buffered_reader *reader, - void *data, - size_t length); - -/** - * Verify that the data currently in the buffer matches the required value. - * - * @param reader The buffered reader. - * @param value The value that must match the buffer contents. - * @param length The length of the value that must match. - * - * @return UDS_SUCCESS or an error code, specifically UDS_CORRUPT_FILE - * if the required value fails to match. - * - * @note If the value matches, the matching contents are consumed. However, - * if the match fails, any buffer contents are left as is. - **/ -int __must_check verify_buffered_data(struct buffered_reader *reader, - const void *value, - size_t length); - -#endif // BUFFERED_READER_H diff --git a/uds/bufferedWriter.c b/uds/bufferedWriter.c deleted file mode 100644 index aa5b6a20..00000000 --- a/uds/bufferedWriter.c +++ /dev/null @@ -1,239 +0,0 @@ -/* - * Copyright Red Hat - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. 
- * - * $Id: //eng/uds-releases/krusty/src/uds/bufferedWriter.c#22 $ - */ - -#include "bufferedWriter.h" - -#include "compiler.h" -#include "errors.h" -#include "ioFactory.h" -#include "logger.h" -#include "memoryAlloc.h" -#include "numeric.h" - - -struct buffered_writer { - // IO factory owning the block device - struct io_factory *bw_factory; - // The dm_bufio_client to write to - struct dm_bufio_client *bw_client; - // The current dm_buffer - struct dm_buffer *bw_buffer; - // The number of blocks that can be written to - sector_t bw_limit; - // Number of the current block - sector_t bw_block_number; - // Start of the buffer - byte *bw_start; - // End of the data written to the buffer - byte *bw_pointer; - // Error code - int bw_error; - // Have writes been done? - bool bw_used; -}; - -/**********************************************************************/ -int __must_check prepare_next_buffer(struct buffered_writer *bw) -{ - struct dm_buffer *buffer = NULL; - void *data; - if (bw->bw_block_number >= bw->bw_limit) { - bw->bw_error = UDS_OUT_OF_RANGE; - return UDS_OUT_OF_RANGE; - } - - data = dm_bufio_new(bw->bw_client, bw->bw_block_number, &buffer); - if (IS_ERR(data)) { - bw->bw_error = -PTR_ERR(data); - return bw->bw_error; - } - bw->bw_buffer = buffer; - bw->bw_start = data; - bw->bw_pointer = data; - return UDS_SUCCESS; -} - -/**********************************************************************/ -int flush_previous_buffer(struct buffered_writer *bw) -{ - if (bw->bw_buffer != NULL) { - if (bw->bw_error == UDS_SUCCESS) { - size_t avail = space_remaining_in_write_buffer(bw); - if (avail > 0) { - memset(bw->bw_pointer, 0, avail); - } - dm_bufio_mark_buffer_dirty(bw->bw_buffer); - } - dm_bufio_release(bw->bw_buffer); - bw->bw_buffer = NULL; - bw->bw_start = NULL; - bw->bw_pointer = NULL; - bw->bw_block_number++; - } - return bw->bw_error; -} - -/**********************************************************************/ -int make_buffered_writer(struct io_factory *factory, - struct dm_bufio_client *client, - sector_t block_limit, - struct buffered_writer **writer_ptr) -{ - struct buffered_writer *writer; - int result = - UDS_ALLOCATE(1, struct buffered_writer, "buffered writer", - &writer); - if (result != UDS_SUCCESS) { - return result; - } - - *writer = (struct buffered_writer){ - .bw_factory = factory, - .bw_client = client, - .bw_buffer = NULL, - .bw_limit = block_limit, - .bw_start = NULL, - .bw_pointer = NULL, - .bw_block_number = 0, - .bw_error = UDS_SUCCESS, - .bw_used = false, - }; - - get_uds_io_factory(factory); - *writer_ptr = writer; - return UDS_SUCCESS; -} - -/**********************************************************************/ -void free_buffered_writer(struct buffered_writer *bw) -{ - int result; - if (bw == NULL) { - return; - } - flush_previous_buffer(bw); - result = -dm_bufio_write_dirty_buffers(bw->bw_client); - if (result != UDS_SUCCESS) { - uds_log_warning_strerror(result, - "%s cannot sync storage", __func__); - } - dm_bufio_client_destroy(bw->bw_client); - put_uds_io_factory(bw->bw_factory); - UDS_FREE(bw); -} - -/**********************************************************************/ -static INLINE size_t space_used_in_buffer(struct buffered_writer *bw) -{ - return bw->bw_pointer - bw->bw_start; -} - -/**********************************************************************/ -size_t space_remaining_in_write_buffer(struct buffered_writer *bw) -{ - return UDS_BLOCK_SIZE - space_used_in_buffer(bw); -} - 
-/**********************************************************************/ -int write_to_buffered_writer(struct buffered_writer *bw, - const void *data, - size_t len) -{ - const byte *dp = data; - int result = UDS_SUCCESS; - size_t avail, chunk; - if (bw->bw_error != UDS_SUCCESS) { - return bw->bw_error; - } - - while ((len > 0) && (result == UDS_SUCCESS)) { - if (bw->bw_buffer == NULL) { - result = prepare_next_buffer(bw); - continue; - } - - avail = space_remaining_in_write_buffer(bw); - chunk = min(len, avail); - memcpy(bw->bw_pointer, dp, chunk); - len -= chunk; - dp += chunk; - bw->bw_pointer += chunk; - - if (space_remaining_in_write_buffer(bw) == 0) { - result = flush_buffered_writer(bw); - } - } - - bw->bw_used = true; - return result; -} - -/**********************************************************************/ -int write_zeros_to_buffered_writer(struct buffered_writer *bw, size_t len) -{ - int result = UDS_SUCCESS; - size_t avail, chunk; - if (bw->bw_error != UDS_SUCCESS) { - return bw->bw_error; - } - - while ((len > 0) && (result == UDS_SUCCESS)) { - if (bw->bw_buffer == NULL) { - result = prepare_next_buffer(bw); - continue; - } - - avail = space_remaining_in_write_buffer(bw); - chunk = min(len, avail); - memset(bw->bw_pointer, 0, chunk); - len -= chunk; - bw->bw_pointer += chunk; - - if (space_remaining_in_write_buffer(bw) == 0) { - result = flush_buffered_writer(bw); - } - } - - bw->bw_used = true; - return result; -} - -/**********************************************************************/ -int flush_buffered_writer(struct buffered_writer *bw) -{ - if (bw->bw_error != UDS_SUCCESS) { - return bw->bw_error; - } - - return flush_previous_buffer(bw); -} - -/**********************************************************************/ -bool was_buffered_writer_used(const struct buffered_writer *bw) -{ - return bw->bw_used; -} - -/**********************************************************************/ -void note_buffered_writer_used(struct buffered_writer *bw) -{ - bw->bw_used = true; -} diff --git a/uds/bufferedWriter.h b/uds/bufferedWriter.h deleted file mode 100644 index 94446cf2..00000000 --- a/uds/bufferedWriter.h +++ /dev/null @@ -1,123 +0,0 @@ -/* - * Copyright Red Hat - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/uds-releases/krusty/src/uds/bufferedWriter.h#12 $ - */ - -#ifndef BUFFERED_WRITER_H -#define BUFFERED_WRITER_H 1 - -#include "common.h" - -struct dm_bufio_client; -struct io_factory; - -struct buffered_writer; - -/** - * Make a new buffered writer. - * - * @param factory The IO factory creating the buffered writer - * @param client The dm_bufio_client to write to. - * @param block_limit The number of blocks that may be written to. - * @param writer_ptr The new buffered writer goes here. - * - * @return UDS_SUCCESS or an error code. 
- **/ -int __must_check make_buffered_writer(struct io_factory *factory, - struct dm_bufio_client *client, - sector_t block_limit, - struct buffered_writer **writer_ptr); - -/** - * Free a buffered writer, without flushing. - * - * @param [in] buffer The buffered writer object. - **/ -void free_buffered_writer(struct buffered_writer *buffer); - -/** - * Append data to buffer, writing as needed. - * - * @param buffer The buffered writer object. - * @param data The data to write. - * @param len The length of the data written. - * - * @return UDS_SUCCESS or an error code. - * The error may reflect previous attempts to write - * or flush the buffer. Once a write or flush error - * occurs it is sticky. - **/ -int __must_check write_to_buffered_writer(struct buffered_writer *buffer, - const void *data, - size_t len); - -/** - * Zero data in the buffer, writing as needed. - * - * @param bw The buffered writer object. - * @param len The number of zero bytes to write. - * - * @return UDS_SUCCESS or an error code. - * The error may reflect previous attempts to write - * or flush the buffer. Once a write or flush error - * occurs it is sticky. - **/ -int __must_check write_zeros_to_buffered_writer(struct buffered_writer *bw, - size_t len); - -/** - * Flush any partial data from the buffer. - * - * @param buffer The buffered writer object. - * - * @return UDS_SUCCESS or an error code. - * The error may reflect previous attempts to write - * or flush the buffer. Once a write or flush error - * occurs it is sticky. - **/ -int __must_check flush_buffered_writer(struct buffered_writer *buffer); - -/** - * Return the size of the remaining space in the buffer (for testing) - * - * @param [in] buffer The buffered writer object. - * - * @return The number of available bytes in the buffer. - **/ -size_t __must_check -space_remaining_in_write_buffer(struct buffered_writer *buffer); - -/** - * Return whether the buffer was ever written to. - * - * @param buffer The buffered writer object. - * - * @return True if at least one call to write_to_buffered_writer - * was made. - **/ -bool __must_check was_buffered_writer_used(const struct buffered_writer *buffer); - -/** - * Note the buffer has been used. - * - * @param buffer The buffered writer object. - **/ -void note_buffered_writer_used(struct buffered_writer *buffer); - -#endif // BUFFERED_WRITER_H diff --git a/uds/cacheCounters.c b/uds/cacheCounters.c deleted file mode 100644 index 5537b92b..00000000 --- a/uds/cacheCounters.c +++ /dev/null @@ -1,94 +0,0 @@ -/* - * Copyright Red Hat - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. 
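The buffered writer has the same shape as the reader, with sticky errors: once a write or flush fails, later calls keep returning that error. A minimal caller sketch, assuming the factory and client are already available and using a hypothetical helper name and padding length:

::

    /*
     * Illustrative sketch: append a payload followed by zero padding,
     * then flush. Because errors are sticky, checking each step simply
     * surfaces the first failure; flushing before freeing matters,
     * since free_buffered_writer() does not flush.
     */
    static int save_example_record(struct io_factory *factory,
                                   struct dm_bufio_client *client,
                                   sector_t block_limit,
                                   const void *payload, size_t payload_length)
    {
            struct buffered_writer *writer = NULL;
            int result;

            result = make_buffered_writer(factory, client, block_limit,
                                          &writer);
            if (result != UDS_SUCCESS) {
                    return result;
            }

            result = write_to_buffered_writer(writer, payload, payload_length);
            if (result == UDS_SUCCESS) {
                    /* Hypothetical 16 bytes of zero padding. */
                    result = write_zeros_to_buffered_writer(writer, 16);
            }
            if (result == UDS_SUCCESS) {
                    result = flush_buffered_writer(writer);
            }

            free_buffered_writer(writer);
            return result;
    }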
- * - * $Id: //eng/uds-releases/krusty/src/uds/cacheCounters.c#11 $ - */ - -#include "cacheCounters.h" - -#include - -#include "compiler.h" -#include "errors.h" -#include "permassert.h" -#include "stringUtils.h" -#include "uds.h" - -/**********************************************************************/ -void increment_cache_counter(struct cache_counters *counters, - int probe_type, - enum cache_result_kind kind) -{ - struct cache_counts_by_kind *kind_counts; - uint64_t *my_counter; - enum cache_probe_type basic_probe_type = - probe_type & ~CACHE_PROBE_IGNORE_FAILURE; - int result = ASSERT(basic_probe_type <= CACHE_PROBE_RECORD_RETRY, - "invalid cache probe type %#x", - probe_type); - if (result != UDS_SUCCESS) { - return; - } - result = ASSERT(kind <= CACHE_RESULT_QUEUED, - "invalid cache probe result type %#x", - kind); - if (result != UDS_SUCCESS) { - return; - } - - if (((probe_type & CACHE_PROBE_IGNORE_FAILURE) != 0) && - (kind != CACHE_RESULT_HIT)) { - return; - } - - switch (basic_probe_type) { - case CACHE_PROBE_INDEX_FIRST: - kind_counts = &counters->first_time.index_page; - break; - case CACHE_PROBE_RECORD_FIRST: - kind_counts = &counters->first_time.record_page; - break; - case CACHE_PROBE_INDEX_RETRY: - kind_counts = &counters->retried.index_page; - break; - case CACHE_PROBE_RECORD_RETRY: - kind_counts = &counters->retried.record_page; - break; - default: - // Never used but the compiler hasn't figured that out. - return; - } - - switch (kind) { - case CACHE_RESULT_MISS: - my_counter = &kind_counts->misses; - break; - case CACHE_RESULT_QUEUED: - my_counter = &kind_counts->queued; - break; - case CACHE_RESULT_HIT: - my_counter = &kind_counts->hits; - break; - default: - // Never used but the compiler hasn't figured that out. - return; - } - // XXX Vile case makes many assumptions. Counters should be declared - // atomic. - atomic64_inc((atomic64_t *) my_counter); -} diff --git a/uds/cacheCounters.h b/uds/cacheCounters.h deleted file mode 100644 index cfd3fb2a..00000000 --- a/uds/cacheCounters.h +++ /dev/null @@ -1,119 +0,0 @@ -/* - * Copyright Red Hat - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/uds-releases/krusty/src/uds/cacheCounters.h#8 $ - */ - -#ifndef CACHE_COUNTERS_H -#define CACHE_COUNTERS_H - -#include "typeDefs.h" - -/** - * Basic counts of hits and misses for a given type of cache probe. - **/ -struct cache_counts_by_kind { - /** Number of hits */ - uint64_t hits; - /** Number of misses */ - uint64_t misses; - /** Number of probes for data already queued for read */ - uint64_t queued; -}; - -/** - * The various types of cache probes we care about. - **/ -enum cache_probe_type { - /** First attempt to look up an index page, for a given request. */ - CACHE_PROBE_INDEX_FIRST = 0, - /** First attempt to look up a record page, for a given request. 
*/ - CACHE_PROBE_RECORD_FIRST, - /** Second or later attempt to look up an index page, for a given - * request. - */ - CACHE_PROBE_INDEX_RETRY, - /** Second or later attempt to look up a record page, for a given - * request. - */ - CACHE_PROBE_RECORD_RETRY -}; - -enum { - /** Flag bit to indicate that failures shouldn't be recorded. */ - CACHE_PROBE_IGNORE_FAILURE = 128 -}; - -/** - * Result-type counts for both kinds of data pages in the page cache. - **/ -struct cache_counts_by_page_type { - /** His/miss counts for index pages. */ - struct cache_counts_by_kind index_page; - /** Hit/miss counts for record pages. */ - struct cache_counts_by_kind record_page; -}; - -/** - * All the counters used for an entry cache. - **/ -struct cache_counters { - // counters for the page cache - /** Hit/miss counts for the first attempt per request */ - struct cache_counts_by_page_type first_time; - /** Hit/miss counts when a second (or later) attempt is needed */ - struct cache_counts_by_page_type retried; - - /** Number of cache entry invalidations due to single-entry eviction */ - uint64_t evictions; - /** Number of cache entry invalidations due to chapter expiration */ - uint64_t expirations; - - // counters for the sparse chapter index cache - /** Hit/miss counts for the sparse cache chapter probes */ - struct cache_counts_by_kind sparse_chapters; - /** Hit/miss counts for the sparce cache name searches */ - struct cache_counts_by_kind sparse_searches; -}; - -/** - * Success/failure assessment of cache probe result. - **/ -enum cache_result_kind { - /** The requested entry was found in the cache */ - CACHE_RESULT_HIT, - /** The requested entry was not found in the cache */ - CACHE_RESULT_MISS, - /** The requested entry wasn't found in the cache but is queued for - * read - */ - CACHE_RESULT_QUEUED -}; - -/** - * Increment one of the cache counters. - * - * @param counters pointer to the counters - * @param probe_type type of access done - * @param kind result of probe - **/ -void increment_cache_counter(struct cache_counters *counters, - int probe_type, - enum cache_result_kind kind); - -#endif /* CACHE_COUNTERS_H */ diff --git a/uds/cachedChapterIndex.c b/uds/cachedChapterIndex.c deleted file mode 100644 index 257fb8e4..00000000 --- a/uds/cachedChapterIndex.c +++ /dev/null @@ -1,140 +0,0 @@ -/* - * Copyright Red Hat - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. 
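The probe type and result kind combine as described above, and probes flagged with CACHE_PROBE_IGNORE_FAILURE only count hits. A short sketch with a hypothetical helper that classifies one lookup outcome; the choice of when to ignore failures is illustrative, not taken from the real callers:

::

    /*
     * Illustrative sketch: record the outcome of an index page probe.
     * The counters are assumed to be embedded in the cache being probed.
     */
    static void count_index_probe(struct cache_counters *counters,
                                  bool found, bool queued, bool is_retry)
    {
            enum cache_result_kind kind =
                    found ? CACHE_RESULT_HIT
                          : (queued ? CACHE_RESULT_QUEUED : CACHE_RESULT_MISS);

            if (!is_retry) {
                    increment_cache_counter(counters, CACHE_PROBE_INDEX_FIRST,
                                            kind);
                    return;
            }

            /* Count retries, but ignore anything other than a hit. */
            increment_cache_counter(counters,
                                    CACHE_PROBE_INDEX_RETRY |
                                            CACHE_PROBE_IGNORE_FAILURE,
                                    kind);
    }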
- * - * $Id: //eng/uds-releases/krusty/src/uds/cachedChapterIndex.c#21 $ - */ - -#include "cachedChapterIndex.h" - -#include "memoryAlloc.h" - -/**********************************************************************/ -int initialize_cached_chapter_index(struct cached_chapter_index *chapter, - const struct geometry *geometry) -{ - int result; - unsigned int i; - chapter->virtual_chapter = UINT64_MAX; - chapter->index_pages_count = geometry->index_pages_per_chapter; - - result = UDS_ALLOCATE(chapter->index_pages_count, - struct delta_index_page, - __func__, - &chapter->index_pages); - if (result != UDS_SUCCESS) { - return result; - } - - result = UDS_ALLOCATE(chapter->index_pages_count, - struct volume_page, - "sparse index volume pages", - &chapter->volume_pages); - if (result != UDS_SUCCESS) { - return result; - } - - for (i = 0; i < chapter->index_pages_count; i++) { - result = initialize_volume_page(geometry, - &chapter->volume_pages[i]); - if (result != UDS_SUCCESS) { - return result; - } - } - return UDS_SUCCESS; -} - -/**********************************************************************/ -void release_cached_chapter_index(struct cached_chapter_index *chapter) -{ - if (chapter->volume_pages != NULL) { - unsigned int i; - for (i = 0; i < chapter->index_pages_count; i++) { - release_volume_page(&chapter->volume_pages[i]); - } - } -} - -/**********************************************************************/ -void destroy_cached_chapter_index(struct cached_chapter_index *chapter) -{ - if (chapter->volume_pages != NULL) { - unsigned int i; - for (i = 0; i < chapter->index_pages_count; i++) { - destroy_volume_page(&chapter->volume_pages[i]); - } - } - UDS_FREE(chapter->index_pages); - UDS_FREE(chapter->volume_pages); -} - -/**********************************************************************/ -int cache_chapter_index(struct cached_chapter_index *chapter, - uint64_t virtual_chapter, - const struct volume *volume) -{ - int result; - // Mark the cached chapter as unused in case the update fails midway. - chapter->virtual_chapter = UINT64_MAX; - - // Read all the page data and initialize the entire delta_index_page - // array. (It's not safe for the zone threads to do it lazily--they'll - // race.) - result = read_chapter_index_from_volume(volume, - virtual_chapter, - chapter->volume_pages, - chapter->index_pages); - if (result != UDS_SUCCESS) { - return result; - } - - // Reset all chapter counter values to zero. - chapter->counters.search_hits = 0; - chapter->counters.search_misses = 0; - chapter->counters.consecutive_misses = 0; - - // Mark the entry as valid--it's now in the cache. - chapter->virtual_chapter = virtual_chapter; - chapter->skip_search = false; - - return UDS_SUCCESS; -} - -/**********************************************************************/ -int search_cached_chapter_index(struct cached_chapter_index *chapter, - const struct geometry *geometry, - const struct index_page_map *index_page_map, - const struct uds_chunk_name *name, - int *record_page_ptr) -{ - // Find the index_page_number in the chapter that would have the chunk - // name. 
- unsigned int physical_chapter = - map_to_physical_chapter(geometry, chapter->virtual_chapter); - unsigned int index_page_number; - int result = find_index_page_number(index_page_map, name, - physical_chapter, - &index_page_number); - if (result != UDS_SUCCESS) { - return result; - } - - return search_chapter_index_page(&chapter->index_pages[index_page_number], - geometry, - name, - record_page_ptr); -} diff --git a/uds/cachedChapterIndex.h b/uds/cachedChapterIndex.h deleted file mode 100644 index b6c81c66..00000000 --- a/uds/cachedChapterIndex.h +++ /dev/null @@ -1,212 +0,0 @@ -/* - * Copyright Red Hat - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/uds-releases/krusty/src/uds/cachedChapterIndex.h#16 $ - */ - -#ifndef CACHED_CHAPTER_INDEX_H -#define CACHED_CHAPTER_INDEX_H - -#include "chapterIndex.h" -#include "common.h" -#include "compiler.h" -#include "cpu.h" -#include "geometry.h" -#include "indexPageMap.h" -#include "typeDefs.h" -#include "volume.h" -#include "volumeStore.h" - -/** - * These counters are essentially fields of the struct cached_chapter_index, - * but are segregated into this structure because they are frequently modified. - * They are grouped and aligned to keep them on different cache lines from the - * chapter fields that are accessed far more often than they are updated. - **/ -struct __attribute__((aligned(CACHE_LINE_BYTES))) cached_index_counters { - /** the total number of search hits since this chapter was cached */ - uint64_t search_hits; - - /** the total number of search misses since this chapter was cached */ - uint64_t search_misses; - - /** the number of consecutive search misses since the last cache hit */ - uint64_t consecutive_misses; -}; - -/** - * struct cached_chapter_index is the structure for a cache entry, representing - * a single cached chapter index in the sparse chapter index cache. - **/ -struct __attribute__((aligned(CACHE_LINE_BYTES))) cached_chapter_index { - /* - * The virtual chapter number of the cached chapter index. UINT64_MAX - * means this cache entry is unused. Must only be modified in the - * critical section in updateSparseCache(). - */ - uint64_t virtual_chapter; - - /* The number of index pages in a chapter */ - unsigned int index_pages_count; - - /* - * This flag is mutable between cache updates, but it rarely changes - * and is frequently accessed, so it groups with the immutable fields. - * - * If set, skip the chapter when searching the entire cache. This flag - * is just a performance optimization. If we do not see a recent - * change to it, it will be corrected when we pass through a memory - * barrier while getting the next request from the queue. So we may do - * one extra search of the chapter index, or miss one deduplication - * opportunity. 
- */ - bool skip_search; - - // These pointers are immutable during the life of the cache. The - // contents of the arrays change when the cache entry is replaced. - - /* pointer to a cache-aligned array of ChapterIndexPages */ - struct delta_index_page *index_pages; - - /* pointer to an array of volume pages containing the index pages */ - struct volume_page *volume_pages; - - // The cache-aligned counters change often and are placed at the end of - // the structure to prevent false sharing with the more stable fields - // above. - - /* counter values updated by the thread servicing zone zero */ - struct cached_index_counters counters; -}; - -/** - * Initialize a struct cached_chapter_index, allocating the memory for the - * array of ChapterIndexPages and the raw index page data. The chapter index - * will be marked as unused (virtual_chapter == UINT64_MAX). - * - * @param chapter the chapter index cache entry to initialize - * @param geometry the geometry governing the volume - **/ -int __must_check -initialize_cached_chapter_index(struct cached_chapter_index *chapter, - const struct geometry *geometry); - -/** - * Release the all cached page data for a cached_chapter_index. - * - * @param chapter the chapter index cache entry to release - **/ -void release_cached_chapter_index(struct cached_chapter_index *chapter); - -/** - * Destroy a cached_chapter_index, freeing the memory allocated for the - * ChapterIndexPages and raw index page data. - * - * @param chapter the chapter index cache entry to destroy - **/ -void destroy_cached_chapter_index(struct cached_chapter_index *chapter); - -/** - * Assign a new value to the skip_search flag of a cached chapter index. - * - * @param chapter the chapter index cache entry to modify - * @param skip_search the new value of the skip_search falg - **/ -static INLINE void set_skip_search(struct cached_chapter_index *chapter, - bool skip_search) -{ - // Explicitly check if the field is set so we don't keep dirtying the - // memory cache line on continued search hits. - if (READ_ONCE(chapter->skip_search) != skip_search) { - WRITE_ONCE(chapter->skip_search, skip_search); - } -} - -/** - * Check if a cached sparse chapter index should be skipped over in the search - * for a chunk name. Filters out unused, invalid, disabled, and irrelevant - * cache entries. - * - * @param zone the zone doing the check - * @param chapter the cache entry search candidate - * @param virtual_chapter the virtual_chapter containing a hook, or UINT64_MAX - * if searching the whole cache for a non-hook - * - * @return true if the provided chapter index should be skipped - **/ -static INLINE bool -should_skip_chapter_index(const struct index_zone *zone, - const struct cached_chapter_index *chapter, - uint64_t virtual_chapter) -{ - // Don't search unused entries (contents undefined) or invalid entries - // (the chapter is no longer the zone's view of the volume). - if ((chapter->virtual_chapter == UINT64_MAX) || - (chapter->virtual_chapter < zone->oldest_virtual_chapter)) { - return true; - } - - if (virtual_chapter != UINT64_MAX) { - // If the caller specified a virtual chapter, only search the - // cache entry containing that chapter. - return (virtual_chapter != chapter->virtual_chapter); - } else { - // When searching the entire cache, save time by skipping over - // chapters that have had too many consecutive misses. 
- return READ_ONCE(chapter->skip_search); - } -} - -/** - * Cache a chapter index, reading all the index pages from the volume and - * initializing the array of ChapterIndexPages in the cache entry to represent - * them. The virtual_chapter field of the cache entry will be set to UINT64_MAX - * if there is any error since the remaining mutable fields will be in an - * undefined state. - * - * @param chapter the chapter index cache entry to replace - * @param virtual_chapter the virtual chapter number of the index to read - * @param volume the volume containing the chapter index - * - * @return UDS_SUCCESS or an error code - **/ -int __must_check cache_chapter_index(struct cached_chapter_index *chapter, - uint64_t virtual_chapter, - const struct volume *volume); - -/** - * Search a single cached sparse chapter index for a chunk name, returning the - * record page number that may contain the name. - * - * @param [in] chapter the cache entry for the chapter to search - * @param [in] geometry the geometry governing the volume - * @param [in] index_page_map the index page number map for the volume - * @param [in] name the chunk name to search for - * @param [out] record_page_ptr the record page number of a match, else - * NO_CHAPTER_INDEX_ENTRY if nothing matched - * - * @return UDS_SUCCESS or an error code - **/ -int __must_check -search_cached_chapter_index(struct cached_chapter_index *chapter, - const struct geometry *geometry, - const struct index_page_map *index_page_map, - const struct uds_chunk_name *name, - int *record_page_ptr); - -#endif /* CACHED_CHAPTER_INDEX_H */ diff --git a/uds/chapterIndex.c b/uds/chapterIndex.c deleted file mode 100644 index be48aada..00000000 --- a/uds/chapterIndex.c +++ /dev/null @@ -1,336 +0,0 @@ -/* - * Copyright Red Hat - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/uds-releases/krusty/src/uds/chapterIndex.c#20 $ - */ - -#include "chapterIndex.h" - -#include "compiler.h" -#include "errors.h" -#include "hashUtils.h" -#include "logger.h" -#include "memoryAlloc.h" -#include "permassert.h" -#include "uds.h" - - -/**********************************************************************/ -int make_open_chapter_index(struct open_chapter_index **open_chapter_index, - const struct geometry *geometry, - uint64_t volume_nonce) -{ - size_t memory_size; - - int result = UDS_ALLOCATE(1, - struct open_chapter_index, - "open chapter index", - open_chapter_index); - if (result != UDS_SUCCESS) { - return result; - } - - // The delta index will rebalance delta lists when memory gets tight, - // so give the chapter index one extra page. 
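Putting the pieces above together, a zone searching the sparse cache first filters each entry with should_skip_chapter_index() and only then pays for a page search. A sketch under the assumption that the zone, geometry, and index page map come from the enclosing index; the helper name is made up:

::

    /*
     * Illustrative sketch: probe one sparse cache entry for a chunk name.
     * Entries that are unused, invalid, or recently unproductive are
     * skipped; a miss is reported as NO_CHAPTER_INDEX_ENTRY.
     */
    static int probe_cached_chapter(const struct index_zone *zone,
                                    struct cached_chapter_index *chapter,
                                    const struct geometry *geometry,
                                    const struct index_page_map *index_page_map,
                                    const struct uds_chunk_name *name,
                                    int *record_page_ptr)
    {
            if (should_skip_chapter_index(zone, chapter, UINT64_MAX)) {
                    *record_page_ptr = NO_CHAPTER_INDEX_ENTRY;
                    return UDS_SUCCESS;
            }

            return search_cached_chapter_index(chapter, geometry,
                                               index_page_map, name,
                                               record_page_ptr);
    }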
- memory_size = (geometry->index_pages_per_chapter + 1) * - geometry->bytes_per_page; - (*open_chapter_index)->geometry = geometry; - (*open_chapter_index)->volume_nonce = volume_nonce; - result = initialize_delta_index(&(*open_chapter_index)->delta_index, - 1, - geometry->delta_lists_per_chapter, - geometry->chapter_mean_delta, - geometry->chapter_payload_bits, - memory_size); - if (result != UDS_SUCCESS) { - UDS_FREE(*open_chapter_index); - *open_chapter_index = NULL; - } - return result; -} - -/**********************************************************************/ -void free_open_chapter_index(struct open_chapter_index *open_chapter_index) -{ - if (open_chapter_index == NULL) { - return; - } - - - uninitialize_delta_index(&open_chapter_index->delta_index); - UDS_FREE(open_chapter_index); -} - -/**********************************************************************/ -void empty_open_chapter_index(struct open_chapter_index *open_chapter_index, - uint64_t virtual_chapter_number) -{ - empty_delta_index(&open_chapter_index->delta_index); - open_chapter_index->virtual_chapter_number = virtual_chapter_number; -} - -/** - * Check whether a delta list entry reflects a successful search for a given - * address. - * - * @param entry the delta list entry from the search - * @param address the address of the desired entry - * - * @return true iff the address was found - **/ -static INLINE bool was_entry_found(const struct delta_index_entry *entry, - unsigned int address) -{ - return (!entry->at_end && (entry->key == address)); -} - -/**********************************************************************/ -int put_open_chapter_index_record(struct open_chapter_index *open_chapter_index, - const struct uds_chunk_name *name, - unsigned int page_number) -{ - struct delta_index_entry entry; - unsigned int address; - bool found; - const struct geometry *geometry = open_chapter_index->geometry; - int result = - ASSERT_WITH_ERROR_CODE(page_number < - geometry->record_pages_per_chapter, - UDS_INVALID_ARGUMENT, - "Page number within chapter (%u) exceeds the maximum value %u", - page_number, - geometry->record_pages_per_chapter); - if (result != UDS_SUCCESS) { - return result; - } - - address = hash_to_chapter_delta_address(name, geometry); - result = get_delta_index_entry(&open_chapter_index->delta_index, - hash_to_chapter_delta_list(name, - geometry), - address, - name->name, - false, - &entry); - if (result != UDS_SUCCESS) { - return result; - } - found = was_entry_found(&entry, address); - result = ASSERT_WITH_ERROR_CODE(!(found && entry.is_collision), - UDS_BAD_STATE, - "Chunk appears more than once in chapter %llu", - (unsigned long long) open_chapter_index->virtual_chapter_number); - if (result != UDS_SUCCESS) { - return result; - } - return put_delta_index_entry(&entry, address, page_number, - (found ? 
name->name : NULL)); -} - -/**********************************************************************/ -int pack_open_chapter_index_page(struct open_chapter_index *open_chapter_index, - byte *memory, - unsigned int first_list, - bool last_page, - unsigned int *num_lists) -{ - struct delta_index *delta_index = &open_chapter_index->delta_index; - const struct geometry *geometry = open_chapter_index->geometry; - unsigned int removals = 0; - struct delta_index_entry entry; - int list_number; - for (;;) { - int result = - pack_delta_index_page(delta_index, - open_chapter_index->volume_nonce, - memory, - geometry->bytes_per_page, - open_chapter_index->virtual_chapter_number, - first_list, - num_lists); - if (result != UDS_SUCCESS) { - return result; - } - if ((first_list + *num_lists) == - geometry->delta_lists_per_chapter) { - // All lists are packed - break; - } else if (*num_lists == 0) { - // The next delta list does not fit on a page. This - // delta list will be removed. - } else if (last_page) { - /* - * This is the last page and there are lists left - * unpacked, but all of the remaining lists must fit on - * the page. Find a list that contains entries and - * remove the entire list. Try the first list that does - * not fit. If it is empty, we will select the last list - * that already fits and has any entries. - */ - } else { - // This page is done - break; - } - if (removals == 0) { - struct delta_index_stats stats; - get_delta_index_stats(delta_index, &stats); - uds_log_warning("The chapter index for chapter %llu contains %ld entries with %ld collisions", - (unsigned long long) open_chapter_index->virtual_chapter_number, - stats.record_count, - stats.collision_count); - } - - list_number = *num_lists; - do { - if (list_number < 0) { - return UDS_OVERFLOW; - } - result = start_delta_index_search(delta_index, - first_list + - list_number--, - 0, - false, - &entry); - if (result != UDS_SUCCESS) { - return result; - } - result = next_delta_index_entry(&entry); - if (result != UDS_SUCCESS) { - return result; - } - } while (entry.at_end); - do { - result = remove_delta_index_entry(&entry); - if (result != UDS_SUCCESS) { - return result; - } - removals++; - } while (!entry.at_end); - } - if (removals > 0) { - uds_log_warning("To avoid chapter index page overflow in chapter %llu, %u entries were removed from the chapter index", - (unsigned long long) open_chapter_index->virtual_chapter_number, - removals); - } - return UDS_SUCCESS; -} - -/**********************************************************************/ -int get_open_chapter_index_size(struct open_chapter_index *open_chapter_index) -{ - struct delta_index_stats stats; - get_delta_index_stats(&open_chapter_index->delta_index, &stats); - return stats.record_count; -} - -/**********************************************************************/ -size_t -get_open_chapter_index_memory_allocated(struct open_chapter_index *open_chapter_index) -{ - struct delta_index_stats stats; - get_delta_index_stats(&open_chapter_index->delta_index, &stats); - return stats.memory_allocated + sizeof(struct open_chapter_index); -} - -/**********************************************************************/ -int initialize_chapter_index_page(struct delta_index_page *chapter_index_page, - const struct geometry *geometry, - byte *index_page, - uint64_t volume_nonce) -{ - return initialize_delta_index_page(chapter_index_page, - volume_nonce, - geometry->chapter_mean_delta, - geometry->chapter_payload_bits, - index_page, - geometry->bytes_per_page); -} - 
-/**********************************************************************/ -int validate_chapter_index_page(const struct delta_index_page *chapter_index_page, - const struct geometry *geometry) -{ - const struct delta_index *delta_index = &chapter_index_page->delta_index; - unsigned int first = chapter_index_page->lowest_list_number; - unsigned int last = chapter_index_page->highest_list_number; - // We walk every delta list from start to finish. - unsigned int list_number; - for (list_number = first; list_number <= last; list_number++) { - struct delta_index_entry entry; - int result = - start_delta_index_search(delta_index, - list_number - first, - 0, true, &entry); - if (result != UDS_SUCCESS) { - return result; - } - for (;;) { - result = next_delta_index_entry(&entry); - if (result != UDS_SUCCESS) { - if (result == UDS_CORRUPT_DATA) { - // A random bit stream is highly likely - // to arrive here when we go past the - // end of the delta list - return UDS_CORRUPT_COMPONENT; - } - return result; - } - if (entry.at_end) { - break; - } - // Also make sure that the record page field contains a - // plausible value - if (get_delta_entry_value(&entry) >= - geometry->record_pages_per_chapter) { - // Do not log this as an error. It happens in - // normal operation when we are doing a rebuild - // but haven't written the entire volume once. - return UDS_CORRUPT_COMPONENT; - } - } - } - return UDS_SUCCESS; -} - -/**********************************************************************/ -int search_chapter_index_page(struct delta_index_page *chapter_index_page, - const struct geometry *geometry, - const struct uds_chunk_name *name, - int *record_page_ptr) -{ - struct delta_index *delta_index = &chapter_index_page->delta_index; - unsigned int address = hash_to_chapter_delta_address(name, geometry); - unsigned int delta_list_number = - hash_to_chapter_delta_list(name, geometry); - unsigned int sub_list_number = - delta_list_number - chapter_index_page->lowest_list_number; - struct delta_index_entry entry; - int result = - get_delta_index_entry(delta_index, sub_list_number, address, - name->name, true, &entry); - if (result != UDS_SUCCESS) { - return result; - } - - if (was_entry_found(&entry, address)) { - *record_page_ptr = get_delta_entry_value(&entry); - } else { - *record_page_ptr = NO_CHAPTER_INDEX_ENTRY; - } - return UDS_SUCCESS; -} diff --git a/uds/chapterIndex.h b/uds/chapterIndex.h deleted file mode 100644 index a7a450c3..00000000 --- a/uds/chapterIndex.h +++ /dev/null @@ -1,185 +0,0 @@ -/* - * Copyright Red Hat - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. 
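validate_chapter_index_page() above is meant for rebuild, where raw pages read from the volume may be arbitrary bits. A sketch of the expected call pattern, assuming the page buffer has already been read from storage and is geometry->bytes_per_page long; the helper name is hypothetical:

::

    /*
     * Illustrative sketch: wrap a raw index page in a delta_index_page
     * and sanity-check it. UDS_CORRUPT_COMPONENT means the page contents
     * are not a plausible chapter index page.
     */
    static int check_raw_index_page(const struct geometry *geometry,
                                    byte *index_page, uint64_t volume_nonce)
    {
            struct delta_index_page page;
            int result = initialize_chapter_index_page(&page, geometry,
                                                       index_page,
                                                       volume_nonce);
            if (result != UDS_SUCCESS) {
                    return result;
            }

            return validate_chapter_index_page(&page, geometry);
    }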
- * - * $Id: //eng/uds-releases/krusty/src/uds/chapterIndex.h#12 $ - */ - -#ifndef CHAPTER_INDEX_H -#define CHAPTER_INDEX_H 1 - -#include "deltaIndex.h" -#include "geometry.h" - -enum { - // The value returned as the record page number when an entry is not - // found - // in the chapter index. - NO_CHAPTER_INDEX_ENTRY = -1 -}; - -struct open_chapter_index { - const struct geometry *geometry; - struct delta_index delta_index; - uint64_t virtual_chapter_number; - uint64_t volume_nonce; -}; - - -/** - * Make a new open chapter index. - * - * @param open_chapter_index Location to hold new open chapter index pointer - * @param geometry The geometry - * @param volume_nonce The volume nonce - * - * @return error code or UDS_SUCCESS - **/ -int __must_check -make_open_chapter_index(struct open_chapter_index **open_chapter_index, - const struct geometry *geometry, - uint64_t volume_nonce); - -/** - * Terminate and clean up an open chapter index. - * - * @param open_chapter_index The open chapter index to terminate - **/ -void free_open_chapter_index(struct open_chapter_index *open_chapter_index); - -/** - * Empty an open chapter index, and prepare it for writing a new virtual - * chapter. - * - * @param open_chapter_index The open chapter index to empty - * @param virtual_chapter_number The virtual chapter number - **/ -void empty_open_chapter_index(struct open_chapter_index *open_chapter_index, - uint64_t virtual_chapter_number); - -/** - * Create a new record in an open chapter index, associating a chunk name with - * the number of the record page containing the metadata for the chunk. - * - * @param open_chapter_index The open chapter index - * @param name The chunk name - * @param page_number The number of the record page containing the name - * - * @return UDS_SUCCESS or an error code - **/ -int __must_check -put_open_chapter_index_record(struct open_chapter_index *open_chapter_index, - const struct uds_chunk_name *name, - unsigned int page_number); - -/** - * Pack a section of an open chapter index into a chapter index page. A - * range of delta lists (starting with a specified list index) is copied - * from the open chapter index into a memory page. The number of lists - * copied onto the page is returned to the caller. - * - * @param open_chapter_index The open chapter index - * @param memory The memory page to use - * @param first_list The first delta list number to be copied - * @param last_page If true, this is the last page of the chapter - * index and all the remaining lists must be packed - * onto this page - * @param num_lists The number of delta lists that were copied - * - * @return error code or UDS_SUCCESS. On UDS_SUCCESS, the num_lists - * argument contains the number of lists copied. - **/ -int __must_check -pack_open_chapter_index_page(struct open_chapter_index *open_chapter_index, - byte *memory, - unsigned int first_list, - bool last_page, - unsigned int *num_lists); - -/** - * Get the number of records in an open chapter index. - * - * @param open_chapter_index The open chapter index - * - * @return The number of records - **/ -int __must_check -get_open_chapter_index_size(struct open_chapter_index *open_chapter_index); - -/** - * Get the number of bytes allocated for the open chapter index. 
- * - * @param open_chapter_index The open chapter index - * - * @return the number of bytes allocated - **/ -size_t -get_open_chapter_index_memory_allocated(struct open_chapter_index *open_chapter_index); - -/** - * Make a new chapter index page, initializing it with the data from the - * given buffer. - * - * @param chapter_index_page The new chapter index page - * @param geometry The geometry - * @param index_page The memory page to use - * @param volume_nonce If non-zero, the volume nonce to verify - * - * @return UDS_SUCCESS or an error code - **/ -int __must_check -initialize_chapter_index_page(struct delta_index_page *chapter_index_page, - const struct geometry *geometry, - byte *index_page, - uint64_t volume_nonce); - -/** - * Validate a chapter index page. This is called at rebuild time to ensure - * that the volume file contains a coherent chapter index. - * - * @param chapter_index_page The chapter index page - * @param geometry The geometry of the volume - * - * @return The result code: - * UDS_SUCCESS for a good chapter index page - * UDS_CORRUPT_COMPONENT if the chapter index code detects invalid data - * UDS_CORRUPT_DATA if there is a problem in a delta list bit stream - * UDS_BAD_STATE if the code follows an invalid code path - **/ -int __must_check -validate_chapter_index_page(const struct delta_index_page *chapter_index_page, - const struct geometry *geometry); - -/** - * Search a chapter index page for a chunk name, returning the record page - * number that may contain the name. - * - * @param [in] chapter_index_page The chapter index page - * @param [in] geometry The geometry of the volume - * @param [in] name The chunk name - * @param [out] record_page_ptr The record page number - * or NO_CHAPTER_INDEX_ENTRY if not found - * - * @return UDS_SUCCESS or an error code - **/ -int __must_check -search_chapter_index_page(struct delta_index_page *chapter_index_page, - const struct geometry *geometry, - const struct uds_chunk_name *name, - int *record_page_ptr); - -#endif /* CHAPTER_INDEX_H */ diff --git a/uds/chapterWriter.c b/uds/chapterWriter.c deleted file mode 100644 index 233fad9d..00000000 --- a/uds/chapterWriter.c +++ /dev/null @@ -1,292 +0,0 @@ -/* - * Copyright Red Hat - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. 
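A record is added to the open chapter index once its chunk has been assigned to a record page; the sketch below pairs that call with the size accessor. The helper name is hypothetical, and real callers track the page number while laying out record pages:

::

    /*
     * Illustrative sketch: index one chunk name under its record page
     * number and log how large the open chapter index has become.
     */
    static int index_one_record(struct open_chapter_index *open_chapter_index,
                                const struct uds_chunk_name *name,
                                unsigned int record_page_number)
    {
            int result = put_open_chapter_index_record(open_chapter_index,
                                                       name,
                                                       record_page_number);
            if (result != UDS_SUCCESS) {
                    return result;
            }

            uds_log_debug("open chapter index now holds %d records",
                          get_open_chapter_index_size(open_chapter_index));
            return UDS_SUCCESS;
    }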
- * - * $Id: //eng/uds-releases/krusty/src/uds/chapterWriter.c#36 $ - */ - -#include "chapterWriter.h" - -#include "errors.h" -#include "index.h" -#include "indexCheckpoint.h" -#include "indexComponent.h" -#include "logger.h" -#include "memoryAlloc.h" -#include "openChapter.h" -#include "uds-threads.h" - - -struct chapter_writer { - /* The index to which we belong */ - struct uds_index *index; - /* The thread to do the writing */ - struct thread *thread; - /* lock protecting the following fields */ - struct mutex mutex; - /* condition signalled on state changes */ - struct cond_var cond; - /* Set to true to stop the thread */ - bool stop; - /* The result from the most recent write */ - int result; - /* The number of bytes allocated by the chapter writer */ - size_t memory_allocated; - /* The number of zones which have submitted a chapter for writing */ - unsigned int zones_to_write; - /* Open chapter index used by close_open_chapter() */ - struct open_chapter_index *open_chapter_index; - /* Collated records used by close_open_chapter() */ - struct uds_chunk_record *collated_records; - /* The chapters to write (one per zone) */ - struct open_chapter_zone *chapters[]; -}; - -/** - * This is the driver function for the writer thread. It loops until - * terminated, waiting for a chapter to provided to close. - **/ -static void close_chapters(void *arg) -{ - int result; - struct chapter_writer *writer = arg; - uds_log_debug("chapter writer starting"); - uds_lock_mutex(&writer->mutex); - for (;;) { - while (writer->zones_to_write < writer->index->zone_count) { - if (writer->stop && (writer->zones_to_write == 0)) { - // We've been told to stop, and all of the - // zones are in the same open chapter, so we - // can exit now. - uds_unlock_mutex(&writer->mutex); - uds_log_debug("chapter writer stopping"); - return; - } - uds_wait_cond(&writer->cond, &writer->mutex); - } - - /* - * Release the lock while closing a chapter. We probably don't - * need to do this, but it seems safer in principle. It's OK to - * access the chapter and chapterNumber fields without the lock - * since those aren't allowed to change until we're done. - */ - uds_unlock_mutex(&writer->mutex); - - if (writer->index->has_saved_open_chapter) { - struct index_component *oc; - writer->index->has_saved_open_chapter = false; - /* - * Remove the saved open chapter as that chapter is - * about to be written to the volume. This matters the - * first time we close the open chapter after loading - * from a clean shutdown, or after doing a clean save. 
- */ - oc = find_index_component(writer->index->state, - &OPEN_CHAPTER_INFO); - result = discard_index_component(oc); - if (result == UDS_SUCCESS) { - uds_log_debug("Discarding saved open chapter"); - } - } - - result = - close_open_chapter(writer->chapters, - writer->index->zone_count, - writer->index->volume, - writer->open_chapter_index, - writer->collated_records, - writer->index->newest_virtual_chapter); - - if (result == UDS_SUCCESS) { - result = process_chapter_writer_checkpoint_saves(writer->index); - } - - - uds_lock_mutex(&writer->mutex); - // Note that the index is totally finished with the writing - // chapter - advance_active_chapters(writer->index); - writer->result = result; - writer->zones_to_write = 0; - uds_broadcast_cond(&writer->cond); - } -} - -/**********************************************************************/ -int make_chapter_writer(struct uds_index *index, - struct chapter_writer **writer_ptr) -{ - size_t open_chapter_index_memory_allocated; - struct chapter_writer *writer; - size_t collated_records_size = - (sizeof(struct uds_chunk_record) * - (1 + index->volume->geometry->records_per_chapter)); - int result = UDS_ALLOCATE_EXTENDED(struct chapter_writer, - index->zone_count, - struct open_chapter_zone *, - "Chapter Writer", - &writer); - if (result != UDS_SUCCESS) { - return result; - } - writer->index = index; - - result = uds_init_mutex(&writer->mutex); - if (result != UDS_SUCCESS) { - UDS_FREE(writer); - return result; - } - result = uds_init_cond(&writer->cond); - if (result != UDS_SUCCESS) { - uds_destroy_mutex(&writer->mutex); - UDS_FREE(writer); - return result; - } - - // Now that we have the mutex+cond, it is safe to call - // free_chapter_writer. - result = uds_allocate_cache_aligned(collated_records_size, - "collated records", - &writer->collated_records); - if (result != UDS_SUCCESS) { - free_chapter_writer(writer); - return result; - } - result = make_open_chapter_index(&writer->open_chapter_index, - index->volume->geometry, - index->volume->nonce); - if (result != UDS_SUCCESS) { - free_chapter_writer(writer); - return result; - } - - open_chapter_index_memory_allocated = - get_open_chapter_index_memory_allocated( - writer->open_chapter_index); - writer->memory_allocated = - (sizeof(struct chapter_writer) + - index->zone_count * sizeof(struct open_chapter_zone *) + - collated_records_size + open_chapter_index_memory_allocated); - - // We're initialized, so now it's safe to start the writer thread. 
- result = uds_create_thread(close_chapters, writer, "writer", - &writer->thread); - if (result != UDS_SUCCESS) { - free_chapter_writer(writer); - return result; - } - - *writer_ptr = writer; - return UDS_SUCCESS; -} - -/**********************************************************************/ -void free_chapter_writer(struct chapter_writer *writer) -{ - int result __always_unused; - if (writer == NULL) { - return; - } - - result = stop_chapter_writer(writer); - uds_destroy_mutex(&writer->mutex); - uds_destroy_cond(&writer->cond); - free_open_chapter_index(writer->open_chapter_index); - UDS_FREE(writer->collated_records); - UDS_FREE(writer); -} - -/**********************************************************************/ -unsigned int start_closing_chapter(struct chapter_writer *writer, - unsigned int zone_number, - struct open_chapter_zone *chapter) -{ - unsigned int finished_zones; - uds_lock_mutex(&writer->mutex); - finished_zones = ++writer->zones_to_write; - writer->chapters[zone_number] = chapter; - uds_broadcast_cond(&writer->cond); - uds_unlock_mutex(&writer->mutex); - - return finished_zones; -} - -/**********************************************************************/ -int finish_previous_chapter(struct chapter_writer *writer, - uint64_t current_chapter_number) -{ - int result; - uds_lock_mutex(&writer->mutex); - while (writer->index->newest_virtual_chapter < - current_chapter_number) { - uds_wait_cond(&writer->cond, &writer->mutex); - } - result = writer->result; - uds_unlock_mutex(&writer->mutex); - - if (result != UDS_SUCCESS) { - return uds_log_error_strerror(result, - "Writing of previous open chapter failed"); - } - return UDS_SUCCESS; -} - -/**********************************************************************/ -void wait_for_idle_chapter_writer(struct chapter_writer *writer) -{ - uds_lock_mutex(&writer->mutex); - while (writer->zones_to_write > 0) { - // The chapter writer is probably writing a chapter. If it is - // not, it will soon wake up and write a chapter. - uds_wait_cond(&writer->cond, &writer->mutex); - } - uds_unlock_mutex(&writer->mutex); -} - -/**********************************************************************/ -int stop_chapter_writer(struct chapter_writer *writer) -{ - int result; - struct thread *writer_thread = 0; - - uds_lock_mutex(&writer->mutex); - if (writer->thread != 0) { - writer_thread = writer->thread; - writer->thread = 0; - writer->stop = true; - uds_broadcast_cond(&writer->cond); - } - result = writer->result; - uds_unlock_mutex(&writer->mutex); - - if (writer_thread != 0) { - uds_join_threads(writer_thread); - } - - if (result != UDS_SUCCESS) { - return uds_log_error_strerror(result, - "Writing of previous open chapter failed"); - } - return UDS_SUCCESS; -} - -/**********************************************************************/ -size_t get_chapter_writer_memory_allocated(struct chapter_writer *writer) -{ - return writer->memory_allocated; -} diff --git a/uds/chapterWriter.h b/uds/chapterWriter.h deleted file mode 100644 index 19b36b80..00000000 --- a/uds/chapterWriter.h +++ /dev/null @@ -1,107 +0,0 @@ -/* - * Copyright Red Hat - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. 
- * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/uds-releases/krusty/src/uds/chapterWriter.h#10 $ - */ - -#ifndef CHAPTER_WRITER_H -#define CHAPTER_WRITER_H - -#include - -#include "openChapterZone.h" - -struct chapter_writer; - -// This opaque declaration breaks the dependency loop with index.h -struct uds_index; - - -/** - * Create a chapter writer and start its thread. - * - * @param index the index containing the chapters to be written - * @param writer_ptr pointer to hold the new writer - * - * @return UDS_SUCCESS or an error code - **/ -int __must_check make_chapter_writer(struct uds_index *index, - struct chapter_writer **writer_ptr); - -/** - * Free a chapter writer, waiting for its thread to finish. - * - * @param writer the chapter writer to destroy - **/ -void free_chapter_writer(struct chapter_writer *writer); - -/** - * Asychronously close and write a chapter by passing it to the writer - * thread. Writing won't start until all zones have submitted a chapter. - * - * @param writer the chapter writer - * @param zone_number the number of the zone submitting a chapter - * @param chapter the chapter to write - * - * @return The number of zones which have submitted the current chapter - **/ -unsigned int __must_check -start_closing_chapter(struct chapter_writer *writer, - unsigned int zone_number, - struct open_chapter_zone *chapter); - -/** - * Wait for the chapter writer thread to finish closing the chapter previous - * to the one specified. - * - * @param writer the chapter writer - * @param current_chapter_number the current chapter number - * - * @return UDS_SUCCESS or an error code from the most recent write - * request - **/ -int __must_check finish_previous_chapter(struct chapter_writer *writer, - uint64_t current_chapter_number); - -/** - * Wait for the chapter writer thread to finish all writes to storage. - * - * @param writer the chapter writer - **/ -void wait_for_idle_chapter_writer(struct chapter_writer *writer); - -/** - * Stop the chapter writer and wait for it to finish. - * - * @param writer the chapter writer to stop - * - * @return UDS_SUCCESS or an error code from the most recent write - * request - **/ -int __must_check stop_chapter_writer(struct chapter_writer *writer); - -/** - * Get the number of bytes allocated for the chapter writer. - * - * @param writer the chapter writer - * - * @return the number of bytes allocated - **/ -size_t get_chapter_writer_memory_allocated(struct chapter_writer *writer); - -#endif /* CHAPTER_WRITER_H */ diff --git a/uds/common.h b/uds/common.h deleted file mode 100644 index 55b34109..00000000 --- a/uds/common.h +++ /dev/null @@ -1,42 +0,0 @@ -/* - * Copyright Red Hat - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. 
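The chapter writer above is driven by the zone threads: each zone first waits out the previous chapter's write, then submits its own full chapter, and writing begins once every zone has done so. A sketch of that hand-off, assuming the chapter, zone number, and chapter number come from the enclosing index zone; the helper name is made up:

::

    /*
     * Illustrative sketch: what a zone does when its open chapter fills.
     * Any error from writing the previous chapter is surfaced first.
     */
    static int hand_off_full_chapter(struct chapter_writer *writer,
                                     unsigned int zone_number,
                                     struct open_chapter_zone *full_chapter,
                                     uint64_t current_chapter_number)
    {
            unsigned int zones_ready;
            int result = finish_previous_chapter(writer,
                                                 current_chapter_number);
            if (result != UDS_SUCCESS) {
                    return result;
            }

            /* Writing starts once all zones have submitted a chapter. */
            zones_ready = start_closing_chapter(writer, zone_number,
                                                full_chapter);
            uds_log_debug("%u zone(s) have submitted the current chapter",
                          zones_ready);
            return UDS_SUCCESS;
    }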
- * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/uds-releases/krusty/src/uds/common.h#9 $ - */ - -#ifndef COMMON_H -#define COMMON_H - -#include "stringUtils.h" -#include "typeDefs.h" -#include "uds.h" - -enum { - KILOBYTE = 1024, - MEGABYTE = KILOBYTE * KILOBYTE, - GIGABYTE = KILOBYTE * MEGABYTE -}; - -struct uds_chunk_data; - -struct uds_chunk_record { - struct uds_chunk_name name; - struct uds_chunk_data data; -}; - -#endif /* COMMON_H */ diff --git a/uds/compiler.h b/uds/compiler.h deleted file mode 100644 index 631a69cf..00000000 --- a/uds/compiler.h +++ /dev/null @@ -1,56 +0,0 @@ -/* - * Copyright Red Hat - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/uds-releases/krusty/src/uds/compiler.h#10 $ - */ - -#ifndef COMMON_COMPILER_H -#define COMMON_COMPILER_H - -#include - -#if LINUX_VERSION_CODE >= KERNEL_VERSION(5,9,0) -#include -#endif // >=5.9.0 - -#include - -// Count the elements in a static array while attempting to catch some type -// errors. (See http://stackoverflow.com/a/1598827 for an explanation.) -#define COUNT_OF(x) \ - ((sizeof(x) / sizeof(0 [x])) / \ - ((size_t)(!(sizeof(x) % sizeof(0 [x]))))) - - -#define const_container_of(ptr, type, member) \ - __extension__({ \ - const __typeof__(((type *) 0)->member) *__mptr = (ptr); \ - (const type *) ((const char *) __mptr - \ - offsetof(type, member)); \ - }) - -// The "inline" keyword alone takes effect only when the optimization level -// is high enough. Define INLINE to force the gcc to "always inline". -#define INLINE __attribute__((always_inline)) inline - - - -#define __STRING(x) #x - - -#endif /* COMMON_COMPILER_H */ diff --git a/uds/config.c b/uds/config.c deleted file mode 100644 index ea8a3e49..00000000 --- a/uds/config.c +++ /dev/null @@ -1,116 +0,0 @@ -/* - * Copyright Red Hat - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. 
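COUNT_OF above only works on true arrays; the divisor term exists to catch some type errors (such as passing a pointer) at compile time. A trivial sketch using the size constants from common.h, with made-up array and function names:

::

    /* Illustrative sketch: COUNT_OF evaluates to the element count of a
     * static array, here 3. */
    static const unsigned int example_sizes[] = { KILOBYTE, MEGABYTE, GIGABYTE };

    static unsigned int count_example_sizes(void)
    {
            return COUNT_OF(example_sizes);
    }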
- * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/uds-releases/krusty/src/uds/config.c#20 $ - */ - -#include "config.h" - -#include "logger.h" -#include "memoryAlloc.h" -#include "stringUtils.h" - -/**********************************************************************/ -void free_index_location(struct index_location *loc) -{ - if (loc == NULL) { - return; - } - - UDS_FREE(loc->host); - UDS_FREE(loc->port); - UDS_FREE(loc->directory); -} - -/**********************************************************************/ -bool are_uds_configurations_equal(struct uds_configuration *a, - struct uds_configuration *b) -{ - bool result = true; - if (a->record_pages_per_chapter != b->record_pages_per_chapter) { - uds_log_error("Record pages per chapter (%u) does not match (%u)", - a->record_pages_per_chapter, - b->record_pages_per_chapter); - result = false; - } - if (a->chapters_per_volume != b->chapters_per_volume) { - uds_log_error("Chapter count (%u) does not match (%u)", - a->chapters_per_volume, - b->chapters_per_volume); - result = false; - } - if (a->sparse_chapters_per_volume != b->sparse_chapters_per_volume) { - uds_log_error("Sparse chapter count (%u) does not match (%u)", - a->sparse_chapters_per_volume, - b->sparse_chapters_per_volume); - result = false; - } - if (a->cache_chapters != b->cache_chapters) { - uds_log_error("Cache size (%u) does not match (%u)", - a->cache_chapters, - b->cache_chapters); - result = false; - } - if (a->volume_index_mean_delta != b->volume_index_mean_delta) { - uds_log_error("Volumee index mean delta (%u) does not match (%u)", - a->volume_index_mean_delta, - b->volume_index_mean_delta); - result = false; - } - if (a->bytes_per_page != b->bytes_per_page) { - uds_log_error("Bytes per page value (%u) does not match (%u)", - a->bytes_per_page, - b->bytes_per_page); - result = false; - } - if (a->sparse_sample_rate != b->sparse_sample_rate) { - uds_log_error("Sparse sample rate (%u) does not match (%u)", - a->sparse_sample_rate, - b->sparse_sample_rate); - result = false; - } - if (a->nonce != b->nonce) { - uds_log_error("Nonce (%llu) does not match (%llu)", - (unsigned long long) a->nonce, - (unsigned long long) b->nonce); - result = false; - } - return result; -} - -/**********************************************************************/ -void log_uds_configuration(struct uds_configuration *conf) -{ - uds_log_debug("Configuration:"); - uds_log_debug(" Record pages per chapter: %10u", - conf->record_pages_per_chapter); - uds_log_debug(" Chapters per volume: %10u", - conf->chapters_per_volume); - uds_log_debug(" Sparse chapters per volume: %10u", - conf->sparse_chapters_per_volume); - uds_log_debug(" Cache size (chapters): %10u", - conf->cache_chapters); - uds_log_debug(" Volume index mean delta: %10u", - conf->volume_index_mean_delta); - uds_log_debug(" Bytes per page: %10u", - conf->bytes_per_page); - uds_log_debug(" Sparse sample rate: %10u", - conf->sparse_sample_rate); - uds_log_debug(" Nonce: %llu", - (unsigned long long) conf->nonce); -} diff --git a/uds/config.h b/uds/config.h deleted file mode 100644 index cff03d99..00000000 --- a/uds/config.h +++ /dev/null @@ -1,171 +0,0 @@ -/* - * Copyright Red Hat - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software 
Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/uds-releases/krusty/src/uds/config.h#23 $ - */ - -#ifndef CONFIG_H -#define CONFIG_H - -#include "bufferedReader.h" -#include "bufferedWriter.h" -#include "geometry.h" -#include "uds.h" - -enum { - DEFAULT_VOLUME_INDEX_MEAN_DELTA = 4096, - DEFAULT_CACHE_CHAPTERS = 7, - DEFAULT_SPARSE_SAMPLE_RATE = 0 -}; - -/** - * Data that are used for configuring a new index. - **/ -struct uds_configuration { - /** Smaller (16), Small (64) or large (256) indices */ - unsigned int record_pages_per_chapter; - /** Total number of chapters per volume */ - unsigned int chapters_per_volume; - /** Number of sparse chapters per volume */ - unsigned int sparse_chapters_per_volume; - /** Size of the page cache, in chapters */ - unsigned int cache_chapters; - /** Frequency with which to checkpoint */ - // XXX the checkpoint_frequency is not used - it is now a runtime - // parameter - unsigned int checkpoint_frequency; - /** The volume index mean delta to use */ - unsigned int volume_index_mean_delta; - /** Size of a page, used for both record pages and index pages */ - unsigned int bytes_per_page; - /** Sampling rate for sparse indexing */ - unsigned int sparse_sample_rate; - /** Index Owner's nonce */ - uds_nonce_t nonce; - /** Virtual chapter remapped from physical chapter 0 */ - uint64_t remapped_virtual; - /** New physical chapter which remapped chapter was moved to */ - uint64_t remapped_physical; -}; - -/** - * Data that are used for configuring a 6.02 index. - **/ -struct uds_configuration_6_02 { - /** Smaller (16), Small (64) or large (256) indices */ - unsigned int record_pages_per_chapter; - /** Total number of chapters per volume */ - unsigned int chapters_per_volume; - /** Number of sparse chapters per volume */ - unsigned int sparse_chapters_per_volume; - /** Size of the page cache, in chapters */ - unsigned int cache_chapters; - /** Frequency with which to checkpoint */ - // XXX the checkpoint_frequency is not used - it is now a runtime - // parameter - unsigned int checkpoint_frequency; - /** The volume index mean delta to use */ - unsigned int volume_index_mean_delta; - /** Size of a page, used for both record pages and index pages */ - unsigned int bytes_per_page; - /** Sampling rate for sparse indexing */ - unsigned int sparse_sample_rate; - /** Index Owner's nonce */ - uds_nonce_t nonce; -}; - -struct index_location { - char *host; - char *port; - char *directory; -}; - -/** - * A set of configuration parameters for the indexer. - **/ -struct configuration; - -/** - * Construct a new indexer configuration. - * - * @param conf uds_configuration to use - * @param config_ptr The new index configuration - * - * @return UDS_SUCCESS or an error code - **/ -int __must_check make_configuration(const struct uds_configuration *conf, - struct configuration **config_ptr); - -/** - * Clean up the configuration struct. 
- **/ -void free_configuration(struct configuration *config); - -/** - * Read the index configuration from stable storage. - * - * @param reader A buffered reader. - * @param config The index configuration to overwrite. - * - * @return UDS_SUCCESS or an error code. - **/ -int __must_check read_config_contents(struct buffered_reader *reader, - struct uds_configuration *config); - -/** - * Write the index configuration information to stable storage. If - * the superblock version is < 4 write the 6.02 version; otherwise - * write the 8.02 version, indicating the configuration is for an - * index that has been reduced by one chapter. - * - * @param writer A buffered writer. - * @param config The index configuration. - * @param version The index superblock version - * - * @return UDS_SUCCESS or an error code. - **/ -int __must_check write_config_contents(struct buffered_writer *writer, - struct uds_configuration *config, - uint32_t version); - -/** - * Free the memory used by an index_location. - * - * @param loc index location to free - **/ -void free_index_location(struct index_location *loc); - -/** - * Compare two configurations for equality. - * - * @param a The first configuration to compare - * @param b The second configuration to compare - * - * @return true iff they are equal - **/ -bool __must_check are_uds_configurations_equal(struct uds_configuration *a, - struct uds_configuration *b); - -/** - * Log a user configuration. - * - * @param conf The configuration - **/ -void log_uds_configuration(struct uds_configuration *conf); - -#endif /* CONFIG_H */ diff --git a/uds/deltaIndex.c b/uds/deltaIndex.c deleted file mode 100644 index 8742f17c..00000000 --- a/uds/deltaIndex.c +++ /dev/null @@ -1,1848 +0,0 @@ -/* - * Copyright Red Hat - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/uds-releases/krusty/src/uds/deltaIndex.c#34 $ - */ -#include "deltaIndex.h" - -#include "bits.h" -#include "buffer.h" -#include "compiler.h" -#include "cpu.h" -#include "errors.h" -#include "logger.h" -#include "memoryAlloc.h" -#include "permassert.h" -#include "stringUtils.h" -#include "typeDefs.h" -#include "uds.h" -#include "zone.h" - -/* - * A delta index is a key-value store, where each entry maps an address - * (the key) to a payload (the value). The entries are sorted by address, - * and only the delta between successive addresses is stored in the entry. - * The addresses are assumed to be uniformly distributed,and the deltas are - * therefore exponentially distributed. - * - * The entries could be stored in a single delta_list, but for efficiency we - * use multiple DeltaLists. These lists are stored in a single chunk of - * memory managed by the delta_memory module. The delta_memory module can - * move the data around in memory, so we never keep any byte pointers into - * delta_list memory. 
We only keep offsets into the memory. - * - * The delta lists are stored as bit streams. These bit streams are stored - * in little endian order, and all offsets into delta_memory are bit - * offsets. - * - * All entries are stored as a fixed length payload (the value) followed by a - * variable length key (the delta). Always strictly in little endian order. - * - * A collision entry is used when two block names have the same delta list - * address. A collision entry is encoded with DELTA==0, and has 256 - * extension bits containing the full block name. - * - * There is a special exception to be noted. The DELTA==0 encoding usually - * indicates a collision with the preceding entry. But for the first entry - * in any delta list there is no preceding entry, so the DELTA==0 encoding - * at the beginning of a delta list indicates a normal entry. - * - * The Huffman code is driven by 3 parameters: - * - * MINBITS This is the number of bits in the smallest code - * - * BASE This is the number of values coded using a code of length MINBITS - * - * INCR This is the number of values coded by using one additional bit. - * - * These parameters are related by: - * - * BASE + INCR == 1 << MINBITS - * - * When we create an index, we need to know the mean delta. From the mean - * delta, we compute these three parameters. The math for the Huffman code - * of an exponential distribution says that we compute: - * - * INCR = log(2) * MEAN_DELTA - * - * Then we find the smallest MINBITS so that - * - * 1 << MINBITS > INCR - * - * And then: - * - * BASE = (1 << MINBITS) - INCR - * - * Now we need a code such that - * - * - The first BASE values code using MINBITS bits - * - The next INCR values code using MINBITS+1 bits. - * - The next INCR values code using MINBITS+2 bits. - * - The next INCR values code using MINBITS+3 bits. - * - (and so on). - * - * ENCODE(DELTA): - * - * if (DELTA < BASE) { - * put DELTA in MINBITS bits; - * } else { - * T1 = (DELTA - BASE) % INCR + BASE; - * T2 = (DELTA - BASE) / INCR; - * put T1 in MINBITS bits; - * put 0 in T2 bits; - * put 1 in 1 bit; - * } - * - * DECODE(BIT_STREAM): - * - * T1 = next MINBITS bits of stream; - * if (T1 < BASE) { - * DELTA = T1; - * } else { - * Scan bits in the stream until reading a 1, - * setting T2 to the number of 0 bits read; - * DELTA = T2 * INCR + T1; - * } - * - * The bit field utilities that we use on the delta lists assume that it is - * possible to read a few bytes beyond the end of the bit field. So we - * make sure to allocates some extra bytes at the end of memory containing - * the delta lists. Look for POST_FIELD_GUARD_BYTES to find the code - * related to this. - * - * And note that the decode bit stream code includes a step that skips over - * 0 bits until the first 1 bit is found. A corrupted delta list could - * cause this step to run off the end of the delta list memory. As an - * extra protection against this happening, the guard bytes at the end - * should be set to all ones. - */ - -/** - * Constants and structures for the saved delta index. "DI" is for - * delta_index, and -##### is a number to increment when the format of the - * data changes. 
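/*
 * Worked example (a sketch added for illustration, not part of the
 * original source): with the default mean delta of 4096
 * (DEFAULT_VOLUME_INDEX_MEAN_DELTA), the parameters above work out to
 * roughly INCR = 0.693 * 4096 = 2839, MINBITS = 12 (the smallest value
 * with 1 << MINBITS > INCR), and BASE = 4096 - 2839 = 1257. The
 * hypothetical helper below only shows how a delta maps to a code
 * length under those parameters; it is not the production encoder.
 */
static unsigned int example_code_length(unsigned int delta)
{
	enum { MINBITS = 12, BASE = 1257, INCR = 2839 };

	if (delta < BASE) {
		/* Deltas 0..1256 use exactly MINBITS bits. */
		return MINBITS;
	}
	/*
	 * Larger deltas add one unary zero bit per INCR values beyond
	 * BASE, plus the terminating one bit: deltas 1257..4095 take
	 * 13 bits, 4096..6934 take 14 bits, and so on.
	 */
	return MINBITS + 1 + (delta - BASE) / INCR;
}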
- **/ -enum { MAGIC_SIZE = 8 }; -static const char MAGIC_DI_START[] = "DI-00002"; - -struct di_header { - char magic[MAGIC_SIZE]; // MAGIC_DI_START - uint32_t zone_number; - uint32_t num_zones; - uint32_t first_list; - uint32_t num_lists; - uint64_t record_count; - uint64_t collision_count; -}; - -//********************************************************************** -// Methods for dealing with mutable delta list headers -//********************************************************************** - -/** - * Move the start of the delta list bit stream without moving the end. - * - * @param delta_list The delta list header - * @param increment The change in the start of the delta list - **/ -static INLINE void move_delta_list_start(struct delta_list *delta_list, - int increment) -{ - delta_list->start_offset += increment; - delta_list->size -= increment; -} - -/** - * Move the end of the delta list bit stream without moving the start. - * - * @param delta_list The delta list header - * @param increment The change in the end of the delta list - **/ -static INLINE void move_delta_list_end(struct delta_list *delta_list, - int increment) -{ - delta_list->size += increment; -} - -//********************************************************************** -// Methods for dealing with immutable delta list headers packed -//********************************************************************** - -// Header data used for immutable delta index pages. These data are -// followed by the delta list offset table. -struct delta_page_header { - uint64_t nonce; // Externally-defined nonce - uint64_t virtual_chapter_number; // The virtual chapter number - uint16_t first_list; // Index of the first delta list on - // the page - uint16_t num_lists; // Number of delta lists on the page -} __packed; - -// Immutable delta lists are packed into pages containing a header that -// encodes the delta list information into 19 bits per list (64KB bit offset) - -enum { IMMUTABLE_HEADER_SIZE = 19 }; - -/** - * Get the bit offset to the immutable delta list header - * - * @param list_number The delta list number - * - * @return the offset of immutable delta list header - **/ -static INLINE unsigned int get_immutable_header_offset(unsigned int list_number) -{ - return (sizeof(struct delta_page_header) * CHAR_BIT + - list_number * IMMUTABLE_HEADER_SIZE); -} - -/** - * Get the bit offset to the start of the immutable delta list bit stream - * - * @param memory The memory page containing the delta lists - * @param list_number The delta list number - * - * @return the start of the delta list - **/ -static INLINE unsigned int get_immutable_start(const byte *memory, - unsigned int list_number) -{ - return get_field(memory, - get_immutable_header_offset(list_number), - IMMUTABLE_HEADER_SIZE); -} - -/** - * Set the bit offset to the start of the immutable delta list bit stream - * - * @param memory The memory page containing the delta lists - * @param list_number The delta list number - * @param start_offset The start of the delta list - **/ -static INLINE void set_immutable_start(byte *memory, - unsigned int list_number, - unsigned int start_offset) -{ - set_field(start_offset, - memory, - get_immutable_header_offset(list_number), - IMMUTABLE_HEADER_SIZE); -} - -//********************************************************************** -// Methods for dealing with Delta List Entries -//********************************************************************** - -/** - * Decode a delta index entry delta value. 
The delta_index_entry basically - * describes the previous list entry, and has had its offset field changed to - * point to the subsequent entry. We decode the bit stream and update the - * DeltaListEntry to describe the entry. - * - * @param delta_entry The delta index entry - **/ -static INLINE void decode_delta(struct delta_index_entry *delta_entry) -{ - int key_bits; - unsigned int delta; - const struct delta_memory *delta_zone = delta_entry->delta_zone; - const byte *memory = delta_zone->memory; - uint64_t delta_offset = - get_delta_entry_offset(delta_entry) + delta_entry->value_bits; - const byte *addr = memory + delta_offset / CHAR_BIT; - int offset = delta_offset % CHAR_BIT; - uint32_t data = get_unaligned_le32(addr) >> offset; - addr += sizeof(uint32_t); - key_bits = delta_zone->min_bits; - delta = data & ((1 << key_bits) - 1); - if (delta >= delta_zone->min_keys) { - data >>= key_bits; - if (data == 0) { - key_bits = sizeof(uint32_t) * CHAR_BIT - offset; - while ((data = get_unaligned_le32(addr)) == 0) { - addr += sizeof(uint32_t); - key_bits += sizeof(uint32_t) * CHAR_BIT; - } - } - key_bits += ffs(data); - delta += (key_bits - delta_zone->min_bits - 1) * - delta_zone->incr_keys; - } - delta_entry->delta = delta; - delta_entry->key += delta; - - // Check for a collision, a delta of zero not at the start of the list. - if (unlikely((delta == 0) && (delta_entry->offset > 0))) { - delta_entry->is_collision = true; - // The small duplication of this math in the two arms of this - // if statement makes a tiny but measurable difference in - // performance. - delta_entry->entry_bits = - delta_entry->value_bits + key_bits + COLLISION_BITS; - } else { - delta_entry->is_collision = false; - delta_entry->entry_bits = delta_entry->value_bits + key_bits; - } -} - -/** - * Delete bits from a delta list at the offset of the specified delta index - * entry. - * - * @param delta_entry The delta index entry - * @param size The number of bits to delete - **/ -static void delete_bits(const struct delta_index_entry *delta_entry, int size) -{ - uint64_t source, destination; - uint32_t count; - bool before_flag; - struct delta_list *delta_list = delta_entry->delta_list; - byte *memory = delta_entry->delta_zone->memory; - // Compute how many bits are retained before and after the deleted bits - uint32_t total_size = get_delta_list_size(delta_list); - uint32_t before_size = delta_entry->offset; - uint32_t after_size = total_size - delta_entry->offset - size; - - // Determine whether to add to the available space either before or - // after the delta list. We prefer to move the least amount of data. - // If it is exactly the same, try to add to the smaller amount of free - // space. 
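	// For example (illustrative numbers, not from the original
	// source): deleting an 8-bit entry that starts 20 bits into a
	// 100-bit list gives before_size = 20 and after_size = 72, so
	// only the 20 leading bits are slid and the freed space joins
	// the gap before the list.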
- if (before_size < after_size) { - before_flag = true; - } else if (after_size < before_size) { - before_flag = false; - } else { - uint64_t free_before = - get_delta_list_start(&delta_list[0]) - - get_delta_list_end(&delta_list[-1]); - uint64_t free_after = - get_delta_list_start(&delta_list[1]) - - get_delta_list_end(&delta_list[0]); - before_flag = free_before < free_after; - } - - if (before_flag) { - source = get_delta_list_start(delta_list); - destination = source + size; - move_delta_list_start(delta_list, size); - count = before_size; - } else { - move_delta_list_end(delta_list, -size); - destination = - get_delta_list_start(delta_list) + delta_entry->offset; - source = destination + size; - count = after_size; - } - move_bits(memory, source, memory, destination, count); -} - -/** - * Get the offset of the collision field in a delta_index_entry - * - * @param entry The delta index record - * - * @return the offset of the start of the collision name - **/ -static INLINE uint64_t -get_collision_offset(const struct delta_index_entry *entry) -{ - return (get_delta_entry_offset(entry) + entry->entry_bits - - COLLISION_BITS); -} - -/** - * Encode a delta index entry delta. - * - * @param delta_entry The delta index entry - **/ -static void encode_delta(const struct delta_index_entry *delta_entry) -{ - unsigned int temp, t1, t2; - const struct delta_memory *delta_zone = delta_entry->delta_zone; - byte *memory = delta_zone->memory; - uint64_t offset = - get_delta_entry_offset(delta_entry) + delta_entry->value_bits; - if (delta_entry->delta < delta_zone->min_keys) { - set_field(delta_entry->delta, - memory, - offset, - delta_zone->min_bits); - return; - } - temp = delta_entry->delta - delta_zone->min_keys; - t1 = (temp % delta_zone->incr_keys) + delta_zone->min_keys; - t2 = temp / delta_zone->incr_keys; - set_field(t1, memory, offset, delta_zone->min_bits); - set_zero(memory, offset + delta_zone->min_bits, t2); - set_one(memory, offset + delta_zone->min_bits + t2, 1); -} - -/** - * Encode a delta index entry. - * - * @param delta_entry The delta index entry - * @param value The value associated with the entry - * @param name For collision entries, the 256 bit full name. - **/ -static void encode_entry(const struct delta_index_entry *delta_entry, - unsigned int value, - const byte *name) -{ - byte *memory = delta_entry->delta_zone->memory; - uint64_t offset = get_delta_entry_offset(delta_entry); - set_field(value, memory, offset, delta_entry->value_bits); - encode_delta(delta_entry); - if (name != NULL) { - set_bytes(memory, - get_collision_offset(delta_entry), - name, - COLLISION_BYTES); - } -} - -/** - * Insert bits into a delta list at the offset of the specified delta index - * entry. 
- * - * @param delta_entry The delta index entry - * @param size The number of bits to insert - * - * @return UDS_SUCCESS or an error code - **/ -static int insert_bits(struct delta_index_entry *delta_entry, int size) -{ - uint64_t free_before, free_after, source, destination; - uint32_t count; - bool before_flag; - byte *memory; - struct delta_memory *delta_zone = delta_entry->delta_zone; - struct delta_list *delta_list = delta_entry->delta_list; - // Compute how many bits are in use before and after the inserted bits - uint32_t total_size = get_delta_list_size(delta_list); - uint32_t before_size = delta_entry->offset; - uint32_t after_size = total_size - delta_entry->offset; - if ((unsigned int) (total_size + size) > UINT16_MAX) { - delta_entry->list_overflow = true; - delta_zone->overflow_count++; - return UDS_OVERFLOW; - } - - // Compute how many bits are available before and after the delta list - free_before = get_delta_list_start(&delta_list[0]) - - get_delta_list_end(&delta_list[-1]); - free_after = get_delta_list_start(&delta_list[1]) - - get_delta_list_end(&delta_list[0]); - - if (((unsigned int) size <= free_before) && - ((unsigned int) size <= free_after)) { - // We have enough space to use either before or after the list. - // Prefer to move the least amount of data. If it is exactly - // the same, try to take from the larger amount of free space. - if (before_size < after_size) { - before_flag = true; - } else if (after_size < before_size) { - before_flag = false; - } else { - before_flag = free_before > free_after; - } - } else if ((unsigned int) size <= free_before) { - // There is space before but not after - before_flag = true; - } else if ((unsigned int) size <= free_after) { - // There is space after but not before - before_flag = false; - } else { - // Neither of the surrounding spaces is large enough for this - // request, Extend and/or rebalance the delta list memory - // choosing to move the least amount of data. - int result; - unsigned int growing_index = delta_entry->list_number + 1; - before_flag = before_size < after_size; - if (!before_flag) { - growing_index++; - } - result = - extend_delta_memory(delta_zone, - growing_index, - (size + CHAR_BIT - 1) / CHAR_BIT, - true); - if (result != UDS_SUCCESS) { - return result; - } - } - - if (before_flag) { - source = get_delta_list_start(delta_list); - destination = source - size; - move_delta_list_start(delta_list, -size); - count = before_size; - } else { - move_delta_list_end(delta_list, size); - source = - get_delta_list_start(delta_list) + delta_entry->offset; - destination = source + size; - count = after_size; - } - memory = delta_zone->memory; - move_bits(memory, source, memory, destination, count); - return UDS_SUCCESS; -} - -/** - * Get the amount of memory to allocate for each zone - * - * @param num_zones The number of zones in the index - * @param memory_size The number of bytes in memory for the index - * - * @return the number of bytes to allocate for a single zone - **/ -static INLINE size_t get_zone_memory_size(unsigned int num_zones, - size_t memory_size) -{ - size_t zone_size = memory_size / num_zones; - // Round the size up so that each zone is a multiple of 64K in size. 
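	// For example (illustrative numbers, not from the original
	// source): with two zones and memory_size = 200000 bytes,
	// zone_size is 100000 and (100000 + 65535) & -65536 == 131072,
	// so each zone is allocated two full 64K units.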
- enum { ALLOC_BOUNDARY = 64 * KILOBYTE }; - return (zone_size + ALLOC_BOUNDARY - 1) & -ALLOC_BOUNDARY; -} - -/** - * Validate delta index parameters - * - * @param mean_delta The mean delta value - * @param num_payload_bits The number of bits in the payload or value - **/ -static bool invalid_parameters(unsigned int mean_delta, - unsigned int num_payload_bits) -{ - const unsigned int min_delta = 10; - const unsigned int max_delta = 1 << MAX_FIELD_BITS; - if ((mean_delta < min_delta) || (mean_delta > max_delta)) { - uds_log_warning("error initializing delta index: mean delta (%u) is not in the range %u to %u", - mean_delta, - min_delta, - max_delta); - return true; - } - if (num_payload_bits > MAX_FIELD_BITS) { - uds_log_warning("error initializing delta index: Too many payload bits (%u)", - num_payload_bits); - return true; - } - return false; -} - -/** - * Set a delta index entry to be a collision - * - * @param delta_entry The delta index entry - **/ -static void set_collision(struct delta_index_entry *delta_entry) -{ - delta_entry->is_collision = true; - delta_entry->entry_bits += COLLISION_BITS; -} - -/** - * Set the delta in a delta index entry. - * - * @param delta_entry The delta index entry - * @param delta The new delta - **/ -static void set_delta(struct delta_index_entry *delta_entry, unsigned int delta) -{ - const struct delta_memory *delta_zone = delta_entry->delta_zone; - int key_bits = delta_zone->min_bits + - ((delta_zone->incr_keys - - delta_zone->min_keys + delta) / - delta_zone->incr_keys); - delta_entry->delta = delta; - delta_entry->entry_bits = delta_entry->value_bits + key_bits; -} - -//********************************************************************** -// External functions declared in delta_index.h -//********************************************************************** - -int initialize_delta_index(struct delta_index *delta_index, - unsigned int num_zones, - unsigned int num_lists, - unsigned int mean_delta, - unsigned int num_payload_bits, - size_t memory_size) -{ - int result; - unsigned int z; - size_t mem_size = get_zone_memory_size(num_zones, memory_size); - if (invalid_parameters(mean_delta, num_payload_bits)) { - return UDS_INVALID_ARGUMENT; - } - - result = UDS_ALLOCATE(num_zones, - struct delta_memory, - "Delta Index Zones", - &delta_index->delta_zones); - if (result != UDS_SUCCESS) { - return result; - } - - delta_index->num_zones = num_zones; - delta_index->num_lists = num_lists; - delta_index->lists_per_zone = (num_lists + num_zones - 1) / num_zones; - delta_index->is_mutable = true; - delta_index->tag = 'm'; - - for (z = 0; z < num_zones; z++) { - unsigned int first_list_in_zone = - z * delta_index->lists_per_zone; - unsigned int num_lists_in_zone = delta_index->lists_per_zone; - if (z == num_zones - 1) { - /* - * The last zone gets fewer lists if num_zones doesn't - * evenly divide num_lists. We'll have an underflow if - * the assertion below doesn't hold. (And it turns out - * that the assertion is equivalent to num_zones <= 1 + - * (num_lists / num_zones) + (num_lists % num_zones) in - * the case that num_zones doesn't evenly divide - * numlists. If num_lists >= num_zones * num_zones, - * then the above inequality will always hold.) 
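			 *
			 * For example (illustrative numbers, not from
			 * the original source): with num_lists = 10 and
			 * num_zones = 4, lists_per_zone is
			 * (10 + 3) / 4 = 3, zones 0 through 2 take
			 * lists 0 through 8, and this last zone is
			 * trimmed to the single remaining list 9.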
- */ - if (delta_index->num_lists <= first_list_in_zone) { - uninitialize_delta_index(delta_index); - return uds_log_error_strerror(UDS_INVALID_ARGUMENT, - "%u delta-lists not enough for %u zones", - num_lists, - num_zones); - } - num_lists_in_zone = - delta_index->num_lists - first_list_in_zone; - } - result = initialize_delta_memory(&delta_index->delta_zones[z], - mem_size, - first_list_in_zone, - num_lists_in_zone, - mean_delta, - num_payload_bits); - if (result != UDS_SUCCESS) { - uninitialize_delta_index(delta_index); - return result; - } - } - return UDS_SUCCESS; -} - -/**********************************************************************/ -static bool verify_delta_index_page(uint64_t nonce, - uint16_t num_lists, - uint64_t expected_nonce, - byte *memory, - size_t mem_size) -{ - unsigned int i; - // Verify the nonce. A mismatch here happens in normal operation when - // we are doing a rebuild but haven't written the entire volume once. - if (nonce != expected_nonce) { - return false; - } - - // Verify that the number of delta lists can fit in the page. - if (num_lists > (mem_size - sizeof(struct delta_page_header)) * - CHAR_BIT / IMMUTABLE_HEADER_SIZE) { - return false; - } - - // Verify that the first delta list is immediately after the last delta - // list header. - if (get_immutable_start(memory, 0) != - get_immutable_header_offset(num_lists + 1)) { - return false; - } - - // Verify that the lists are in the correct order. - for (i = 0; i < num_lists; i++) { - if (get_immutable_start(memory, i) > - get_immutable_start(memory, i + 1)) { - return false; - } - } - - // Verify that the last list ends on the page, and that there is room - // for the post-field guard bits. - if (get_immutable_start(memory, num_lists) > - (mem_size - POST_FIELD_GUARD_BYTES) * CHAR_BIT) { - return false; - } - - // Verify that the guard bytes are correctly set to all ones. - for (i = 0; i < POST_FIELD_GUARD_BYTES; i++) { - byte guard_byte = memory[mem_size - POST_FIELD_GUARD_BYTES + i]; - if (guard_byte != (byte) ~0) { - return false; - } - } - - // All verifications passed. - return true; -} - -/**********************************************************************/ -int initialize_delta_index_page(struct delta_index_page *delta_index_page, - uint64_t expected_nonce, - unsigned int mean_delta, - unsigned int num_payload_bits, - byte *memory, - size_t mem_size) -{ - uint64_t nonce, vcn, first_list, num_lists; - const struct delta_page_header *header = - (const struct delta_page_header *) memory; - - if (invalid_parameters(mean_delta, num_payload_bits)) { - return UDS_INVALID_ARGUMENT; - } - - // First assume that the header is little endian - nonce = get_unaligned_le64((const byte *) &header->nonce); - vcn = get_unaligned_le64((const byte *) &header->virtual_chapter_number); - first_list = get_unaligned_le16((const byte *) &header->first_list); - num_lists = get_unaligned_le16((const byte *) &header->num_lists); - if (!verify_delta_index_page(nonce, num_lists, expected_nonce, memory, - mem_size)) { - // That failed, so try big endian - nonce = get_unaligned_be64((const byte *) &header->nonce); - vcn = get_unaligned_be64((const byte *) &header->virtual_chapter_number); - first_list = - get_unaligned_be16((const byte *) &header->first_list); - num_lists = - get_unaligned_be16((const byte *) &header->num_lists); - if (!verify_delta_index_page(nonce, - num_lists, - expected_nonce, - memory, - mem_size)) { - // Also failed. Do not log this as an error. 
It - // happens in normal operation when we are doing a - // rebuild but haven't written the entire volume once. - return UDS_CORRUPT_COMPONENT; - } - } - - delta_index_page->delta_index.delta_zones = - &delta_index_page->delta_memory; - delta_index_page->delta_index.num_zones = 1; - delta_index_page->delta_index.num_lists = num_lists; - delta_index_page->delta_index.lists_per_zone = num_lists; - delta_index_page->delta_index.is_mutable = false; - delta_index_page->delta_index.tag = 'p'; - delta_index_page->virtual_chapter_number = vcn; - delta_index_page->lowest_list_number = first_list; - delta_index_page->highest_list_number = first_list + num_lists - 1; - - initialize_delta_memory_page(&delta_index_page->delta_memory, - (byte *) memory, - mem_size, - num_lists, - mean_delta, - num_payload_bits); - return UDS_SUCCESS; -} - -/**********************************************************************/ -void uninitialize_delta_index(struct delta_index *delta_index) -{ - if (delta_index != NULL) { - unsigned int z; - for (z = 0; z < delta_index->num_zones; z++) { - uninitialize_delta_memory(&delta_index->delta_zones[z]); - } - UDS_FREE(delta_index->delta_zones); - memset(delta_index, 0, sizeof(struct delta_index)); - } -} - -/**********************************************************************/ -void empty_delta_index(const struct delta_index *delta_index) -{ - unsigned int z; - for (z = 0; z < delta_index->num_zones; z++) { - empty_delta_lists(&delta_index->delta_zones[z]); - } -} - -/**********************************************************************/ -void empty_delta_index_zone(const struct delta_index *delta_index, - unsigned int zone_number) -{ - empty_delta_lists(&delta_index->delta_zones[zone_number]); -} - -/**********************************************************************/ -int pack_delta_index_page(const struct delta_index *delta_index, - uint64_t header_nonce, - byte *memory, - size_t mem_size, - uint64_t virtual_chapter_number, - unsigned int first_list, - unsigned int *num_lists) -{ - const struct delta_memory *delta_zone; - struct delta_list *delta_lists; - unsigned int max_lists, n_lists = 0, offset, i; - int num_bits; - struct delta_page_header *header; - if (!delta_index->is_mutable) { - return uds_log_error_strerror(UDS_BAD_STATE, - "Cannot pack an immutable index"); - } - if (delta_index->num_zones != 1) { - return uds_log_error_strerror(UDS_BAD_STATE, - "Cannot pack a delta index page when the index has %u zones", - delta_index->num_zones); - } - if (first_list > delta_index->num_lists) { - return uds_log_error_strerror(UDS_BAD_STATE, - "Cannot pack a delta index page when the first list (%u) is larger than the number of lists (%u)", - first_list, - delta_index->num_lists); - } - - delta_zone = &delta_index->delta_zones[0]; - delta_lists = - &delta_zone->delta_lists[first_list + 1]; - max_lists = delta_index->num_lists - first_list; - - // Compute how many lists will fit on the page - num_bits = mem_size * CHAR_BIT; - // Subtract the size of the fixed header and 1 delta list offset - num_bits -= get_immutable_header_offset(1); - // Subtract the guard bytes of memory so that allow us to freely read a - // short distance past the end of any byte we are interested in. 
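	// For example (illustrative arithmetic, not from the original
	// source): on a 4096-byte page the budget starts at 32768 bits,
	// less the 160-bit packed page header, one 19-bit list offset,
	// and the guard bits subtracted below; each list then costs its
	// own 19-bit offset plus its encoded data.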
- num_bits -= POST_FIELD_GUARD_BYTES * CHAR_BIT; - if (num_bits < IMMUTABLE_HEADER_SIZE) { - // This page is too small to contain even one empty delta list - return uds_log_error_strerror(UDS_OVERFLOW, - "Chapter Index Page of %zu bytes is too small", - mem_size); - } - - while (n_lists < max_lists) { - // Each list requires 1 delta list offset and the list data - int bits = IMMUTABLE_HEADER_SIZE + - get_delta_list_size(&delta_lists[n_lists]); - if (bits > num_bits) { - break; - } - n_lists++; - num_bits -= bits; - } - *num_lists = n_lists; - - // Construct the page header - header = (struct delta_page_header *) memory; - put_unaligned_le64(header_nonce, (byte *) &header->nonce); - put_unaligned_le64(virtual_chapter_number, - (byte *) &header->virtual_chapter_number); - put_unaligned_le16(first_list, (byte *) &header->first_list); - put_unaligned_le16(n_lists, (byte *) &header->num_lists); - - // Construct the delta list offset table, making sure that the memory - // page is large enough. - offset = get_immutable_header_offset(n_lists + 1); - set_immutable_start(memory, 0, offset); - for (i = 0; i < n_lists; i++) { - offset += get_delta_list_size(&delta_lists[i]); - set_immutable_start(memory, i + 1, offset); - } - - // Copy the delta list data onto the memory page - for (i = 0; i < n_lists; i++) { - struct delta_list *delta_list = &delta_lists[i]; - move_bits(delta_zone->memory, - get_delta_list_start(delta_list), - memory, - get_immutable_start(memory, i), - get_delta_list_size(delta_list)); - } - - // Set all the bits in the guard bytes. Do not use the bit field - // utilities. - memset(memory + mem_size - POST_FIELD_GUARD_BYTES, - ~0, - POST_FIELD_GUARD_BYTES); - return UDS_SUCCESS; -} - - -/**********************************************************************/ -void set_delta_index_tag(struct delta_index *delta_index, byte tag) -{ - unsigned int z; - delta_index->tag = tag; - for (z = 0; z < delta_index->num_zones; z++) { - delta_index->delta_zones[z].tag = tag; - } -} - -/**********************************************************************/ -static int __must_check decode_delta_index_header(struct buffer *buffer, - struct di_header *header) -{ - int result = get_bytes_from_buffer(buffer, MAGIC_SIZE, &header->magic); - if (result != UDS_SUCCESS) { - return result; - } - result = get_uint32_le_from_buffer(buffer, &header->zone_number); - if (result != UDS_SUCCESS) { - return result; - } - result = get_uint32_le_from_buffer(buffer, &header->num_zones); - if (result != UDS_SUCCESS) { - return result; - } - result = get_uint32_le_from_buffer(buffer, &header->first_list); - if (result != UDS_SUCCESS) { - return result; - } - result = get_uint32_le_from_buffer(buffer, &header->num_lists); - if (result != UDS_SUCCESS) { - return result; - } - result = get_uint64_le_from_buffer(buffer, &header->record_count); - if (result != UDS_SUCCESS) { - return result; - } - result = get_uint64_le_from_buffer(buffer, &header->collision_count); - if (result != UDS_SUCCESS) { - return result; - } - result = ASSERT_LOG_ONLY(content_length(buffer) == 0, - "%zu bytes decoded of %zu expected", - buffer_length(buffer) - content_length(buffer), - buffer_length(buffer)); - return result; -} - -/**********************************************************************/ -static int __must_check read_delta_index_header(struct buffered_reader *reader, - struct di_header *header) -{ - struct buffer *buffer; - - int result = make_buffer(sizeof(*header), &buffer); - if (result != UDS_SUCCESS) { - return result; - } - - 
result = read_from_buffered_reader(reader, get_buffer_contents(buffer), - buffer_length(buffer)); - if (result != UDS_SUCCESS) { - free_buffer(UDS_FORGET(buffer)); - return uds_log_warning_strerror(result, - "failed to read delta index header"); - } - - result = reset_buffer_end(buffer, buffer_length(buffer)); - if (result != UDS_SUCCESS) { - free_buffer(UDS_FORGET(buffer)); - return result; - } - - result = decode_delta_index_header(buffer, header); - free_buffer(UDS_FORGET(buffer)); - return result; -} - -/**********************************************************************/ -int start_restoring_delta_index(const struct delta_index *delta_index, - struct buffered_reader **buffered_readers, - int num_readers) -{ - unsigned int num_zones = num_readers; - unsigned long record_count = 0, collision_count = 0; - unsigned int first_list[MAX_ZONES], num_lists[MAX_ZONES]; - struct buffered_reader *reader[MAX_ZONES]; - unsigned int z, list_next = 0; - bool zone_flags[MAX_ZONES] = { - false, - }; - - if (!delta_index->is_mutable) { - return uds_log_error_strerror(UDS_BAD_STATE, - "Cannot restore to an immutable index"); - } - if (num_readers <= 0) { - return uds_log_warning_strerror(UDS_INVALID_ARGUMENT, - "No delta index files"); - } - - if (num_zones > MAX_ZONES) { - return uds_log_error_strerror(UDS_INVALID_ARGUMENT, - "zone count %u must not exceed MAX_ZONES", - num_zones); - } - - // Read the header from each file, and make sure we have a matching set - for (z = 0; z < num_zones; z++) { - struct di_header header; - int result = - read_delta_index_header(buffered_readers[z], &header); - if (result != UDS_SUCCESS) { - return uds_log_warning_strerror(result, - "failed to read delta index header"); - } - if (memcmp(header.magic, MAGIC_DI_START, MAGIC_SIZE) != 0) { - return uds_log_warning_strerror(UDS_CORRUPT_COMPONENT, - "delta index file has bad magic number"); - } - if (num_zones != header.num_zones) { - return uds_log_warning_strerror(UDS_CORRUPT_COMPONENT, - "delta index files contain mismatched zone counts (%u,%u)", - num_zones, - header.num_zones); - } - if (header.zone_number >= num_zones) { - return uds_log_warning_strerror(UDS_CORRUPT_COMPONENT, - "delta index files contains zone %u of %u zones", - header.zone_number, - num_zones); - } - if (zone_flags[header.zone_number]) { - return uds_log_warning_strerror(UDS_CORRUPT_COMPONENT, - "delta index files contain two of zone %u", - header.zone_number); - } - reader[header.zone_number] = buffered_readers[z]; - first_list[header.zone_number] = header.first_list; - num_lists[header.zone_number] = header.num_lists; - zone_flags[header.zone_number] = true; - record_count += header.record_count; - collision_count += header.collision_count; - } - for (z = 0; z < num_zones; z++) { - if (first_list[z] != list_next) { - return uds_log_warning_strerror(UDS_CORRUPT_COMPONENT, - "delta index file for zone %u starts with list %u instead of list %u", - z, - first_list[z], - list_next); - } - list_next += num_lists[z]; - } - if (list_next != delta_index->num_lists) { - return uds_log_warning_strerror(UDS_CORRUPT_COMPONENT, - "delta index files contain %u delta lists instead of %u delta lists", - list_next, - delta_index->num_lists); - } - if (collision_count > record_count) { - return uds_log_warning_strerror(UDS_CORRUPT_COMPONENT, - "delta index files contain %ld collisions and %ld records", - collision_count, - record_count); - } - - empty_delta_index(delta_index); - delta_index->delta_zones[0].record_count = record_count; - 
delta_index->delta_zones[0].collision_count = collision_count; - - // Read the delta list sizes from the files, and distribute each of - // them to proper zone - for (z = 0; z < num_zones; z++) { - unsigned int i; - for (i = 0; i < num_lists[z]; i++) { - uint16_t delta_list_size; - unsigned int list_number, zone_number; - const struct delta_memory *delta_zone; - byte delta_list_size_data[sizeof(uint16_t)]; - int result = - read_from_buffered_reader(reader[z], - delta_list_size_data, - sizeof(delta_list_size_data)); - if (result != UDS_SUCCESS) { - return uds_log_warning_strerror(result, - "failed to read delta index size"); - } - delta_list_size = get_unaligned_le16(delta_list_size_data); - list_number = first_list[z] + i; - zone_number = get_delta_index_zone(delta_index, list_number); - delta_zone = &delta_index->delta_zones[zone_number]; - list_number -= delta_zone->first_list; - delta_zone->delta_lists[list_number + 1].size = - delta_list_size; - } - } - - // Prepare each zone to start receiving the delta list data - for (z = 0; z < delta_index->num_zones; z++) { - int result = - start_restoring_delta_memory(&delta_index->delta_zones[z]); - if (result != UDS_SUCCESS) { - return result; - } - } - return UDS_SUCCESS; -} - -/**********************************************************************/ -bool is_restoring_delta_index_done(const struct delta_index *delta_index) -{ - unsigned int z; - for (z = 0; z < delta_index->num_zones; z++) { - if (!are_delta_memory_transfers_done(&delta_index->delta_zones[z])) { - return false; - } - } - return true; -} - -/**********************************************************************/ -int restore_delta_list_to_delta_index(const struct delta_index *delta_index, - const struct delta_list_save_info *dlsi, - const byte data[DELTA_LIST_MAX_BYTE_COUNT]) -{ - unsigned int zone_number; - // Make sure the data are intended for this delta list. Do not - // log an error, as this may be valid data for another delta index. 
- if (dlsi->tag != delta_index->tag) { - return UDS_CORRUPT_COMPONENT; - } - - if (dlsi->index >= delta_index->num_lists) { - return uds_log_warning_strerror(UDS_CORRUPT_COMPONENT, - "invalid delta list number %u of %u", - dlsi->index, - delta_index->num_lists); - } - - zone_number = get_delta_index_zone(delta_index, dlsi->index); - return restore_delta_list(&delta_index->delta_zones[zone_number], - dlsi, data); -} - -/**********************************************************************/ -void abort_restoring_delta_index(const struct delta_index *delta_index) -{ - unsigned int z; - for (z = 0; z < delta_index->num_zones; z++) { - abort_restoring_delta_memory(&delta_index->delta_zones[z]); - } -} - -/**********************************************************************/ -static int __must_check encode_delta_index_header(struct buffer *buffer, - struct di_header *header) -{ - int result = put_bytes(buffer, MAGIC_SIZE, MAGIC_DI_START); - if (result != UDS_SUCCESS) { - return result; - } - result = put_uint32_le_into_buffer(buffer, header->zone_number); - if (result != UDS_SUCCESS) { - return result; - } - result = put_uint32_le_into_buffer(buffer, header->num_zones); - if (result != UDS_SUCCESS) { - return result; - } - result = put_uint32_le_into_buffer(buffer, header->first_list); - if (result != UDS_SUCCESS) { - return result; - } - result = put_uint32_le_into_buffer(buffer, header->num_lists); - if (result != UDS_SUCCESS) { - return result; - } - result = put_uint64_le_into_buffer(buffer, header->record_count); - if (result != UDS_SUCCESS) { - return result; - } - result = put_uint64_le_into_buffer(buffer, header->collision_count); - if (result != UDS_SUCCESS) { - return result; - } - result = ASSERT_LOG_ONLY(content_length(buffer) == sizeof(*header), - "%zu bytes encoded of %zu expected", - content_length(buffer), - sizeof(*header)); - - return result; -} - -/**********************************************************************/ -int start_saving_delta_index(const struct delta_index *delta_index, - unsigned int zone_number, - struct buffered_writer *buffered_writer) -{ - struct buffer *buffer; - int result; - unsigned int i; - struct delta_memory *delta_zone = - &delta_index->delta_zones[zone_number]; - struct di_header header; - memcpy(header.magic, MAGIC_DI_START, MAGIC_SIZE); - header.zone_number = zone_number; - header.num_zones = delta_index->num_zones; - header.first_list = delta_zone->first_list; - header.num_lists = delta_zone->num_lists; - header.record_count = delta_zone->record_count; - header.collision_count = delta_zone->collision_count; - - result = make_buffer(sizeof(struct di_header), &buffer); - if (result != UDS_SUCCESS) { - return result; - } - - result = encode_delta_index_header(buffer, &header); - if (result != UDS_SUCCESS) { - free_buffer(UDS_FORGET(buffer)); - return result; - } - - result = write_to_buffered_writer(buffered_writer, - get_buffer_contents(buffer), - content_length(buffer)); - free_buffer(UDS_FORGET(buffer)); - if (result != UDS_SUCCESS) { - return uds_log_warning_strerror(result, - "failed to write delta index header"); - } - - for (i = 0; i < delta_zone->num_lists; i++) { - uint16_t delta_list_size = - get_delta_list_size(&delta_zone->delta_lists[i + 1]); - byte data[2]; - put_unaligned_le16(delta_list_size, data); - result = write_to_buffered_writer(buffered_writer, data, - sizeof(data)); - if (result != UDS_SUCCESS) { - return uds_log_warning_strerror(result, - "failed to write delta list size"); - } - } - - 
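	/*
	 * At this point the save stream holds the di_header followed by
	 * one little-endian uint16_t size per delta list in this zone.
	 * The call below hands the writer to the delta_memory layer,
	 * which presumably streams the list data itself (see
	 * compute_delta_index_save_bytes for the full size accounting).
	 */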
start_saving_delta_memory(delta_zone, buffered_writer); - return UDS_SUCCESS; -} - -/**********************************************************************/ -bool is_saving_delta_index_done(const struct delta_index *delta_index, - unsigned int zone_number) -{ - return are_delta_memory_transfers_done(&delta_index->delta_zones[zone_number]); -} - -/**********************************************************************/ -int finish_saving_delta_index(const struct delta_index *delta_index, - unsigned int zone_number) -{ - return finish_saving_delta_memory(&delta_index->delta_zones[zone_number]); -} - -/**********************************************************************/ -int abort_saving_delta_index(const struct delta_index *delta_index, - unsigned int zone_number) -{ - abort_saving_delta_memory(&delta_index->delta_zones[zone_number]); - return UDS_SUCCESS; -} - -/**********************************************************************/ -size_t compute_delta_index_save_bytes(unsigned int num_lists, - size_t memory_size) -{ - // The exact amount of memory used depends upon the number of zones. - // Compute the maximum potential memory size. - size_t max_mem_size = memory_size; - unsigned int num_zones; - for (num_zones = 1; num_zones <= MAX_ZONES; num_zones++) { - size_t mem_size = get_zone_memory_size(num_zones, memory_size); - if (mem_size > max_mem_size) { - max_mem_size = mem_size; - } - } - // Saving a delta index requires a header ... - return (sizeof(struct di_header) - // ... plus a delta_list_save_info per delta list - // plus an extra byte per delta list ... - + num_lists * (sizeof(struct delta_list_save_info) + 1) - // ... plus the delta list memory - + max_mem_size); -} - -/**********************************************************************/ -int validate_delta_index(const struct delta_index *delta_index) -{ - unsigned int z; - for (z = 0; z < delta_index->num_zones; z++) { - int result = - validate_delta_lists(&delta_index->delta_zones[z]); - if (result != UDS_SUCCESS) { - return result; - } - } - return UDS_SUCCESS; -} - -/**********************************************************************/ -static int assert_not_at_end(const struct delta_index_entry *delta_entry, - int error_code) -{ - return ASSERT_WITH_ERROR_CODE(!delta_entry->at_end, error_code, - "operation is invalid because the list entry is at the end of the delta list"); -} - -/**********************************************************************/ -static void prefetch_delta_list(const struct delta_memory *delta_zone, - const struct delta_list *delta_list) -{ - const byte *memory = delta_zone->memory; - const byte *addr = - &memory[get_delta_list_start(delta_list) / CHAR_BIT]; - unsigned int size = get_delta_list_size(delta_list) / CHAR_BIT; - prefetch_range(addr, size, false); -} - -/**********************************************************************/ -int start_delta_index_search(const struct delta_index *delta_index, - unsigned int list_number, - unsigned int key, - bool read_only, - struct delta_index_entry *delta_entry) -{ - unsigned int zone_number; - struct delta_memory *delta_zone; - struct delta_list *delta_list; - int result = ASSERT_WITH_ERROR_CODE((list_number < - delta_index->num_lists), - UDS_CORRUPT_DATA, - "Delta list number (%u) is out of range (%u)", - list_number, - delta_index->num_lists); - if (result != UDS_SUCCESS) { - return result; - } - - zone_number = get_delta_index_zone(delta_index, list_number); - delta_zone = &delta_index->delta_zones[zone_number]; - list_number -= 
delta_zone->first_list; - result = ASSERT_WITH_ERROR_CODE((list_number < delta_zone->num_lists), - UDS_CORRUPT_DATA, - "Delta list number (%u)" - " is out of range (%u) for zone (%u)", - list_number, - delta_zone->num_lists, - zone_number); - if (result != UDS_SUCCESS) { - return result; - } - - if (delta_index->is_mutable) { - delta_list = &delta_zone->delta_lists[list_number + 1]; - if (!read_only) { - // Here is the lazy writing of the index for a - // checkpoint - lazy_flush_delta_list(delta_zone, list_number); - } - } else { - unsigned int end_offset; - // Translate the immutable delta list header into a temporary - // full delta list header - delta_list = &delta_entry->temp_delta_list; - delta_list->start_offset = - get_immutable_start(delta_zone->memory, list_number); - end_offset = get_immutable_start(delta_zone->memory, - list_number + 1); - delta_list->size = end_offset - delta_list->start_offset; - delta_list->save_key = 0; - delta_list->save_offset = 0; - } - - if (key > delta_list->save_key) { - delta_entry->key = delta_list->save_key; - delta_entry->offset = delta_list->save_offset; - } else { - delta_entry->key = 0; - delta_entry->offset = 0; - if (key == 0) { - // This usually means we're about to walk the entire - // delta list, so get all of it into the CPU cache. - prefetch_delta_list(delta_zone, delta_list); - } - } - - delta_entry->at_end = false; - delta_entry->delta_zone = delta_zone; - delta_entry->delta_list = delta_list; - delta_entry->entry_bits = 0; - delta_entry->is_collision = false; - delta_entry->list_number = list_number; - delta_entry->list_overflow = false; - delta_entry->value_bits = delta_zone->value_bits; - return UDS_SUCCESS; -} - -/**********************************************************************/ -noinline int next_delta_index_entry(struct delta_index_entry *delta_entry) -{ - const struct delta_list *delta_list; - unsigned int next_offset, size; - int result = assert_not_at_end(delta_entry, UDS_BAD_STATE); - if (result != UDS_SUCCESS) { - return result; - } - - delta_list = delta_entry->delta_list; - delta_entry->offset += delta_entry->entry_bits; - size = get_delta_list_size(delta_list); - if (unlikely(delta_entry->offset >= size)) { - delta_entry->at_end = true; - delta_entry->delta = 0; - delta_entry->is_collision = false; - return ASSERT_WITH_ERROR_CODE((delta_entry->offset == size), - UDS_CORRUPT_DATA, - "next offset past end of delta list"); - } - - decode_delta(delta_entry); - - next_offset = delta_entry->offset + delta_entry->entry_bits; - if (next_offset > size) { - // This is not an assertion because - // validate_chapter_index_page() wants to handle this error. 
- uds_log_warning("Decoded past the end of the delta list"); - return UDS_CORRUPT_DATA; - } - return UDS_SUCCESS; -} - -/**********************************************************************/ -int remember_delta_index_offset(const struct delta_index_entry *delta_entry) -{ - struct delta_list *delta_list = delta_entry->delta_list; - int result = - ASSERT(!delta_entry->is_collision, "entry is not a collision"); - if (result != UDS_SUCCESS) { - return result; - } - - delta_list->save_key = delta_entry->key - delta_entry->delta; - delta_list->save_offset = delta_entry->offset; - return UDS_SUCCESS; -} - -/**********************************************************************/ -int get_delta_index_entry(const struct delta_index *delta_index, - unsigned int list_number, - unsigned int key, - const byte *name, - bool read_only, - struct delta_index_entry *delta_entry) -{ - int result = start_delta_index_search(delta_index, list_number, key, - read_only, delta_entry); - if (result != UDS_SUCCESS) { - return result; - } - do { - result = next_delta_index_entry(delta_entry); - if (result != UDS_SUCCESS) { - return result; - } - } while (!delta_entry->at_end && (key > delta_entry->key)); - - result = remember_delta_index_offset(delta_entry); - if (result != UDS_SUCCESS) { - return result; - } - - if (!delta_entry->at_end && (key == delta_entry->key)) { - struct delta_index_entry collision_entry; - collision_entry = *delta_entry; - for (;;) { - byte collision_name[COLLISION_BYTES]; - result = next_delta_index_entry(&collision_entry); - if (result != UDS_SUCCESS) { - return result; - } - if (collision_entry.at_end || - !collision_entry.is_collision) { - break; - } - get_bytes(delta_entry->delta_zone->memory, - get_collision_offset(&collision_entry), - collision_name, - COLLISION_BYTES); - if (memcmp(collision_name, name, COLLISION_BYTES) == - 0) { - *delta_entry = collision_entry; - break; - } - } - } - return UDS_SUCCESS; -} - -/**********************************************************************/ -int get_delta_entry_collision(const struct delta_index_entry *delta_entry, - byte *name) -{ - int result = assert_not_at_end(delta_entry, UDS_BAD_STATE); - if (result != UDS_SUCCESS) { - return result; - } - result = ASSERT_WITH_ERROR_CODE(delta_entry->is_collision, - UDS_BAD_STATE, - "Cannot get full block name from a" - " non-collision delta index entry"); - if (result != UDS_SUCCESS) { - return result; - } - - get_bytes(delta_entry->delta_zone->memory, - get_collision_offset(delta_entry), - name, - COLLISION_BYTES); - return UDS_SUCCESS; -} - -/**********************************************************************/ -static int assert_mutable_entry(const struct delta_index_entry *delta_entry) -{ - return ASSERT_WITH_ERROR_CODE(delta_entry->delta_list != - &delta_entry->temp_delta_list, - UDS_BAD_STATE, - "delta index is mutable"); -} - -/**********************************************************************/ -int set_delta_entry_value(const struct delta_index_entry *delta_entry, - unsigned int value) -{ - int result = assert_mutable_entry(delta_entry); - if (result != UDS_SUCCESS) { - return result; - } - result = assert_not_at_end(delta_entry, UDS_BAD_STATE); - if (result != UDS_SUCCESS) { - return result; - } - - result = ASSERT_WITH_ERROR_CODE(((value & ((1 << delta_entry->value_bits) - 1)) == value), - UDS_INVALID_ARGUMENT, - "Value (%u) being set in a delta index is too large (must fit in %u bits)", - value, - delta_entry->value_bits); - if (result != UDS_SUCCESS) { - return result; - } - - 
set_field(value, - delta_entry->delta_zone->memory, - get_delta_entry_offset(delta_entry), - delta_entry->value_bits); - return UDS_SUCCESS; -} - -/**********************************************************************/ -int put_delta_index_entry(struct delta_index_entry *delta_entry, - unsigned int key, - unsigned int value, - const byte *name) -{ - struct delta_memory *delta_zone; - int result = assert_mutable_entry(delta_entry); - if (result != UDS_SUCCESS) { - return result; - } - if (delta_entry->is_collision) { - /* - * The caller wants us to insert a collision entry onto a - * collision entry. This happens when we find a collision and - * attempt to add the name again to the index. This is - * normally a fatal error unless we are replaying a closed - * chapter while we are rebuilding a volume index. - */ - return UDS_DUPLICATE_NAME; - } - - if (delta_entry->offset < delta_entry->delta_list->save_offset) { - // The saved entry offset is after the new entry and will no - // longer be valid, so replace it with the insertion point. - result = remember_delta_index_offset(delta_entry); - if (result != UDS_SUCCESS) { - return result; - } - } - - if (name != NULL) { - // We are inserting a collision entry which is placed after - // this entry - result = assert_not_at_end(delta_entry, UDS_BAD_STATE); - if (result != UDS_SUCCESS) { - return result; - } - result = ASSERT((key == delta_entry->key), - "incorrect key for collision entry"); - if (result != UDS_SUCCESS) { - return result; - } - - delta_entry->offset += delta_entry->entry_bits; - set_delta(delta_entry, 0); - set_collision(delta_entry); - result = insert_bits(delta_entry, delta_entry->entry_bits); - } else if (delta_entry->at_end) { - // We are inserting a new entry at the end of the delta list - result = ASSERT((key >= delta_entry->key), - "key past end of list"); - if (result != UDS_SUCCESS) { - return result; - } - - set_delta(delta_entry, key - delta_entry->key); - delta_entry->key = key; - delta_entry->at_end = false; - result = insert_bits(delta_entry, delta_entry->entry_bits); - } else { - int old_entry_size, additional_size; - struct delta_index_entry next_entry; - unsigned int next_value; - // We are inserting a new entry which requires the delta in the - // following entry to be updated. - result = ASSERT((key < delta_entry->key), - "key precedes following entry"); - if (result != UDS_SUCCESS) { - return result; - } - result = ASSERT((key >= delta_entry->key - delta_entry->delta), - "key effects following entry's delta"); - if (result != UDS_SUCCESS) { - return result; - } - - old_entry_size = delta_entry->entry_bits; - next_entry = *delta_entry; - next_value = get_delta_entry_value(&next_entry); - set_delta(delta_entry, - key - (delta_entry->key - delta_entry->delta)); - delta_entry->key = key; - set_delta(&next_entry, next_entry.key - key); - next_entry.offset += delta_entry->entry_bits; - // The 2 new entries are always bigger than the 1 entry we are - // replacing - additional_size = delta_entry->entry_bits + - next_entry.entry_bits - old_entry_size; - result = insert_bits(delta_entry, additional_size); - if (result != UDS_SUCCESS) { - return result; - } - encode_entry(&next_entry, next_value, NULL); - } - if (result != UDS_SUCCESS) { - return result; - } - encode_entry(delta_entry, value, name); - - delta_zone = delta_entry->delta_zone; - delta_zone->record_count++; - delta_zone->collision_count += delta_entry->is_collision ? 
1 : 0; - return UDS_SUCCESS; -} - -/**********************************************************************/ -int remove_delta_index_entry(struct delta_index_entry *delta_entry) -{ - struct delta_index_entry next_entry; - struct delta_memory *delta_zone; - struct delta_list *delta_list; - int result = assert_mutable_entry(delta_entry); - if (result != UDS_SUCCESS) { - return result; - } - - next_entry = *delta_entry; - result = next_delta_index_entry(&next_entry); - if (result != UDS_SUCCESS) { - return result; - } - - delta_zone = delta_entry->delta_zone; - - if (delta_entry->is_collision) { - // This is a collision entry, so just remove it - delete_bits(delta_entry, delta_entry->entry_bits); - next_entry.offset = delta_entry->offset; - delta_zone->collision_count -= 1; - } else if (next_entry.at_end) { - // This entry is at the end of the list, so just remove it - delete_bits(delta_entry, delta_entry->entry_bits); - next_entry.key -= delta_entry->delta; - next_entry.offset = delta_entry->offset; - } else { - // The delta in the next entry needs to be updated. - unsigned int next_value = get_delta_entry_value(&next_entry); - int old_size = delta_entry->entry_bits + next_entry.entry_bits; - if (next_entry.is_collision) { - // The next record is a collision. It needs to be - // rewritten as a non-collision with a larger delta. - next_entry.is_collision = false; - delta_zone->collision_count -= 1; - } - set_delta(&next_entry, delta_entry->delta + next_entry.delta); - next_entry.offset = delta_entry->offset; - // The 1 new entry is always smaller than the 2 entries we are - // replacing - delete_bits(delta_entry, old_size - next_entry.entry_bits); - encode_entry(&next_entry, next_value, NULL); - } - delta_zone->record_count--; - delta_zone->discard_count++; - *delta_entry = next_entry; - - delta_list = delta_entry->delta_list; - if (delta_entry->offset < delta_list->save_offset) { - // The saved entry offset is after the entry we just removed - // and it will no longer be valid. We must force the next - // search to start at the beginning. 
- delta_list->save_key = 0; - delta_list->save_offset = 0; - } - return UDS_SUCCESS; -} - -/**********************************************************************/ -unsigned int -get_delta_index_zone_first_list(const struct delta_index *delta_index, - unsigned int zone_number) -{ - return delta_index->delta_zones[zone_number].first_list; -} - -/**********************************************************************/ -unsigned int -get_delta_index_zone_num_lists(const struct delta_index *delta_index, - unsigned int zone_number) -{ - return delta_index->delta_zones[zone_number].num_lists; -} - -/**********************************************************************/ -uint64_t -get_delta_index_zone_dlist_bits_used(const struct delta_index *delta_index, - unsigned int zone_number) -{ - uint64_t bit_count = 0; - const struct delta_memory *delta_zone = - &delta_index->delta_zones[zone_number]; - unsigned int i; - for (i = 0; i < delta_zone->num_lists; i++) { - bit_count += - get_delta_list_size(&delta_zone->delta_lists[i + 1]); - } - return bit_count; -} - -/**********************************************************************/ -uint64_t get_delta_index_dlist_bits_used(const struct delta_index *delta_index) -{ - uint64_t bit_count = 0; - unsigned int z; - for (z = 0; z < delta_index->num_zones; z++) { - bit_count += - get_delta_index_zone_dlist_bits_used(delta_index, z); - } - return bit_count; -} - -/**********************************************************************/ -uint64_t -get_delta_index_dlist_bits_allocated(const struct delta_index *delta_index) -{ - uint64_t byte_count = 0; - unsigned int z; - for (z = 0; z < delta_index->num_zones; z++) { - const struct delta_memory *delta_zone = - &delta_index->delta_zones[z]; - byte_count += delta_zone->size; - } - return byte_count * CHAR_BIT; -} - -/**********************************************************************/ -void get_delta_index_stats(const struct delta_index *delta_index, - struct delta_index_stats *stats) -{ - unsigned int z; - memset(stats, 0, sizeof(struct delta_index_stats)); - stats->memory_allocated = - delta_index->num_zones * sizeof(struct delta_memory); - for (z = 0; z < delta_index->num_zones; z++) { - const struct delta_memory *delta_zone = - &delta_index->delta_zones[z]; - stats->memory_allocated += - get_delta_memory_allocated(delta_zone); - stats->rebalance_time += delta_zone->rebalance_time; - stats->rebalance_count += delta_zone->rebalance_count; - stats->record_count += delta_zone->record_count; - stats->collision_count += delta_zone->collision_count; - stats->discard_count += delta_zone->discard_count; - stats->overflow_count += delta_zone->overflow_count; - stats->num_lists += delta_zone->num_lists; - } -} - -/**********************************************************************/ -unsigned int get_delta_index_page_count(unsigned int num_entries, - unsigned int num_lists, - unsigned int mean_delta, - unsigned int num_payload_bits, - size_t bytes_per_page) -{ - // Compute the number of bits needed for all the entries - unsigned int bits_per_page; - size_t bits_per_index = - get_delta_memory_size(num_entries, mean_delta, - num_payload_bits); - // Compute the number of bits needed for a single delta list - unsigned int bits_per_delta_list = bits_per_index / num_lists; - // Adjust the bits per index, adding the immutable delta list headers - bits_per_index += num_lists * IMMUTABLE_HEADER_SIZE; - // Compute the number of usable bits on an immutable index page - bits_per_page = - (bytes_per_page - sizeof(struct 
delta_page_header)) * CHAR_BIT; - // Adjust the bits per page, taking away one immutable delta list header - // and one delta list representing internal fragmentation - bits_per_page -= IMMUTABLE_HEADER_SIZE + bits_per_delta_list; - // Now compute the number of pages needed - return (bits_per_index + bits_per_page - 1) / bits_per_page; -} - -/**********************************************************************/ -void log_delta_index_entry(struct delta_index_entry *delta_entry) -{ - uds_log_ratelimit(uds_log_info, - "List 0x%X Key 0x%X Offset 0x%X%s%s List_size 0x%X%s", - delta_entry->list_number, - delta_entry->key, - delta_entry->offset, - delta_entry->at_end ? " end" : "", - delta_entry->is_collision ? " collision" : "", - get_delta_list_size(delta_entry->delta_list), - delta_entry->list_overflow ? " overflow" : ""); - delta_entry->list_overflow = false; -} diff --git a/uds/deltaIndex.h b/uds/deltaIndex.h deleted file mode 100644 index 6f681b5a..00000000 --- a/uds/deltaIndex.h +++ /dev/null @@ -1,615 +0,0 @@ -/* - * Copyright Red Hat - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/uds-releases/krusty/src/uds/deltaIndex.h#16 $ - */ - -#ifndef DELTAINDEX_H -#define DELTAINDEX_H 1 - -#include "compiler.h" -#include "deltaMemory.h" - -enum { - // the number of extra bytes and bits needed to store a collision entry - COLLISION_BYTES = UDS_CHUNK_NAME_SIZE, - COLLISION_BITS = COLLISION_BYTES * CHAR_BIT -}; - -struct delta_index { - struct delta_memory *delta_zones; // The zones - unsigned int num_zones; // The number of zones - unsigned int num_lists; // The number of delta lists - unsigned int lists_per_zone; // Lists per zone (last zone can be - // smaller) - bool is_mutable; // True if this index is mutable - byte tag; // Tag belonging to this delta index -}; - -/* - * A delta_index_page describes a single page of a chapter index. The - * delta_index field allows the page to be treated as an immutable delta_index. - * We use the delta_memory field to treat the chapter index page as a single - * zone index, and without the need to do an additional memory allocation. - */ - -struct delta_index_page { - struct delta_index delta_index; - // These values are loaded from the DeltaPageHeader - unsigned int lowest_list_number; - unsigned int highest_list_number; - uint64_t virtual_chapter_number; - // This structure describes the single zone of a delta index page. - struct delta_memory delta_memory; -}; - -/* - * Notes on the delta_index_entries: - * - * The fields documented as "public" can be read by any code that uses a - * delta_index. The fields documented as "private" carry information - * between delta_index method calls and should not be used outside the - * delta_index module. - * - * (1) The delta_index_entry is used like an iterator when searching a delta - * list. 
- * - * (2) And it is also the result of a successful search and can be used to - * refer to the element found by the search. - * - * (3) And it is also the result of an unsuccessful search and can be used - * to refer to the insertion point for a new record. - * - * (4) If at_end==true, the delta_list entry can only be used as the insertion - * point for a new record at the end of the list. - * - * (5) If at_end==false and is_collision==true, the delta_list entry fields - * refer to a collision entry in the list, and the delta_list entry can - * be used a a reference to this entry. - * - * (6) If at_end==false and is_collision==false, the delta_list entry fields - * refer to a non-collision entry in the list. Such delta_list entries - * can be used as a reference to a found entry, or an insertion point - * for a non-collision entry before this entry, or an insertion point - * for a collision entry that collides with this entry. - */ - -struct delta_index_entry { - // Public fields - unsigned int key; // The key for this entry - bool at_end; // We are after the last entry in - // the list - bool is_collision; // This record is a collision - // Private fields (but DeltaIndex_t1 cheats and looks at them) - bool list_overflow; // This delta list overflowed - unsigned short value_bits; // The number of bits used for - // the value - unsigned short entry_bits; // The number of bits used for - // the entire entry - struct delta_memory *delta_zone; // The delta index zone - struct delta_list *delta_list; // The delta list containing - // the entry - unsigned int list_number; // The delta list number - uint32_t offset; // Bit offset of this entry within - // the list - unsigned int delta; // The delta between this and - // previous entry - struct delta_list temp_delta_list; // Temporary delta list for - // immutable indices -}; - -struct delta_index_stats { - size_t memory_allocated; // Number of bytes allocated - ktime_t rebalance_time; // Nanoseconds spent rebalancing - int rebalance_count; // Number of memory rebalances - long record_count; // The number of records in the index - long collision_count; // The number of collision records - long discard_count; // The number of records removed - long overflow_count; // The number of UDS_OVERFLOWs detected - unsigned int num_lists; // The number of delta lists -}; - -/** - * Initialize a delta index. - * - * @param delta_index The delta index to initialize - * @param num_zones The number of zones in the index - * @param num_lists The number of delta lists in the index - * @param mean_delta The mean delta value - * @param num_payload_bits The number of bits in the payload or value - * @param memory_size The number of bytes in memory for the index - * - * @return error code or UDS_SUCCESS - **/ -int __must_check initialize_delta_index(struct delta_index *delta_index, - unsigned int num_zones, - unsigned int num_lists, - unsigned int mean_delta, - unsigned int num_payload_bits, - size_t memory_size); - -/** - * Initialize an immutable delta index page. - * - * @param delta_index_page The delta index page to initialize - * @param expected_nonce If non-zero, the expected nonce. 
- * @param mean_delta The mean delta value - * @param num_payload_bits The number of bits in the payload or value - * @param memory The memory page - * @param mem_size The size of the memory page - * - * @return error code or UDS_SUCCESS - **/ -int __must_check -initialize_delta_index_page(struct delta_index_page *delta_index_page, - uint64_t expected_nonce, - unsigned int mean_delta, - unsigned int num_payload_bits, - byte *memory, - size_t mem_size); - -/** - * Uninitialize a delta index. - * - * @param delta_index The delta index to uninitialize - **/ -void uninitialize_delta_index(struct delta_index *delta_index); - -/** - * Empty the delta index. - * - * @param delta_index The delta index being emptied. - **/ -void empty_delta_index(const struct delta_index *delta_index); - -/** - * Empty a zone of the delta index. - * - * @param delta_index The delta index - * @param zone_number The zone being emptied - **/ -void empty_delta_index_zone(const struct delta_index *delta_index, - unsigned int zone_number); - -/** - * Pack delta lists from a mutable delta index into an immutable delta index - * page. A range of delta lists (starting with a specified list index) is - * copied from the mutable delta index into a memory page used in the immutable - * index. The number of lists copied onto the page is returned to the caller. - * - * @param delta_index The delta index being converted - * @param header_nonce The header nonce to store - * @param memory The memory page to use - * @param mem_size The size of the memory page - * @param virtual_chapter_number The virtual chapter number - * @param first_list The first delta list number to be copied - * @param num_lists The number of delta lists that were copied - * - * @return error code or UDS_SUCCESS. On UDS_SUCCESS, the numLists - * argument contains the number of lists copied. - **/ -int __must_check pack_delta_index_page(const struct delta_index *delta_index, - uint64_t header_nonce, - byte *memory, - size_t mem_size, - uint64_t virtual_chapter_number, - unsigned int first_list, - unsigned int *num_lists); - - -/** - * Set the tag value used when saving and/or restoring a delta index. - * - * @param delta_index The delta index - * @param tag The tag value - **/ -void set_delta_index_tag(struct delta_index *delta_index, byte tag); - -/** - * Start restoring a delta index from an input stream. - * - * @param delta_index The delta index to read into - * @param buffered_readers The buffered readers to read the delta index from - * @param num_readers The number of buffered readers - * - * @return UDS_SUCCESS on success, or an error code on failure - **/ -int __must_check -start_restoring_delta_index(const struct delta_index *delta_index, - struct buffered_reader **buffered_readers, - int num_readers); - -/** - * Have all the data been read while restoring a delta index from an - * input stream? - * - * @param delta_index The delta index - * - * @return true if all the data are read - **/ -bool is_restoring_delta_index_done(const struct delta_index *delta_index); - -/** - * Restore a saved delta list - * - * @param delta_index The delta index - * @param dlsi The delta_list_save_info describing the delta list - * @param data The saved delta list bit stream - * - * @return error code or UDS_SUCCESS - **/ -int __must_check -restore_delta_list_to_delta_index(const struct delta_index *delta_index, - const struct delta_list_save_info *dlsi, - const byte data[DELTA_LIST_MAX_BYTE_COUNT]); - -/** - * Abort restoring a delta index from an input stream. 
- * - * @param delta_index The delta index - **/ -void abort_restoring_delta_index(const struct delta_index *delta_index); - -/** - * Start saving a delta index zone to a buffered output stream. - * - * @param delta_index The delta index - * @param zone_number The zone number - * @param buffered_writer The index state component being written - * - * @return UDS_SUCCESS on success, or an error code on failure - **/ -int __must_check -start_saving_delta_index(const struct delta_index *delta_index, - unsigned int zone_number, - struct buffered_writer *buffered_writer); - -/** - * Have all the data been written while saving a delta index zone to an - * output stream? If the answer is yes, it is still necessary to call - * finish_saving_delta_index(), which will return quickly. - * - * @param delta_index The delta index - * @param zone_number The zone number - * - * @return true if all the data are written - **/ -bool is_saving_delta_index_done(const struct delta_index *delta_index, - unsigned int zone_number); - -/** - * Finish saving a delta index zone to an output stream. Force the writing - * of all of the remaining data. If an error occurred asynchronously - * during the save operation, it will be returned here. - * - * @param delta_index The delta index - * @param zone_number The zone number - * - * @return UDS_SUCCESS on success, or an error code on failure - **/ -int __must_check finish_saving_delta_index(const struct delta_index *delta_index, - unsigned int zone_number); - -/** - * Abort saving a delta index zone to an output stream. If an error - * occurred asynchronously during the save operation, it will be dropped. - * - * @param delta_index The delta index - * @param zone_number The zone number - * - * @return UDS_SUCCESS on success, or an error code on failure - **/ -int __must_check abort_saving_delta_index(const struct delta_index *delta_index, - unsigned int zone_number); - -/** - * Compute the number of bytes required to save a delta index - * - * @param num_lists The number of delta lists in the index - * @param memory_size The number of bytes in memory for the index - * - * @return num_bytes The number of bytes required to save the volume index - **/ -size_t __must_check compute_delta_index_save_bytes(unsigned int num_lists, - size_t memory_size); - -/** - * Validate the delta index - * - * @param delta_index The delta index - * - * @return UDS_SUCCESS on success, or an error code on failure - **/ -int __must_check validate_delta_index(const struct delta_index *delta_index); - -/** - * Prepare to search for an entry in the specified delta list. - * - *

This is always the first routine to be called when dealing with delta - * index entries. It is always followed by calls to next_delta_index_entry to - * iterate through a delta list. The fields of the delta_index_entry argument - * will be set up for iteration, but will not contain an entry from the list. - * - * @param delta_index The delta index to search - * @param list_number The delta list number - * @param key First delta list key that the caller is interested in - * @param read_only True if this is a read-only operation - * @param iterator The index entry being used to search through the list - * - * @return UDS_SUCCESS on success, or an error code on failure - **/ -int __must_check start_delta_index_search(const struct delta_index *delta_index, - unsigned int list_number, - unsigned int key, - bool read_only, - struct delta_index_entry *iterator); - -/** - * Find the next entry in the specified delta list - * - * @param delta_entry Info about an entry, which is updated to describe the - * following entry - * - * @return UDS_SUCCESS on success, or an error code on failure - **/ -int __must_check next_delta_index_entry(struct delta_index_entry *delta_entry); - -/** - * Remember the position of a delta index entry, so that we can use it when - * starting the next search. - * - * @param delta_entry Info about an entry found during a search. This should - * be the first entry that matches the key exactly (i.e. - * not a collision entry), or the first entry with a key - * greater than the entry sought for. - * - * @return UDS_SUCCESS on success, or an error code on failure - **/ -int __must_check -remember_delta_index_offset(const struct delta_index_entry *delta_entry); - -/** - * Find the delta index entry, or the insertion point for a delta index - * entry. - * - * @param delta_index The delta index to search - * @param list_number The delta list number - * @param key The key field being looked for - * @param name The 256 bit full name - * @param read_only True if this is a read-only index search - * @param delta_entry Updated to describe the entry being looked for - * - * @return UDS_SUCCESS or an error code - **/ -int __must_check get_delta_index_entry(const struct delta_index *delta_index, - unsigned int list_number, - unsigned int key, - const byte *name, - bool read_only, - struct delta_index_entry *delta_entry); - -/** - * Get the full name from a collision delta_index_entry - * - * @param delta_entry The delta index record - * @param name The 256 bit full name - * - * @return UDS_SUCCESS or an error code - **/ -int __must_check -get_delta_entry_collision(const struct delta_index_entry *delta_entry, - byte *name); - -/** - * Get the bit offset into delta memory of a delta index entry. - * - * @param delta_entry The delta index entry - * - * @return the bit offset into delta memory - **/ -static INLINE uint64_t -get_delta_entry_offset(const struct delta_index_entry *delta_entry) -{ - return get_delta_list_start(delta_entry->delta_list) + - delta_entry->offset; -} - -/** - * Get the number of bits used to encode the entry key (the delta). - * - * @param entry The delta index record - * - * @return the number of bits used to encode the key - **/ -static INLINE unsigned int -get_delta_entry_key_bits(const struct delta_index_entry *entry) -{ - /* - * Derive keyBits by subtracting the sizes of the other two fields from - * the total. We don't actually use this for encoding/decoding, so it - * doesn't need to be super-fast. We save time where it matters by not - * storing it. 
- */ - return (entry->entry_bits - entry->value_bits - - (entry->is_collision ? COLLISION_BITS : 0)); -} - -/** - * Get the value field of the delta_index_entry - * - * @param delta_entry The delta index record - * - * @return the value - **/ -static INLINE unsigned int -get_delta_entry_value(const struct delta_index_entry *delta_entry) -{ - return get_field(delta_entry->delta_zone->memory, - get_delta_entry_offset(delta_entry), - delta_entry->value_bits); -} - -/** - * Set the value field of the delta_index_entry - * - * @param delta_entry The delta index record - * @param value The new value - * - * @return UDS_SUCCESS or an error code - **/ -int __must_check -set_delta_entry_value(const struct delta_index_entry *delta_entry, - unsigned int value); - -/** - * Create a new entry in the delta index - * - * @param delta_entry The delta index entry that indicates the insertion point - * for the new record. For a collision entry, this is the - * non-collision entry that the new entry collides with. - * For a non-collision entry, this new entry is inserted - * before the specified entry. - * @param key The key field - * @param value The value field - * @param name For collision entries, the 256 bit full name; - * Otherwise null - * - * @return UDS_SUCCESS or an error code - **/ -int __must_check put_delta_index_entry(struct delta_index_entry *delta_entry, - unsigned int key, - unsigned int value, - const byte *name); - -/** - * Remove an existing delta index entry, and advance to the next entry in - * the delta list. - * - * @param delta_entry On call the delta index record to remove. After - * returning, the following entry in the delta list. - * - * @return UDS_SUCCESS or an error code - **/ -int __must_check -remove_delta_index_entry(struct delta_index_entry *delta_entry); - -/** - * Map a delta list number to a delta zone number - * - * @param delta_index The delta index - * @param list_number The delta list number - * - * @return the zone number containing the delta list - **/ -static INLINE unsigned int -get_delta_index_zone(const struct delta_index *delta_index, - unsigned int list_number) -{ - return list_number / delta_index->lists_per_zone; -} - -/** - * Get the first delta list number in a zone - * - * @param delta_index The delta index - * @param zone_number The zone number - * - * @return the first delta list index in the zone - **/ -unsigned int -get_delta_index_zone_first_list(const struct delta_index *delta_index, - unsigned int zone_number); - -/** - * Get the number of delta lists in a zone - * - * @param delta_index The delta index - * @param zone_number The zone number - * - * @return the number of delta lists in the zone - **/ -unsigned int -get_delta_index_zone_num_lists(const struct delta_index *delta_index, - unsigned int zone_number); - -/** - * Get the number of bytes used for volume index entries in a zone - * - * @param delta_index The delta index - * @param zone_number The zone number - * - * @return The number of bits in use - **/ -uint64_t __must_check -get_delta_index_zone_dlist_bits_used(const struct delta_index *delta_index, - unsigned int zone_number); - -/** - * Get the number of bytes used for volume index entries. - * - * @param delta_index The delta index - * - * @return The number of bits in use - **/ -uint64_t __must_check -get_delta_index_dlist_bits_used(const struct delta_index *delta_index); - -/** - * Get the number of bytes allocated for volume index entries. 
- * - * @param delta_index The delta index - * - * @return The number of bits allocated - **/ -uint64_t __must_check -get_delta_index_dlist_bits_allocated(const struct delta_index *delta_index); - -/** - * Get the delta index statistics. - * - * @param delta_index The delta index - * @param stats The statistics - **/ -void get_delta_index_stats(const struct delta_index *delta_index, - struct delta_index_stats *stats); - -/** - * Get the number of pages needed for an immutable delta index. - * - * @param num_entries The number of entries in the index - * @param num_lists The number of delta lists - * @param mean_delta The mean delta value - * @param num_payload_bits The number of bits in the payload or value - * @param bytes_per_page The number of bytes in a page - * - * @return the number of pages needed for the index - **/ -unsigned int get_delta_index_page_count(unsigned int num_entries, - unsigned int num_lists, - unsigned int mean_delta, - unsigned int num_payload_bits, - size_t bytes_per_page); - -/** - * Log a delta index entry, and any error conditions related to the entry. - * - * @param delta_entry The delta index entry. - **/ -void log_delta_index_entry(struct delta_index_entry *delta_entry); - -#endif /* DELTAINDEX_H */ diff --git a/uds/deltaMemory.c b/uds/deltaMemory.c deleted file mode 100644 index 294382e0..00000000 --- a/uds/deltaMemory.c +++ /dev/null @@ -1,781 +0,0 @@ -/* - * Copyright Red Hat - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/uds-releases/krusty/src/uds/deltaMemory.c#25 $ - */ -#include "deltaMemory.h" - -#include "bits.h" -#include "buffer.h" -#include "compiler.h" -#include "errors.h" -#include "hashUtils.h" -#include "logger.h" -#include "memoryAlloc.h" -#include "permassert.h" -#include "timeUtils.h" -#include "typeDefs.h" -#include "uds.h" - -/* - * The delta_memory structure manages the memory that stores delta lists. - * - * The "mutable" form of delta_memory is used for the volume index and for - * an open chapter index. The "immutable" form of delta_memory is used for - * regular chapter indices. 
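 *
 * (Editorial aside, not part of the original source: the mutable form is
 * built by initialize_delta_memory(), which allocates its own delta list
 * headers, transfer flags, and temporary offsets; the immutable form is set
 * up by initialize_delta_memory_page(), which merely wraps an existing
 * cached page and so carries no delta list headers of its own.)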
- */ - -// This is the number of guard bits that are needed in the tail guard list -enum { GUARD_BITS = POST_FIELD_GUARD_BYTES * CHAR_BIT }; - -/** - * Get the offset of the first byte that a delta list bit stream resides in - * - * @param delta_list The delta list - * - * @return the number byte offset - **/ -static INLINE uint64_t -get_delta_list_byte_start(const struct delta_list *delta_list) -{ - return get_delta_list_start(delta_list) / CHAR_BIT; -} - -/** - * Get the actual number of bytes that a delta list bit stream resides in - * - * @param delta_list The delta list - * - * @return the number of bytes - **/ -static INLINE uint16_t -get_delta_list_byte_size(const struct delta_list *delta_list) -{ - uint16_t start_bit_offset = - get_delta_list_start(delta_list) % CHAR_BIT; - uint16_t bit_size = get_delta_list_size(delta_list); - return ((unsigned int) start_bit_offset + bit_size + CHAR_BIT - 1) / - CHAR_BIT; -} - -/** - * Get the number of bytes in the delta lists headers. - * - * @param num_lists The number of delta lists - * - * @return the number of bytes in the delta lists headers - **/ -static INLINE size_t get_size_of_delta_lists(unsigned int num_lists) -{ - return (num_lists + 2) * sizeof(struct delta_list); -} - -/** - * Get the size of the flags array (in bytes) - * - * @param num_lists The number of delta lists - * - * @return the number of bytes for an array that has one bit per delta - * list, plus the necessary guard bytes. - **/ -static INLINE size_t get_size_of_flags(unsigned int num_lists) -{ - return (num_lists + CHAR_BIT - 1) / CHAR_BIT + POST_FIELD_GUARD_BYTES; -} - -/** - * Get the number of bytes of scratch memory for the delta lists. - * - * @param num_lists The number of delta lists - * - * @return the number of bytes of scratch memory for the delta lists - **/ -static INLINE size_t get_size_of_temp_offsets(unsigned int num_lists) -{ - return (num_lists + 2) * sizeof(uint64_t); -} - -/**********************************************************************/ - -/** - * Clear the transfers flags. - * - * @param delta_memory The delta memory - **/ -static void clear_transfer_flags(struct delta_memory *delta_memory) -{ - memset(delta_memory->flags, - 0, - get_size_of_flags(delta_memory->num_lists)); - delta_memory->num_transfers = 0; - delta_memory->transfer_status = UDS_SUCCESS; -} - -/**********************************************************************/ - -/** - * Set the transfer flags for delta lists that are not empty, and count how - * many there are. - * - * @param delta_memory The delta memory - **/ -static void flag_non_empty_delta_lists(struct delta_memory *delta_memory) -{ - unsigned int i; - clear_transfer_flags(delta_memory); - for (i = 0; i < delta_memory->num_lists; i++) { - if (get_delta_list_size(&delta_memory->delta_lists[i + 1]) > 0) { - set_one(delta_memory->flags, i, 1); - delta_memory->num_transfers++; - } - } -} - -/**********************************************************************/ -void empty_delta_lists(struct delta_memory *delta_memory) -{ - uint64_t num_bits, spacing, offset; - unsigned int i; - // Zero all the delta list headers - struct delta_list *delta_lists = delta_memory->delta_lists; - memset(delta_lists, 0, - get_size_of_delta_lists(delta_memory->num_lists)); - - /* - * Initialize delta lists to be empty. We keep 2 extra delta list - * descriptors, one before the first real entry and one after so that - * we don't need to bounds check the array access when calculating - * preceeding and following gap sizes. 
- * - * Because the delta list headers were zeroed, the head guard list is - * already at offset zero and size zero. - * - * The end guard list contains guard bytes so that the bit field - * utilities can safely read past the end of any byte we are interested - * in. - */ - num_bits = (uint64_t) delta_memory->size * CHAR_BIT; - delta_lists[delta_memory->num_lists + 1].start_offset = - num_bits - GUARD_BITS; - delta_lists[delta_memory->num_lists + 1].size = GUARD_BITS; - - // Set all the bits in the end guard list. Do not use the bit field - // utilities. - memset(delta_memory->memory + delta_memory->size - - POST_FIELD_GUARD_BYTES, ~0, POST_FIELD_GUARD_BYTES); - - // Evenly space out the real delta lists. The sizes are already zero, - // so we just need to set the starting offsets. - spacing = (num_bits - GUARD_BITS) / delta_memory->num_lists; - offset = spacing / 2; - for (i = 1; i <= delta_memory->num_lists; i++) { - delta_lists[i].start_offset = offset; - offset += spacing; - } - - // Update the statistics - delta_memory->discard_count += delta_memory->record_count; - delta_memory->record_count = 0; - delta_memory->collision_count = 0; -} - -/**********************************************************************/ -/** - * Compute the Huffman coding parameters for the given mean delta - * - * @param mean_delta The mean delta value - * @param min_bits The number of bits in the minimal key code - * @param min_keys The number of keys used in a minimal code - * @param incr_keys The number of keys used for another code bit - **/ -static void compute_coding_constants(unsigned int mean_delta, - unsigned short *min_bits, - unsigned int *min_keys, - unsigned int *incr_keys) -{ - // We want to compute the rounded value of log(2) * mean_delta. Since - // we cannot always use floating point, use a really good integer - // approximation. - *incr_keys = (836158UL * mean_delta + 603160UL) / 1206321UL; - *min_bits = compute_bits(*incr_keys + 1); - *min_keys = (1 << *min_bits) - *incr_keys; -} - -/**********************************************************************/ -/** - * Rebalance a range of delta lists within memory. - * - * @param delta_memory A delta memory structure - * @param first The first delta list index - * @param last The last delta list index - **/ -static void rebalance_delta_memory(const struct delta_memory *delta_memory, - unsigned int first, - unsigned int last) -{ - if (first == last) { - struct delta_list *delta_list = - &delta_memory->delta_lists[first]; - uint64_t new_start = delta_memory->temp_offsets[first]; - // We need to move only one list, and we know it is safe to do - // so - if (get_delta_list_start(delta_list) != new_start) { - uint64_t destination, source; - // Compute the first source byte - source = get_delta_list_byte_start(delta_list); - // Update the delta list location - delta_list->start_offset = new_start; - // Now use the same computation to locate the first - // destination byte - destination = get_delta_list_byte_start(delta_list); - memmove(delta_memory->memory + destination, - delta_memory->memory + source, - get_delta_list_byte_size(delta_list)); - } - } else { - // There is more than one list. Divide the problem in half, - // and use recursive calls to process each half. Note that - // after this computation, first <= middle, and middle < last. 
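// Illustrative note, not part of the original source: the order of the two
// recursive calls below follows the direction the middle list is moving, so
// each list is copied into space that has already been vacated. For example,
// if lists 1..4 must all shift toward higher offsets, the upper half
// (lists 3..4) is rebalanced before the lower half (lists 1..2); movement
// toward lower offsets reverses that order.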
- unsigned int middle = (first + last) / 2; - const struct delta_list *delta_list = - &delta_memory->delta_lists[middle]; - uint64_t new_start = delta_memory->temp_offsets[middle]; - // The direction that our middle list is moving determines - // which half of the problem must be processed first. - if (new_start > get_delta_list_start(delta_list)) { - rebalance_delta_memory(delta_memory, middle + 1, last); - rebalance_delta_memory(delta_memory, first, middle); - } else { - rebalance_delta_memory(delta_memory, first, middle); - rebalance_delta_memory(delta_memory, middle + 1, last); - } - } -} - -/**********************************************************************/ -int initialize_delta_memory(struct delta_memory *delta_memory, - size_t size, - unsigned int first_list, - unsigned int num_lists, - unsigned int mean_delta, - unsigned int num_payload_bits) -{ - byte *memory = NULL, *flags = NULL; - uint64_t *temp_offsets = NULL; - int result; - if (num_lists == 0) { - return uds_log_warning_strerror(UDS_INVALID_ARGUMENT, - "cannot initialize delta memory with 0 delta lists"); - } - result = UDS_ALLOCATE(size, byte, "delta list", &memory); - if (result != UDS_SUCCESS) { - return result; - } - result = UDS_ALLOCATE(num_lists + 2, uint64_t, "delta list temp", - &temp_offsets); - if (result != UDS_SUCCESS) { - UDS_FREE(memory); - return result; - } - result = UDS_ALLOCATE(get_size_of_flags(num_lists), byte, - "delta list flags", &flags); - if (result != UDS_SUCCESS) { - UDS_FREE(memory); - UDS_FREE(temp_offsets); - return result; - } - - compute_coding_constants(mean_delta, - &delta_memory->min_bits, - &delta_memory->min_keys, - &delta_memory->incr_keys); - delta_memory->value_bits = num_payload_bits; - delta_memory->memory = memory; - delta_memory->delta_lists = NULL; - delta_memory->temp_offsets = temp_offsets; - delta_memory->flags = flags; - delta_memory->buffered_writer = NULL; - delta_memory->size = size; - delta_memory->rebalance_time = 0; - delta_memory->rebalance_count = 0; - delta_memory->record_count = 0; - delta_memory->collision_count = 0; - delta_memory->discard_count = 0; - delta_memory->overflow_count = 0; - delta_memory->first_list = first_list; - delta_memory->num_lists = num_lists; - delta_memory->num_transfers = 0; - delta_memory->transfer_status = UDS_SUCCESS; - delta_memory->tag = 'm'; - - // Allocate the delta lists. 
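// Editorial note, not part of the original source: the "+ 2" in the
// allocation below makes room for the head and tail guard lists described in
// empty_delta_lists(), which bracket the real lists so that gap calculations
// never need to bounds-check the array.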
- result = UDS_ALLOCATE(delta_memory->num_lists + 2, struct delta_list, - "delta lists", &delta_memory->delta_lists); - if (result != UDS_SUCCESS) { - uninitialize_delta_memory(delta_memory); - return result; - } - - empty_delta_lists(delta_memory); - return UDS_SUCCESS; -} - -/**********************************************************************/ -void uninitialize_delta_memory(struct delta_memory *delta_memory) -{ - UDS_FREE(delta_memory->flags); - delta_memory->flags = NULL; - UDS_FREE(delta_memory->temp_offsets); - delta_memory->temp_offsets = NULL; - UDS_FREE(delta_memory->delta_lists); - delta_memory->delta_lists = NULL; - UDS_FREE(delta_memory->memory); - delta_memory->memory = NULL; -} - -/**********************************************************************/ -void initialize_delta_memory_page(struct delta_memory *delta_memory, - byte *memory, - size_t size, - unsigned int num_lists, - unsigned int mean_delta, - unsigned int num_payload_bits) -{ - compute_coding_constants(mean_delta, - &delta_memory->min_bits, - &delta_memory->min_keys, - &delta_memory->incr_keys); - delta_memory->value_bits = num_payload_bits; - delta_memory->memory = memory; - delta_memory->delta_lists = NULL; - delta_memory->temp_offsets = NULL; - delta_memory->flags = NULL; - delta_memory->buffered_writer = NULL; - delta_memory->size = size; - delta_memory->rebalance_time = 0; - delta_memory->rebalance_count = 0; - delta_memory->record_count = 0; - delta_memory->collision_count = 0; - delta_memory->discard_count = 0; - delta_memory->overflow_count = 0; - delta_memory->first_list = 0; - delta_memory->num_lists = num_lists; - delta_memory->num_transfers = 0; - delta_memory->transfer_status = UDS_SUCCESS; - delta_memory->tag = 'p'; -} - -/**********************************************************************/ -bool are_delta_memory_transfers_done(const struct delta_memory *delta_memory) -{ - return delta_memory->num_transfers == 0; -} - -/**********************************************************************/ -int start_restoring_delta_memory(struct delta_memory *delta_memory) -{ - struct delta_list *delta_list; - // Extend and balance memory to receive the delta lists - int result = extend_delta_memory(delta_memory, 0, 0, false); - if (result != UDS_SUCCESS) { - return UDS_SUCCESS; - } - - // The tail guard list needs to be set to ones - delta_list = &delta_memory->delta_lists[delta_memory->num_lists + 1]; - set_one(delta_memory->memory, - get_delta_list_start(delta_list), - get_delta_list_size(delta_list)); - - flag_non_empty_delta_lists(delta_memory); - return UDS_SUCCESS; -} - -/**********************************************************************/ -static int __must_check -read_delta_list_save_info(struct buffered_reader *reader, - struct delta_list_save_info *dlsi) -{ - byte buffer[sizeof(struct delta_list_save_info)]; - int result = read_from_buffered_reader(reader, buffer, sizeof(buffer)); - if (result != UDS_SUCCESS) { - return result; - } - dlsi->tag = buffer[0]; - dlsi->bit_offset = buffer[1]; - dlsi->byte_count = get_unaligned_le16(&buffer[2]); - dlsi->index = get_unaligned_le32(&buffer[4]); - return result; -} - -/**********************************************************************/ -int read_saved_delta_list(struct delta_list_save_info *dlsi, - byte data[DELTA_LIST_MAX_BYTE_COUNT], - struct buffered_reader *buffered_reader) -{ - int result = read_delta_list_save_info(buffered_reader, dlsi); - if (result == UDS_END_OF_FILE) { - return UDS_END_OF_FILE; - } - if (result != UDS_SUCCESS) { - return 
uds_log_warning_strerror(result, - "failed to read delta list data"); - } - if ((dlsi->bit_offset >= CHAR_BIT) || - (dlsi->byte_count > DELTA_LIST_MAX_BYTE_COUNT)) { - return uds_log_warning_strerror(UDS_CORRUPT_COMPONENT, - "corrupt delta list data"); - } - if (dlsi->tag == 'z') { - return UDS_END_OF_FILE; - } - result = read_from_buffered_reader(buffered_reader, data, - dlsi->byte_count); - if (result != UDS_SUCCESS) { - return uds_log_warning_strerror(result, - "failed to read delta list data"); - } - return UDS_SUCCESS; -} - -/**********************************************************************/ -int restore_delta_list(struct delta_memory *delta_memory, - const struct delta_list_save_info *dlsi, - const byte data[DELTA_LIST_MAX_BYTE_COUNT]) -{ - struct delta_list *delta_list; - uint16_t bit_size; - unsigned int byte_count; - unsigned int list_number = dlsi->index - delta_memory->first_list; - if (list_number >= delta_memory->num_lists) { - return uds_log_warning_strerror(UDS_CORRUPT_COMPONENT, - "invalid delta list number %u not in range [%u,%u)", - dlsi->index, - delta_memory->first_list, - delta_memory->first_list + - delta_memory->num_lists); - } - - if (get_field(delta_memory->flags, list_number, 1) == 0) { - return uds_log_warning_strerror(UDS_CORRUPT_COMPONENT, - "unexpected delta list number %u", - dlsi->index); - } - - delta_list = &delta_memory->delta_lists[list_number + 1]; - bit_size = get_delta_list_size(delta_list); - byte_count = ((unsigned int) dlsi->bit_offset + bit_size + CHAR_BIT - 1) / - CHAR_BIT; - if (dlsi->byte_count != byte_count) { - return uds_log_warning_strerror(UDS_CORRUPT_COMPONENT, - "unexpected delta list size %u != %u", - dlsi->byte_count, - byte_count); - } - - move_bits(data, - dlsi->bit_offset, - delta_memory->memory, - get_delta_list_start(delta_list), - bit_size); - set_zero(delta_memory->flags, list_number, 1); - delta_memory->num_transfers--; - return UDS_SUCCESS; -} - -/**********************************************************************/ -void abort_restoring_delta_memory(struct delta_memory *delta_memory) -{ - clear_transfer_flags(delta_memory); - empty_delta_lists(delta_memory); -} - -/**********************************************************************/ -void start_saving_delta_memory(struct delta_memory *delta_memory, - struct buffered_writer *buffered_writer) -{ - flag_non_empty_delta_lists(delta_memory); - delta_memory->buffered_writer = buffered_writer; -} - -/**********************************************************************/ -int finish_saving_delta_memory(struct delta_memory *delta_memory) -{ - unsigned int i; - for (i = 0; - !are_delta_memory_transfers_done(delta_memory) && - (i < delta_memory->num_lists); - - i++) { - lazy_flush_delta_list(delta_memory, i); - } - if (delta_memory->num_transfers > 0) { - delta_memory->transfer_status = - uds_log_warning_strerror(UDS_CORRUPT_DATA, - "Not all delta lists written"); - } - delta_memory->buffered_writer = NULL; - return delta_memory->transfer_status; -} - -/**********************************************************************/ -void abort_saving_delta_memory(struct delta_memory *delta_memory) -{ - clear_transfer_flags(delta_memory); - delta_memory->buffered_writer = NULL; -} - -/**********************************************************************/ -static int __must_check -write_delta_list_save_info(struct buffered_writer *buffered_writer, - struct delta_list_save_info *dlsi) -{ - byte buffer[sizeof(struct delta_list_save_info)]; - buffer[0] = dlsi->tag; - buffer[1] = 
dlsi->bit_offset; - put_unaligned_le16(dlsi->byte_count, &buffer[2]); - put_unaligned_le32(dlsi->index, &buffer[4]); - return write_to_buffered_writer(buffered_writer, buffer, - sizeof(buffer)); -} - -/**********************************************************************/ -void flush_delta_list(struct delta_memory *delta_memory, - unsigned int flush_index) -{ - struct delta_list *delta_list; - struct delta_list_save_info dlsi; - int result; - ASSERT_LOG_ONLY((get_field(delta_memory->flags, flush_index, 1) != 0), - "flush bit is set"); - set_zero(delta_memory->flags, flush_index, 1); - delta_memory->num_transfers--; - - delta_list = &delta_memory->delta_lists[flush_index + 1]; - dlsi.tag = delta_memory->tag; - dlsi.bit_offset = get_delta_list_start(delta_list) % CHAR_BIT; - dlsi.byte_count = get_delta_list_byte_size(delta_list); - dlsi.index = delta_memory->first_list + flush_index; - - result = write_delta_list_save_info(delta_memory->buffered_writer, - &dlsi); - if (result != UDS_SUCCESS) { - if (delta_memory->transfer_status == UDS_SUCCESS) { - uds_log_warning_strerror(result, - "failed to write delta list memory"); - delta_memory->transfer_status = result; - } - } - result = write_to_buffered_writer(delta_memory->buffered_writer, - delta_memory->memory + get_delta_list_byte_start(delta_list), - dlsi.byte_count); - if (result != UDS_SUCCESS) { - if (delta_memory->transfer_status == UDS_SUCCESS) { - uds_log_warning_strerror(result, - "failed to write delta list memory"); - delta_memory->transfer_status = result; - } - } -} - -/**********************************************************************/ -int write_guard_delta_list(struct buffered_writer *buffered_writer) -{ - int result; - struct delta_list_save_info dlsi; - dlsi.tag = 'z'; - dlsi.bit_offset = 0; - dlsi.byte_count = 0; - dlsi.index = 0; - result = write_to_buffered_writer(buffered_writer, - (const byte *) &dlsi, - sizeof(struct delta_list_save_info)); - if (result != UDS_SUCCESS) { - uds_log_warning_strerror(result, - "failed to write guard delta list"); - } - return result; -} - -/**********************************************************************/ -int extend_delta_memory(struct delta_memory *delta_memory, - unsigned int growing_index, - size_t growing_size, - bool do_copy) -{ - ktime_t start_time; - struct delta_list *delta_lists; - unsigned int i; - size_t used_space, spacing; - if (!is_mutable(delta_memory)) { - return uds_log_error_strerror(UDS_BAD_STATE, - "Attempt to read into an immutable delta list memory"); - } - - start_time = current_time_ns(CLOCK_MONOTONIC); - - // Calculate the amount of space that is in use. Include the space - // that has a planned use. 
- delta_lists = delta_memory->delta_lists; - used_space = growing_size; - for (i = 0; i <= delta_memory->num_lists + 1; i++) { - used_space += get_delta_list_byte_size(&delta_lists[i]); - } - - if (delta_memory->size < used_space) { - return UDS_OVERFLOW; - } - - // Compute the new offsets of the delta lists - spacing = (delta_memory->size - used_space) / delta_memory->num_lists; - delta_memory->temp_offsets[0] = 0; - for (i = 0; i <= delta_memory->num_lists; i++) { - delta_memory->temp_offsets[i + 1] = - (delta_memory->temp_offsets[i] + - get_delta_list_byte_size(&delta_lists[i]) + spacing); - delta_memory->temp_offsets[i] *= CHAR_BIT; - delta_memory->temp_offsets[i] += - get_delta_list_start(&delta_lists[i]) % CHAR_BIT; - if (i == 0) { - delta_memory->temp_offsets[i + 1] -= spacing / 2; - } - if (i + 1 == growing_index) { - delta_memory->temp_offsets[i + 1] += growing_size; - } - } - delta_memory->temp_offsets[delta_memory->num_lists + 1] = - (delta_memory->size * CHAR_BIT - - get_delta_list_size(&delta_lists[delta_memory->num_lists + 1])); - // When we rebalance the delta list, we will include the end guard list - // in the rebalancing. It contains the end guard data, which must be - // copied. - if (do_copy) { - ktime_t end_time; - rebalance_delta_memory(delta_memory, 1, - delta_memory->num_lists + 1); - end_time = current_time_ns(CLOCK_MONOTONIC); - delta_memory->rebalance_count++; - delta_memory->rebalance_time += - ktime_sub(end_time, start_time); - } else { - for (i = 1; i <= delta_memory->num_lists + 1; i++) { - delta_lists[i].start_offset = - delta_memory->temp_offsets[i]; - } - } - return UDS_SUCCESS; -} - -/**********************************************************************/ -int validate_delta_lists(const struct delta_memory *delta_memory) -{ - uint64_t num_bits; - int num_guard_bits; - unsigned int i; - struct delta_list *delta_lists = delta_memory->delta_lists; - // Validate the delta index fields set by restoring a delta index - if (delta_memory->collision_count > delta_memory->record_count) { - return uds_log_warning_strerror(UDS_BAD_STATE, - "delta index contains more collisions (%ld) than records (%ld)", - delta_memory->collision_count, - delta_memory->record_count); - } - - // Validate the delta lists - if (get_delta_list_start(&delta_lists[0]) != 0) { - return uds_log_warning_strerror(UDS_BAD_STATE, - "the head guard delta list does not start at 0: %llu", - (unsigned long long) get_delta_list_start(&delta_lists[0])); - } - num_bits = - get_delta_list_end(&delta_lists[delta_memory->num_lists + 1]); - if (num_bits != delta_memory->size * CHAR_BIT) { - return uds_log_warning_strerror(UDS_BAD_STATE, - "the tail guard delta list does not end at end of allocated memory: %llu != %llu", - (unsigned long long) num_bits, - (unsigned long long) delta_memory->size * CHAR_BIT); - } - num_guard_bits = - get_delta_list_size(&delta_lists[delta_memory->num_lists + 1]); - if (num_guard_bits < GUARD_BITS) { - return uds_log_warning_strerror(UDS_BAD_STATE, - "the tail guard delta list does not contain sufficient guard bits: %d < %d", - num_guard_bits, - GUARD_BITS); - } - for (i = 0; i <= delta_memory->num_lists + 1; i++) { - if (get_delta_list_start(&delta_lists[i]) > - get_delta_list_end(&delta_lists[i])) { - return uds_log_warning_strerror(UDS_BAD_STATE, - "invalid delta list %u: (%llu, %llu)", - i, - (unsigned long long) get_delta_list_start(&delta_lists[i]), - (unsigned long long) get_delta_list_end(&delta_lists[i])); - } - if (i > delta_memory->num_lists) { - // The rest of the 
checks do not apply to the tail guard - // list - continue; - } - if (get_delta_list_end(&delta_lists[i]) > - get_delta_list_start(&delta_lists[i + 1])) { - return uds_log_warning_strerror(UDS_BAD_STATE, - "delta lists %u and %u overlap: %llu > %llu", - i, i + 1, - (unsigned long long) get_delta_list_end(&delta_lists[i]), - (unsigned long long) get_delta_list_start(&delta_lists[i + 1])); - } - if (i == 0) { - // The rest of the checks do not apply to the head guard - // list - continue; - } - if (delta_lists[i].save_offset > - get_delta_list_size(&delta_lists[i])) { - return uds_log_warning_strerror(UDS_BAD_STATE, - "delta lists %u saved offset is larger than the list: %u > %u", - i, - delta_lists[i].save_offset, - get_delta_list_size(&delta_lists[i])); - } - } - - return UDS_SUCCESS; -} - -/**********************************************************************/ -size_t get_delta_memory_allocated(const struct delta_memory *delta_memory) -{ - return (delta_memory->size + - get_size_of_delta_lists(delta_memory->num_lists) + - get_size_of_flags(delta_memory->num_lists) + - get_size_of_temp_offsets(delta_memory->num_lists)); -} - -/**********************************************************************/ -size_t get_delta_memory_size(unsigned long num_entries, - unsigned int mean_delta, - unsigned int num_payload_bits) -{ - unsigned short min_bits; - unsigned int incr_keys, min_keys; - compute_coding_constants(mean_delta, &min_bits, &min_keys, &incr_keys); - // On average, each delta is encoded into about min_bits+1.5 bits. - return (num_entries * (num_payload_bits + min_bits + 1) + - num_entries / 2); -} diff --git a/uds/deltaMemory.h b/uds/deltaMemory.h deleted file mode 100644 index 72af7147..00000000 --- a/uds/deltaMemory.h +++ /dev/null @@ -1,398 +0,0 @@ -/* - * Copyright Red Hat - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/uds-releases/krusty/src/uds/deltaMemory.h#13 $ - */ - -#ifndef DELTAMEMORY_H -#define DELTAMEMORY_H 1 - -#include "bits.h" -#include "bufferedReader.h" -#include "bufferedWriter.h" -#include "compiler.h" -#include "cpu.h" -#include "timeUtils.h" - -/* - * We encode the delta list information into 16 bytes per list. - * - * Because the volume index has 1 million delta lists, each byte of header - * information ends up costing us 1MB. We have an incentive to keep the - * size down. - * - * The volume index delta list memory is currently about 780MB in size, - * which is more than 6 gigabits. Therefore we need at least 33 bits to - * address the volume index memory and we use the uint64_t type. - * - * The volume index delta lists have 256 entries of about 24 bits each, - * which is 6K bits. The index needs 13 bits to represent the size of a - * delta list and we use the uint16_t type. 
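 *
 * (Editorial aside, not part of the original source: at 16 bytes per header
 * and roughly one million delta lists, the headers alone occupy about 16 MB,
 * which is why the structure below is packed into a uint64_t offset, two
 * uint16_t fields, and a single unsigned int key.)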
- */ - -struct delta_list { - uint64_t start_offset; // The offset of the delta list start within - // memory - uint16_t size; // The number of bits in the delta list - uint16_t save_offset; // Where the last search "found" the key - unsigned int save_key; // The key for the record just before - // save_offset. -}; - -struct delta_memory { - byte *memory; // The delta list memory - struct delta_list *delta_lists; // The delta list headers - uint64_t *temp_offsets; // Temporary starts of delta - // lists - byte *flags; // Transfer flags - struct buffered_writer *buffered_writer; // Buffered writer for saving - // an index - size_t size; // The size of delta list - // memory - ktime_t rebalance_time; // Nanoseconds spent - // rebalancing - int rebalance_count; // Number of memory - // rebalances - unsigned short value_bits; // The number of bits of - // value - unsigned short min_bits; // The number of bits in the - // minimal key code - unsigned int min_keys; // The number of keys used in - // a minimal code - unsigned int incr_keys; // The number of keys used - // for another code bit - long record_count; // The number of records in - // the index - long collision_count; // The number of collision - // records - long discard_count; // The number of records - // removed - long overflow_count; // The number of - // UDS_OVERFLOWs detected - unsigned int first_list; // The index of the first - // delta list - unsigned int num_lists; // The number of delta lists - unsigned int num_transfers; // Number of transfer flags - // that are set - int transfer_status; // Status of the transfers in - // progress - byte tag; // Tag belonging to this - // delta index -} __attribute__((aligned(CACHE_LINE_BYTES))); - -struct delta_list_save_info { - uint8_t tag; // Tag identifying which delta index this list - // is in - uint8_t bit_offset; // Bit offset of the start of the list data - uint16_t byte_count; // Number of bytes of list data - uint32_t index; // The delta list number within the delta index -}; - -// The maximum size of a single delta list (in bytes). We add guard bytes -// to this because such a buffer can be used with move_bits. -enum { - DELTA_LIST_MAX_BYTE_COUNT = - ((UINT16_MAX + CHAR_BIT) / CHAR_BIT + POST_FIELD_GUARD_BYTES) -}; - -/** - * Initialize delta list memory. - * - * @param delta_memory A delta memory structure - * @param size The initial size of the memory array - * @param first_list The index of the first delta list - * @param num_lists The number of delta lists - * @param mean_delta The mean delta - * @param num_payload_bits The number of payload bits - * - * @return error code or UDS_SUCCESS - **/ -int __must_check initialize_delta_memory(struct delta_memory *delta_memory, - size_t size, - unsigned int first_list, - unsigned int num_lists, - unsigned int mean_delta, - unsigned int num_payload_bits); - -/** - * Uninitialize delta list memory. - * - * @param delta_memory A delta memory structure - **/ -void uninitialize_delta_memory(struct delta_memory *delta_memory); - -/** - * Initialize delta list memory to refer to a cached page. 
- * - * @param delta_memory A delta memory structure - * @param memory The memory page - * @param size The size of the memory page - * @param num_lists The number of delta lists - * @param mean_delta The mean delta - * @param num_payload_bits The number of payload bits - **/ -void initialize_delta_memory_page(struct delta_memory *delta_memory, - byte *memory, - size_t size, - unsigned int num_lists, - unsigned int mean_delta, - unsigned int num_payload_bits); - -/** - * Empty the delta lists. - * - * @param delta_memory The delta memory - **/ -void empty_delta_lists(struct delta_memory *delta_memory); - -/** - * Is there a delta list memory save or restore in progress? - * - * @param delta_memory A delta memory structure - * - * @return true if there are no delta lists that need to be saved or - * restored - **/ -bool are_delta_memory_transfers_done(const struct delta_memory *delta_memory); - -/** - * Start restoring delta list memory from a file descriptor - * - * @param delta_memory A delta memory structure - * - * @return error code or UDS_SUCCESS - **/ -int __must_check -start_restoring_delta_memory(struct delta_memory *delta_memory); - -/** - * Read a saved delta list from a file descriptor - * - * @param dlsi The delta_list_save_info describing the delta list - * @param data The saved delta list bit stream - * @param buffered_reader The buffered reader to read the delta list from - * - * @return error code or UDS_SUCCESS - * or UDS_END_OF_FILE at end of the data stream - **/ -int __must_check read_saved_delta_list(struct delta_list_save_info *dlsi, - byte data[DELTA_LIST_MAX_BYTE_COUNT], - struct buffered_reader *buffered_reader); - -/** - * Restore a saved delta list - * - * @param delta_memory A delta memory structure - * @param dlsi The delta_list_save_info describing the delta list - * @param data The saved delta list bit stream - * - * @return error code or UDS_SUCCESS - **/ -int __must_check restore_delta_list(struct delta_memory *delta_memory, - const struct delta_list_save_info *dlsi, - const byte data[DELTA_LIST_MAX_BYTE_COUNT]); - -/** - * Abort restoring delta list memory from an input stream. - * - * @param delta_memory A delta memory structure - **/ -void abort_restoring_delta_memory(struct delta_memory *delta_memory); - -/** - * Start saving delta list memory to a buffered output stream - * - * @param delta_memory A delta memory structure - * @param buffered_writer The index state component being written - **/ -void start_saving_delta_memory(struct delta_memory *delta_memory, - struct buffered_writer *buffered_writer); - -/** - * Finish saving delta list memory to an output stream. Force the writing - * of all of the remaining data. If an error occurred asynchronously - * during the save operation, it will be returned here. - * - * @param delta_memory A delta memory structure - * - * @return error code or UDS_SUCCESS - **/ -int __must_check finish_saving_delta_memory(struct delta_memory *delta_memory); - -/** - * Abort saving delta list memory to an output stream. If an error - * occurred asynchronously during the save operation, it will be dropped. - * - * @param delta_memory A delta memory structure - **/ -void abort_saving_delta_memory(struct delta_memory *delta_memory); - -/** - * Flush a delta list to an output stream - * - * @param delta_memory A delta memory structure - * @param flush_index Index of the delta list that may need to be flushed. 
- **/ -void flush_delta_list(struct delta_memory *delta_memory, - unsigned int flush_index); - -/** - * Write a guard delta list to mark the end of the saved data - * - * @param buffered_writer The buffered writer to write the guard delta list to - * - * @return error code or UDS_SUCCESS - **/ -int __must_check -write_guard_delta_list(struct buffered_writer *buffered_writer); - -/** - * Extend the memory used by the delta lists and rebalance the lists in the - * new chunk. - * - *

The delta memory contains N delta lists, which are guarded by two - * empty delta lists. The valid delta lists are numbered 1 to N, and the - * guards are numbered 0 and (N+1). - * - *

When the delta lista are bit streams, it is possible that the tail - * of list J and the head of list (J+1) are in the same byte. In this case - * old_offsets[j]+sizes[j]==old_offset[j]-1. We handle this correctly. - * - * @param delta_memory A delta memory structure - * @param growing_index Index of the delta list that needs additional space - * left before it (from 1 to N+1). - * @param growing_size Number of additional bytes needed before growing_index - * @param do_copy True to copy the data, False to just balance the space - * - * @return UDS_SUCCESS or an error code - **/ -int __must_check extend_delta_memory(struct delta_memory *delta_memory, - unsigned int growing_index, - size_t growing_size, - bool do_copy); - -/** - * Validate the delta list headers. - * - * @param delta_memory A delta memory structure - * - * @return UDS_SUCCESS or an error code - **/ -int __must_check validate_delta_lists(const struct delta_memory *delta_memory); - -/** - * Get the number of bytes allocated for delta index entries and any - * associated overhead. - * - * @param delta_memory A delta memory structure - * - * @return The number of bytes allocated - **/ -size_t get_delta_memory_allocated(const struct delta_memory *delta_memory); - -/** - * Get the expected number of bits used in a delta index - * - * @param num_entries The number of index entries - * @param mean_delta The mean delta value - * @param num_payload_bits The number of bits in the payload or value - * - * @return The expected size of a delta index in bits - **/ -size_t __must_check get_delta_memory_size(unsigned long num_entries, - unsigned int mean_delta, - unsigned int num_payload_bits); - -/** - * Get the bit offset to the start of the delta list bit stream - * - * @param delta_list The delta list header - * - * @return the start of the delta list - **/ -static INLINE uint64_t -get_delta_list_start(const struct delta_list *delta_list) -{ - return delta_list->start_offset; -} - -/** - * Get the number of bits in a delta list bit stream - * - * @param delta_list The delta list header - * - * @return the size of the delta list - **/ -static INLINE uint16_t -get_delta_list_size(const struct delta_list *delta_list) -{ - return delta_list->size; -} - -/** - * Get the bit offset to the end of the delta list bit stream - * - * @param delta_list The delta list header - * - * @return the end of the delta list - **/ -static INLINE uint64_t get_delta_list_end(const struct delta_list *delta_list) -{ - return get_delta_list_start(delta_list) + - get_delta_list_size(delta_list); -} - -/** - * Identify mutable vs. immutable delta memory - * - * Mutable delta memory contains delta lists that can be modified, and is - * initialized using initialize_delta_memory(). - * - * Immutable delta memory contains packed delta lists, cannot be modified, - * and is initialized using initialize_delta_memory_page(). - * - * For mutable delta memory, all of the following expressions are true. - * And for immutable delta memory, all of the following expressions are - * false. - * delta_lists != NULL - * temp_offsets != NULL - * flags != NULL - * - * @param delta_memory A delta memory structure - * - * @return true if the delta memory is mutable - **/ -static INLINE bool is_mutable(const struct delta_memory *delta_memory) -{ - return delta_memory->delta_lists != NULL; -} - -/** - * Lazily flush a delta list to an output stream - * - * @param delta_memory A delta memory structure - * @param flush_index Index of the delta list that may need to be flushed. 
- **/ -static INLINE void lazy_flush_delta_list(struct delta_memory *delta_memory, - unsigned int flush_index) -{ - if (get_field(delta_memory->flags, flush_index, 1) != 0) { - flush_delta_list(delta_memory, flush_index); - } -} -#endif /* DELTAMEMORY_H */ diff --git a/uds/errors.h b/uds/errors.h deleted file mode 100644 index 719e80d8..00000000 --- a/uds/errors.h +++ /dev/null @@ -1,167 +0,0 @@ -/* - * Copyright Red Hat - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/uds-releases/krusty/src/uds/errors.h#16 $ - */ - -#ifndef ERRORS_H -#define ERRORS_H - -#include "compiler.h" -#include "typeDefs.h" - -/** - * Valid return status codes for internal UDS functions. - **/ -enum uds_status_codes { - /** Successful return */ - UDS_SUCCESS = 0, - - /** Used as a base value for reporting internal errors */ - UDS_ERROR_CODE_BASE = 1024, - /** Index overflow */ - UDS_OVERFLOW = UDS_ERROR_CODE_BASE + 0, - /** Invalid argument passed to internal routine */ - UDS_INVALID_ARGUMENT = UDS_ERROR_CODE_BASE + 1, - /** UDS data structures are in an invalid state */ - UDS_BAD_STATE = UDS_ERROR_CODE_BASE + 2, - /** Attempt to enter the same name into an internal structure twice */ - UDS_DUPLICATE_NAME = UDS_ERROR_CODE_BASE + 3, - /** An internal protocol violation between system components */ - UDS_UNEXPECTED_RESULT = UDS_ERROR_CODE_BASE + 4, - /** An assertion failed */ - UDS_ASSERTION_FAILED = UDS_ERROR_CODE_BASE + 5, - /** Not an actual error, but reporting that the result will be - * delayed */ - UDS_QUEUED = UDS_ERROR_CODE_BASE + 6, - /** A problem has occured with a buffer */ - UDS_BUFFER_ERROR = UDS_ERROR_CODE_BASE + 7, - /** No directory was found where one was expected */ - UDS_NO_DIRECTORY = UDS_ERROR_CODE_BASE + 8, - /** Checkpoint not completed */ - UDS_CHECKPOINT_INCOMPLETE = UDS_ERROR_CODE_BASE + 9, - /** This error range has already been registered */ - UDS_ALREADY_REGISTERED = UDS_ERROR_CODE_BASE + 10, - /** Either read-only or write-only */ - UDS_BAD_IO_DIRECTION = UDS_ERROR_CODE_BASE + 11, - /** Cannot do I/O at this offset */ - UDS_INCORRECT_ALIGNMENT = UDS_ERROR_CODE_BASE + 12, - /** Attempt to read or write data outside the bounds established for - * it */ - UDS_OUT_OF_RANGE = UDS_ERROR_CODE_BASE + 13, - /** Could not load scanner modules */ - UDS_EMODULE_LOAD = UDS_ERROR_CODE_BASE + 14, - /** The specified library context is disabled */ - UDS_DISABLED = UDS_ERROR_CODE_BASE + 15, - /** Some saved index component is corrupt */ - UDS_CORRUPT_COMPONENT = UDS_ERROR_CODE_BASE + 16, - UDS_CORRUPT_FILE = UDS_CORRUPT_COMPONENT, - /** Unknown error */ - UDS_UNKNOWN_ERROR = UDS_ERROR_CODE_BASE + 17, - /** The index configuration or volume format is no longer supported */ - UDS_UNSUPPORTED_VERSION = UDS_ERROR_CODE_BASE + 18, - /** Index data in memory is corrupt */ - UDS_CORRUPT_DATA = UDS_ERROR_CODE_BASE 
+ 19, - /** Short read due to truncated file */ - UDS_SHORT_READ = UDS_ERROR_CODE_BASE + 20, - /** Internal resource limits exceeded */ - UDS_RESOURCE_LIMIT_EXCEEDED = UDS_ERROR_CODE_BASE + 21, - /** Memory overflow due to storage failure */ - UDS_VOLUME_OVERFLOW = UDS_ERROR_CODE_BASE + 22, - /** Essential files for index not found */ - UDS_NO_INDEX = UDS_ERROR_CODE_BASE + 23, - /** Premature end of file in scanned file */ - UDS_END_OF_FILE = UDS_ERROR_CODE_BASE + 24, - /** Attempt to access unsaved index */ - UDS_INDEX_NOT_SAVED_CLEANLY = UDS_ERROR_CODE_BASE + 25, - /** One more than the last UDS_INTERNAL error code */ - UDS_ERROR_CODE_LAST, - /** One more than the last error this block will ever use */ - UDS_ERROR_CODE_BLOCK_END = UDS_ERROR_CODE_BASE + 440 -}; - -enum { - ERRBUF_SIZE = 128 // default size for buffer passed to string_error -}; - -const char *string_error(int errnum, char *buf, size_t buflen); -const char *string_error_name(int errnum, char *buf, size_t buflen); - -/* - * Identify that an result code is a successful result. - * - * @param result A result code - * - * @return true if the result represents a success. - */ -static INLINE bool __must_check is_successful(int result) -{ - return (result == UDS_SUCCESS) || (result == UDS_QUEUED); -} - -struct error_info { - const char *name; - const char *message; -}; - -/** - * Given an error code, return a value acceptable to the kernel. The input - * error code may be a system-generated value (such as -EIO), or an internal - * UDS status code; the result will be a negative errno value. - * - * @param error The error code to convert - * - * @return a system error code value - **/ -int uds_map_to_system_error(int error); - -/** - * Register an error code block for string_error and string_error_name. - * - * @param block_name the name of the block of error codes - * @param first_error the first error code in the block - * @param last_reserved_error one past the highest possible error in the block - * @param infos a pointer to the error info array for the block - * @param info_size the size of the error info array, which - * determines the last actual error for which - * information is available - * - * @return a success or error code, particularly UDS_DUPLICATE_NAME if the - * block name is already present, or UDS_ALREADY_REGISTERED if a - * block with the specified error code is present - **/ -int register_error_block(const char *block_name, - int first_error, - int last_reserved_error, - const struct error_info *infos, - size_t info_size); - -/** - * Return the first error between result1 and result2. - * - * @param result1 A success or error code. - * @param result2 A success or error code. - * - * @return result1 if that is an error, else result2 - **/ -static INLINE int first_error(int result1, int result2) -{ - return result1 == UDS_SUCCESS ? result2 : result1; -} - -#endif /* ERRORS_H */ diff --git a/uds/geometry.h b/uds/geometry.h deleted file mode 100644 index cf48b7fa..00000000 --- a/uds/geometry.h +++ /dev/null @@ -1,285 +0,0 @@ -/* - * Copyright Red Hat - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/uds-releases/krusty/src/uds/geometry.h#17 $ - */ - -#ifndef GEOMETRY_H -#define GEOMETRY_H 1 - -#include "compiler.h" -#include "typeDefs.h" -#include "uds.h" - -/** - * geometry defines constants and a record that parameterize the - * layout of a UDS index volume. - * - *

An index volume is divided into a fixed number of fixed-size - * chapters, each consisting of a fixed number of fixed-size - * pages. The volume layout is defined by two assumptions and four - * parameters. The assumptions (constants) are that index records are - * 32 bytes (16-byte block name plus 16-byte metadata) and that open - * chapter index hash slots are one byte long. The four parameters are - * the number of bytes in a page, the number of chapters in a volume, - * the number of record pages in a chapter, and the number of chapters - * that are sparse. From these parameters, we derive the rest of the - * layout and derived properties, ranging from the number of pages in - * a chapter to the number of records in the volume. - * - *

The index volume is sized by its memory footprint. For a dense - * index, the persistent storage is about 10 times the size of the - * memory footprint. For a sparse index, the persistent storage is - * about 100 times the size of the memory footprint. - * - *

For a small index with a memory footprint less than 1GB, there - * are three possible memory configurations: 0.25GB, 0.5GB and - * 0.75GB. The default geometry for each is 1024 index records per 32 - * KB page, 1024 chapters per volume, and either 64, 128, or 192 - * record pages per chapter and 6, 13, or 20 index pages per chapter - * depending on the memory configuration. For a 0.25 GB index as - * commonly used with small VDO volumes, this yields a deduplication - * window of 256 GB using about 2.5 GB for the persistent storage and - * 256 MB of RAM. - * - *
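As a worked example (a back-of-the-envelope sketch, not taken from the sources; the 4096-byte block size is an assumption based on typical VDO usage, the other constants come from the paragraph above), the 256 GB deduplication window for the 0.25 GB configuration follows directly from these numbers::

    #include <stdio.h>

    /*
     * Sketch of the 0.25 GB dense geometry described above: 1024 records
     * per page, 64 record pages per chapter, 1024 chapters, and an assumed
     * 4096-byte data block named by each record.
     */
    int main(void)
    {
            unsigned long long records_per_page = 1024;
            unsigned long long record_pages_per_chapter = 64;
            unsigned long long chapters_per_volume = 1024;
            unsigned long long block_size = 4096;
            unsigned long long records_per_volume = records_per_page *
                    record_pages_per_chapter * chapters_per_volume;
            unsigned long long window = records_per_volume * block_size;

            printf("records per volume: %llu\n", records_per_volume); /* 67108864 */
            printf("deduplication window: %llu GB\n", window >> 30);  /* 256 */
            return 0;
    }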

For a large index, that is, one with a memory footprint that is a - * multiple of one GB, the geometry is 1024 index records per 32 KB - * page, 256 record pages per chapter, 26 index pages per chapter, and - * 1024 chapters for every GB of memory footprint. For a one GB - * volume, this yields a deduplication window of 1 TB using about 9 GB - * of persistent storage and 1 GB of RAM. - * - *

For all sizes, the default is zero sparse chapters. A sparse - * volume has about 10 times the deduplication window using 10 times - * as much persistent storage as the equivalent non-sparse volume with - * the same memory footprint. - * - *

If the number of chapters per volume has been reduced by one by - * eliminating physical chapter 0, the virtual chapter that formerly - * mapped to physical chapter 0 may be remapped to another physical - * chapter. This remapping is expressed by storing which virtual - * chapter was remapped, and which physical chapter it was moved to. - **/ -struct geometry { - /** Length of a page in a chapter, in bytes */ - size_t bytes_per_page; - /** Number of record pages in a chapter */ - unsigned int record_pages_per_chapter; - /** Number of (total) chapters in a volume */ - unsigned int chapters_per_volume; - /** Number of sparsely-indexed chapters in a volume */ - unsigned int sparse_chapters_per_volume; - /** Number of bits used to determine delta list numbers */ - unsigned int chapter_delta_list_bits; - /** Virtual chapter remapped from physical chapter 0 */ - uint64_t remapped_virtual; - /** New physical chapter which remapped chapter was moved to */ - uint64_t remapped_physical; - // These are derived properties, expressed as fields for convenience. - /** Total number of pages in a volume, excluding header */ - unsigned int pages_per_volume; - /** Total number of header pages per volume */ - unsigned int header_pages_per_volume; - /** Total number of bytes in a volume, including header */ - size_t bytes_per_volume; - /** Total number of bytes in a chapter */ - size_t bytes_per_chapter; - /** Number of pages in a chapter */ - unsigned int pages_per_chapter; - /** Number of index pages in a chapter index */ - unsigned int index_pages_per_chapter; - /** The minimum ratio of hash slots to records in an open chapter */ - unsigned int open_chapter_load_ratio; - /** Number of records that fit on a page */ - unsigned int records_per_page; - /** Number of records that fit in a chapter */ - unsigned int records_per_chapter; - /** Number of records that fit in a volume */ - uint64_t records_per_volume; - /** Number of delta lists per chapter index */ - unsigned int delta_lists_per_chapter; - /** Mean delta in chapter indexes */ - unsigned int chapter_mean_delta; - /** Number of bits needed for record page numbers */ - unsigned int chapter_payload_bits; - /** Number of bits used to compute addresses for chapter delta lists */ - unsigned int chapter_address_bits; - /** Number of densely-indexed chapters in a volume */ - unsigned int dense_chapters_per_volume; -}; - -enum { - /* The number of bytes in a record (name + metadata) */ - BYTES_PER_RECORD = (UDS_CHUNK_NAME_SIZE + UDS_METADATA_SIZE), - - /* The default length of a page in a chapter, in bytes */ - DEFAULT_BYTES_PER_PAGE = 1024 * BYTES_PER_RECORD, - - /* The default maximum number of records per page */ - DEFAULT_RECORDS_PER_PAGE = DEFAULT_BYTES_PER_PAGE / BYTES_PER_RECORD, - - /** The default number of record pages in a chapter */ - DEFAULT_RECORD_PAGES_PER_CHAPTER = 256, - - /** The default number of record pages in a chapter for a small index */ - SMALL_RECORD_PAGES_PER_CHAPTER = 64, - - /** The default number of chapters in a volume */ - DEFAULT_CHAPTERS_PER_VOLUME = 1024, - - /** The default number of sparsely-indexed chapters in a volume */ - DEFAULT_SPARSE_CHAPTERS_PER_VOLUME = 0, - - /** The log2 of the default mean delta */ - DEFAULT_CHAPTER_MEAN_DELTA_BITS = 16, - - /** The log2 of the number of delta lists in a large chapter */ - DEFAULT_CHAPTER_DELTA_LIST_BITS = 12, - - /** The log2 of the number of delta lists in a small chapter */ - SMALL_CHAPTER_DELTA_LIST_BITS = 10, - - /** The default min ratio of slots to records in an open 
chapter */ - DEFAULT_OPEN_CHAPTER_LOAD_RATIO = 2, - - /** Checkpoint every n chapters written. Default is to not checkpoint */ - DEFAULT_CHECKPOINT_FREQUENCY = 0 -}; - -/** - * Allocate and initialize all fields of a volume geometry using the - * specified layout parameters. - * - * @param bytes_per_page The length of a page in a chapter, in - * bytes - * @param record_pages_per_chapter The number of pages in a chapter - * @param chapters_per_volume The number of chapters in a volume - * @param sparse_chapters_per_volume The number of sparse chapters in a volume - * @param remapped_virtual The remapped virtual chapter - * @param remapped_physical The physical chapter remapped to - * @param geometry_ptr A pointer to hold the new geometry - * - * @return UDS_SUCCESS or an error code - **/ -int __must_check make_geometry(size_t bytes_per_page, - unsigned int record_pages_per_chapter, - unsigned int chapters_per_volume, - unsigned int sparse_chapters_per_volume, - uint64_t remapped_virtual, - uint64_t remapped_physical, - struct geometry **geometry_ptr); - -/** - * Allocate a new geometry and initialize it with the same parameters as an - * existing geometry. - * - * @param source The geometry record to copy - * @param geometry_ptr A pointer to hold the new geometry - * - * @return UDS_SUCCESS or an error code - **/ -int __must_check copy_geometry(struct geometry *source, - struct geometry **geometry_ptr); - -/** - * Clean up a geometry and its memory. - * - * @param geometry The geometry record to free - **/ -void free_geometry(struct geometry *geometry); - -/** - * Map a virtual chapter number to a physical chapter number - * - * @param geometry The geometry - * @param virtual_chapter The virtual chapter number - * - * @return the corresponding physical chapter number - **/ -unsigned int __must_check -map_to_physical_chapter(const struct geometry *geometry, - uint64_t virtual_chapter); - -/** - * Check whether this geometry is reduced by a chapter - * - * @param geometry The geometry to check - * - * @return true if this geometry is reduced by a chapter - **/ -static INLINE bool __must_check -is_reduced_geometry(const struct geometry *geometry) -{ - return !!(geometry->chapters_per_volume & 1); -} - -/** - * Check whether this geometry is for a sparse index. - * - * @param geometry The geometry to check - * - * @return true if this geometry has sparse chapters - **/ -static INLINE bool __must_check is_sparse(const struct geometry *geometry) -{ - return (geometry->sparse_chapters_per_volume > 0); -} - -/** - * Check whether any sparse chapters have been filled. - * - * @param geometry The geometry of the index - * @param oldest_virtual_chapter The number of the oldest chapter in the - * index - * @param newest_virtual_chapter The number of the newest chapter in the - * index - * - * @return true if the index has filled at least one sparse chapter - **/ -bool __must_check has_sparse_chapters(const struct geometry *geometry, - uint64_t oldest_virtual_chapter, - uint64_t newest_virtual_chapter); - -/** - * Check whether a chapter is sparse or dense. 
- * - * @param geometry The geometry of the index containing the - * chapter - * @param oldest_virtual_chapter The number of the oldest chapter in the index - * @param newest_virtual_chapter The number of the newest chapter in the index - * @param virtual_chapter_number The number of the chapter to check - * - * @return true if the chapter is sparse - **/ -bool __must_check is_chapter_sparse(const struct geometry *geometry, - uint64_t oldest_virtual_chapter, - uint64_t newest_virtual_chapter, - uint64_t virtual_chapter_number); - -/** - * Calculate how many chapters to expire after opening the newest chapter. - * - * @param geometry The geometry of the index - * @param newest_chapter The newest virtual chapter number - * - * @return The number of oldest chapters to expire - **/ -unsigned int __must_check chapters_to_expire(const struct geometry *geometry, - uint64_t newest_chapter); - -#endif /* GEOMETRY_H */ diff --git a/uds/hashUtils.c b/uds/hashUtils.c deleted file mode 100644 index 3b62540c..00000000 --- a/uds/hashUtils.c +++ /dev/null @@ -1,107 +0,0 @@ -/* - * Copyright Red Hat - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/uds-releases/krusty/src/uds/hashUtils.c#14 $ - */ - -#include "hashUtils.h" - -#include "errors.h" -#include "logger.h" -#include "permassert.h" -#include "stringUtils.h" -#include "uds.h" - -/** - * Convert a byte string to the hex representation. - * - * @param data binary data to convert - * @param data_len length of binary data - * @param hex target to write hex string into - * @param hex_len capacity of target string - * - * @return UDS_SUCCESS, - * or UDS_INVALID_ARGUMENT if hex_len - * is too short. 
- **/ -static int data_to_hex(const unsigned char *data, - size_t data_len, - char *hex, - size_t hex_len) -{ - size_t i; - if (hex_len < 2 * data_len + 1) { - return uds_log_warning_strerror(UDS_INVALID_ARGUMENT, - "hex data incorrect size"); - } - for (i = 0; i < data_len; ++i) { - int rc = uds_fixed_sprintf(__func__, - &hex[2 * i], - hex_len - (2 * i), - UDS_INVALID_ARGUMENT, - "%02X", - data[i]); - - if (rc != UDS_SUCCESS) { - return rc; - } - } - return UDS_SUCCESS; -} - -/**********************************************************************/ -int chunk_name_to_hex(const struct uds_chunk_name *chunk_name, - char *hex_data, - size_t hex_data_len) -{ - return data_to_hex(chunk_name->name, UDS_CHUNK_NAME_SIZE, - hex_data, hex_data_len); -} - -/**********************************************************************/ -int chunk_data_to_hex(const struct uds_chunk_data *chunk_data, - char *hex_data, - size_t hex_data_len) -{ - return data_to_hex(chunk_data->data, - UDS_METADATA_SIZE, - hex_data, - hex_data_len); -} - -/**********************************************************************/ -unsigned int compute_bits(unsigned int max_value) -{ - // __builtin_clz() counts leading (high-order) zero bits, so if - // we ever need this to be fast, under GCC we can do: - // return ((max_value == 0) ? 0 : (32 - __builtin_clz(max_value))); - - unsigned int bits = 0; - while (max_value > 0) { - max_value >>= 1; - bits++; - } - return bits; -} - -/**********************************************************************/ -void hash_utils_compile_time_assertions(void) -{ - STATIC_ASSERT((UDS_CHUNK_NAME_SIZE % sizeof(uint64_t)) == 0); - STATIC_ASSERT(UDS_CHUNK_NAME_SIZE == 16); -} diff --git a/uds/hashUtils.h b/uds/hashUtils.h deleted file mode 100644 index ad969e22..00000000 --- a/uds/hashUtils.h +++ /dev/null @@ -1,235 +0,0 @@ -/* - * Copyright Red Hat - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/uds-releases/krusty/src/uds/hashUtils.h#13 $ - */ - -#ifndef HASH_UTILS_H -#define HASH_UTILS_H 1 - -#include "compiler.h" -#include "common.h" -#include "geometry.h" -#include "numeric.h" -#include "uds.h" - -// How various portions of a hash are apportioned. Size dependent. -enum { - VOLUME_INDEX_BYTES_OFFSET = 0, // size 8 - CHAPTER_INDEX_BYTES_OFFSET = 8, // size 6 - SAMPLE_BYTES_OFFSET = 14, // size 2 - VOLUME_INDEX_BYTES_COUNT = 8, - CHAPTER_INDEX_BYTES_COUNT = 6, - SAMPLE_BYTES_COUNT = 2, -}; - -/** - * Extract the portion of a block name used by the chapter index. 
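The 16-byte block name is thus carved into an 8-byte volume index field, a 6-byte chapter index field, and a 2-byte sampling field at offsets 0, 8, and 14. A stand-alone sketch of that split (illustrative only; be_bytes() is a stand-in for the kernel's get_unaligned_be*() accessors used by the real extraction helpers) might look like this::

    #include <stdint.h>
    #include <stdio.h>

    /* Split a 16-byte block name into the volume index (8 bytes), chapter
     * index (6 bytes), and sampling (2 bytes) fields, mirroring the enum
     * of offsets above. */
    static uint64_t be_bytes(const unsigned char *p, unsigned int count)
    {
            uint64_t value = 0;
            unsigned int i;

            for (i = 0; i < count; i++)
                    value = (value << 8) | p[i];
            return value;
    }

    int main(void)
    {
            unsigned char name[16];
            unsigned int i;

            for (i = 0; i < 16; i++)
                    name[i] = i + 1;

            printf("volume index bytes:  %016llx\n",
                   (unsigned long long) be_bytes(&name[0], 8));
            printf("chapter index bytes: %012llx\n",
                   (unsigned long long) be_bytes(&name[8], 6));
            printf("sampling bytes:      %04llx\n",
                   (unsigned long long) be_bytes(&name[14], 2));
            return 0;
    }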
- * - * @param name The block name - * - * @return The chapter index bytes - **/ -static INLINE uint64_t -extract_chapter_index_bytes(const struct uds_chunk_name *name) -{ - // Get the high order 16 bits, then the low order 32 bits - const byte *chapter_bits = &name->name[CHAPTER_INDEX_BYTES_OFFSET]; - uint64_t bytes = (uint64_t) get_unaligned_be16(chapter_bits) << 32; - bytes |= get_unaligned_be32(chapter_bits + 2); - return bytes; -} - -/** - * Extract the portion of a block name used by the volume index. - * - * @param name The block name - * - * @return The volume index portion of the block name - **/ -static INLINE uint64_t -extract_volume_index_bytes(const struct uds_chunk_name *name) -{ - return get_unaligned_be64(&name->name[VOLUME_INDEX_BYTES_OFFSET]); -} - -/** - * Extract the portion of a block name used for sparse sampling. - * - * @param name The block name - * - * @return The sparse sample portion of the block name - **/ -static INLINE uint32_t extract_sampling_bytes(const struct uds_chunk_name *name) -{ - return get_unaligned_be16(&name->name[SAMPLE_BYTES_OFFSET]); -} - -/** - * For a given block, find the chapter delta list to use - * - * @param name The block name to hash - * @param geometry The geometry to use - * - * @return The chapter delta list where we expect to find the given blockname - **/ -static INLINE unsigned int -hash_to_chapter_delta_list(const struct uds_chunk_name *name, - const struct geometry *geometry) -{ - return (unsigned int) ((extract_chapter_index_bytes(name) >> - geometry->chapter_address_bits) & - ((1 << geometry->chapter_delta_list_bits) - 1)); -} - -/** - * For a given block, find the chapter delta address to use - * - * @param name The block name to hash - * @param geometry The geometry to use - * - * @return The chapter delta address to use - **/ -static INLINE unsigned int -hash_to_chapter_delta_address(const struct uds_chunk_name *name, - const struct geometry *geometry) -{ - return (unsigned int) (extract_chapter_index_bytes(name) & - ((1 << geometry->chapter_address_bits) - 1)); -} - -/** - * For a given block name, find the slot in the open chapter hash table - * where it is expected to reside. - * - * @param name The block name to hash - * @param slot_count The size of the hash table - * - * @return the record number in the index page where we expect to find - # the given blockname - **/ -static INLINE unsigned int name_to_hash_slot(const struct uds_chunk_name *name, - unsigned int slot_count) -{ - return (unsigned int) (extract_chapter_index_bytes(name) % slot_count); -} - -/** - * Convert a chunk name to hex to make it more readable. - * - * @param chunk_name The chunk name - * @param hex_data The resulting hexdata from the given chunk name - * @param hex_data_len The capacity of hex_data - * - * @return UDS_SUCCESS, - * or UDS_INVALID_ARGUMENT if hex_data_len - * is too short. - **/ -int __must_check chunk_name_to_hex(const struct uds_chunk_name *chunk_name, - char *hex_data, - size_t hex_data_len); - -/** - * Convert chunk data to hex to make it more readable. - * - * @param chunk_data The chunk data - * @param hex_data The resulting hexdata from the given chunk data - * @param hex_data_len The capacity of hex_data - * - * @return UDS_SUCCESS, - * or UDS_INVALID_ARGUMENT if hex_data_len - * is too short. - **/ -int __must_check chunk_data_to_hex(const struct uds_chunk_data *chunk_data, - char *hex_data, - size_t hex_data_len); - -/** - * Compute the number of bits required to store a field with the given - * maximum value. 
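For example, a stand-alone copy of the loop used by compute_bits() in hashUtils.c above (renamed compute_bits_example() here purely for illustration) gives 0 for 0, 10 for 1023, and 11 for 1024::

    #include <stdio.h>

    /* Stand-alone copy of the compute_bits() loop for illustration. */
    static unsigned int compute_bits_example(unsigned int max_value)
    {
            unsigned int bits = 0;

            while (max_value > 0) {
                    max_value >>= 1;
                    bits++;
            }
            return bits;
    }

    int main(void)
    {
            /* 0 -> 0 bits, 1023 -> 10 bits, 1024 -> 11 bits */
            printf("%u %u %u\n", compute_bits_example(0),
                   compute_bits_example(1023), compute_bits_example(1024));
            return 0;
    }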
- * - * @param max_value The maximum value of the field - * - * @return the number of bits required - **/ -unsigned int __must_check compute_bits(unsigned int max_value); - -/** - * FOR TESTING. Set the portion of a block name used by the chapter index. - * - * @param name The block name - * @param value The value to store - **/ -static INLINE void set_chapter_index_bytes(struct uds_chunk_name *name, - uint64_t value) -{ - // Store the high order bytes, then the low-order bytes - put_unaligned_be16((uint16_t)(value >> 32), - &name->name[CHAPTER_INDEX_BYTES_OFFSET]); - put_unaligned_be32((uint32_t) value, - &name->name[CHAPTER_INDEX_BYTES_OFFSET + 2]); -} - -/** - * FOR TESTING. Set the bits used to find a chapter delta list - * - * @param name The block name - * @param geometry The geometry to use - * @param value The value to store - **/ -static INLINE void set_chapter_delta_list_bits(struct uds_chunk_name *name, - const struct geometry *geometry, - uint64_t value) -{ - uint64_t delta_address = hash_to_chapter_delta_address(name, geometry); - delta_address |= value << geometry->chapter_address_bits; - set_chapter_index_bytes(name, delta_address); -} - -/** - * FOR TESTING. Set the portion of a block name used by the volume index. - * - * @param name The block name - * @param val The value to store - **/ -static INLINE void set_volume_index_bytes(struct uds_chunk_name *name, - uint64_t val) -{ - put_unaligned_be64(val, &name->name[VOLUME_INDEX_BYTES_OFFSET]); -} - -/** - * Set the portion of a block name used for sparse sampling. - * - * @param name The block name - * @param value The value to store - **/ -static INLINE void set_sampling_bytes(struct uds_chunk_name *name, - uint32_t value) -{ - put_unaligned_be16((uint16_t) value, &name->name[SAMPLE_BYTES_OFFSET]); -} - -/** - * Special function wrapper required for compile-time assertions. This - * function will fail to compile if UDS_CHUNK_NAME_SIZE is not an integer - * multiple of 8. - **/ -void hash_utils_compile_time_assertions(void); - -#endif /* HASH_UTILS_H */ diff --git a/uds/index.c b/uds/index.c deleted file mode 100644 index 15effde3..00000000 --- a/uds/index.c +++ /dev/null @@ -1,1295 +0,0 @@ -/* - * Copyright Red Hat - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/uds-releases/krusty/src/uds/index.c#74 $ - */ - - -#include "index.h" - -#include "hashUtils.h" -#include "indexCheckpoint.h" -#include "indexStateData.h" -#include "logger.h" -#include "openChapter.h" -#include "requestQueue.h" -#include "zone.h" - -static const unsigned int MAX_COMPONENT_COUNT = 4; -static const uint64_t NO_LAST_CHECKPOINT = UINT_MAX; - - -/** - * Get the zone for a request. 
- * - * @param index The index - * @param request The request - * - * @return The zone for the request - **/ -static struct index_zone *get_request_zone(struct uds_index *index, - struct uds_request *request) -{ - return index->zones[request->zone_number]; -} - -/** - * Triage an index request, deciding whether it requires that a sparse cache - * barrier message precede it. - * - * This resolves the chunk name in the request in the volume index, - * determining if it is a hook or not, and if a hook, what virtual chapter (if - * any) it might be found in. If a virtual chapter is found, it checks whether - * that chapter appears in the sparse region of the index. If all these - * conditions are met, the (sparse) virtual chapter number is returned. In all - * other cases it returns UINT64_MAX. - * - * @param index the index that will process the request - * @param request the index request containing the chunk name to triage - * - * @return the sparse chapter number for the sparse cache barrier message, or - * UINT64_MAX if the request does not require a barrier - **/ -static uint64_t triage_index_request(struct uds_index *index, - struct uds_request *request) -{ - struct volume_index_triage triage; - struct index_zone *zone; - lookup_volume_index_name(index->volume_index, &request->chunk_name, - &triage); - if (!triage.in_sampled_chapter) { - // Not indexed or not a hook. - return UINT64_MAX; - } - - zone = get_request_zone(index, request); - if (!is_zone_chapter_sparse(zone, triage.virtual_chapter)) { - return UINT64_MAX; - } - - // XXX Optimize for a common case by remembering the chapter from the - // most recent barrier message and skipping this chapter if is it the - // same. - - // Return the sparse chapter number to trigger the barrier messages. - return triage.virtual_chapter; -} - -/** - * Construct and enqueue asynchronous control messages to add the chapter - * index for a given virtual chapter to the sparse chapter index cache. - * - * @param index the index with the relevant cache and chapter - * @param virtual_chapter the virtual chapter number of the chapter to cache - **/ -static void enqueue_barrier_messages(struct uds_index *index, - uint64_t virtual_chapter) -{ - struct uds_zone_message message = { - .type = UDS_MESSAGE_SPARSE_CACHE_BARRIER, - .index = index, - .virtual_chapter = virtual_chapter, - }; - unsigned int zone; - for (zone = 0; zone < index->zone_count; zone++) { - int result = launch_zone_message(message, zone, index); - ASSERT_LOG_ONLY((result == UDS_SUCCESS), - "barrier message allocation"); - } -} - -/** - * Simulate the creation of a sparse cache barrier message by the triage - * queue, and the later execution of that message in an index zone. - * - * If the index receiving the request is multi-zone or dense, this function - * does nothing. This simulation is an optimization for single-zone sparse - * indexes. It also supports unit testing of indexes without queues. - * - * @param zone the index zone responsible for the index request - * @param request the index request about to be executed - * - * @return UDS_SUCCESS always - **/ -static int simulate_index_zone_barrier_message(struct index_zone *zone, - struct uds_request *request) -{ - uint64_t sparse_virtual_chapter; - // Do nothing unless this is a single-zone sparse index. - if ((zone->index->zone_count > 1) || - !is_sparse(zone->index->volume->geometry)) { - return UDS_SUCCESS; - } - - // Check if the index request is for a sampled name in a sparse - // chapter. 
- sparse_virtual_chapter = triage_index_request(zone->index, request); - if (sparse_virtual_chapter == UINT64_MAX) { - // Not indexed, not a hook, or in a chapter that is still - // dense, which means there should be no change to the sparse - // chapter index cache. - return UDS_SUCCESS; - } - - /* - * The triage queue would have generated and enqueued a barrier message - * preceding this request, which we simulate by directly invoking the - * message function. - */ - return update_sparse_cache(zone, sparse_virtual_chapter); -} - -/** - * This is the request processing function for the triage stage queue. Each - * request is resolved in the volume index, determining if it is a hook or - * not, and if a hook, what virtual chapter (if any) it might be found in. If - * a virtual chapter is found, this enqueues a sparse chapter cache barrier in - * every zone before enqueueing the request in its zone. - * - * @param request the request to triage - **/ -static void triage_request(struct uds_request *request) -{ - struct uds_index *index = request->index; - - // Check if the name is a hook in the index pointing at a sparse - // chapter. - uint64_t sparse_virtual_chapter = triage_index_request(index, request); - if (sparse_virtual_chapter != UINT64_MAX) { - // Generate and place a barrier request on every zone queue. - enqueue_barrier_messages(index, sparse_virtual_chapter); - } - - enqueue_request(request, STAGE_INDEX); -} - -/** - * This is the request processing function invoked by the zone's - * uds_request_queue worker thread. - * - * @param request the request to be indexed or executed by the zone worker - **/ -static void execute_zone_request(struct uds_request *request) -{ - int result; - struct uds_index *index = request->index; - - if (request->zone_message.type != UDS_MESSAGE_NONE) { - result = dispatch_index_zone_control_request(request); - if (result != UDS_SUCCESS) { - uds_log_error_strerror(result, - "error executing message: %d", - request->zone_message.type); - } - /* - * Asynchronous control messages are complete when they are - * executed. There should be nothing they need to do on the - * callback thread. The message has been completely processed, - * so just free it. - */ - UDS_FREE(UDS_FORGET(request)); - return; - } - - index->need_to_save = true; - if (request->requeued && !is_successful(request->status)) { - index->callback(request); - return; - } - - result = dispatch_index_request(index, request); - if (result == UDS_QUEUED) { - // Take the request off the pipeline. - return; - } - - request->status = result; - index->callback(request); -} - -/** - * Initialize the zone queues and the triage queue. - * - * @param index the index containing the queues - * @param geometry the geometry governing the indexes - * - * @return UDS_SUCCESS or error code - **/ -static int initialize_index_queues(struct uds_index *index, - const struct geometry *geometry) -{ - unsigned int i; - for (i = 0; i < index->zone_count; i++) { - int result = make_uds_request_queue("indexW", - &execute_zone_request, - &index->zone_queues[i]); - if (result != UDS_SUCCESS) { - return result; - } - } - - // The triage queue is only needed for sparse multi-zone indexes. - if ((index->zone_count > 1) && is_sparse(geometry)) { - int result = make_uds_request_queue("triageW", &triage_request, - &index->triage_queue); - if (result != UDS_SUCCESS) { - return result; - } - } - - return UDS_SUCCESS; -} - -/** - * Replay an index which was loaded from a checkpoint. 
- * - * @param index The index to replay - * @param last_checkpoint_chapter The number of the chapter where the - * last checkpoint was made - * - * @return UDS_SUCCESS or an error code. - **/ -static int replay_index_from_checkpoint(struct uds_index *index, - uint64_t last_checkpoint_chapter) -{ - // Find the volume chapter boundaries - unsigned int chapters_per_volume; - int result; - uint64_t lowest_vcn, highest_vcn, first_replay_chapter; - bool is_empty = false; - enum index_lookup_mode old_lookup_mode = index->volume->lookup_mode; - index->volume->lookup_mode = LOOKUP_FOR_REBUILD; - result = find_volume_chapter_boundaries(index->volume, - &lowest_vcn, &highest_vcn, - &is_empty); - index->volume->lookup_mode = old_lookup_mode; - if (result != UDS_SUCCESS) { - return uds_log_fatal_strerror(result, - "cannot replay index: unknown volume chapter boundaries"); - } - if (lowest_vcn > highest_vcn) { - uds_log_fatal("cannot replay index: no valid chapters exist"); - return UDS_CORRUPT_COMPONENT; - } - - if (is_empty) { - // The volume is empty, so the index should also be empty - if (index->newest_virtual_chapter != 0) { - uds_log_fatal("cannot replay index from empty volume"); - return UDS_CORRUPT_COMPONENT; - } - return UDS_SUCCESS; - } - - chapters_per_volume = index->volume->geometry->chapters_per_volume; - index->oldest_virtual_chapter = lowest_vcn; - index->newest_virtual_chapter = highest_vcn + 1; - if (index->newest_virtual_chapter == - lowest_vcn + chapters_per_volume) { - // skip the chapter shadowed by the open chapter - index->oldest_virtual_chapter++; - } - - first_replay_chapter = last_checkpoint_chapter; - if (first_replay_chapter < index->oldest_virtual_chapter) { - first_replay_chapter = index->oldest_virtual_chapter; - } - return replay_volume(index, first_replay_chapter); -} - -/**********************************************************************/ -static int load_index(struct uds_index *index, bool allow_replay) -{ - uint64_t last_checkpoint_chapter; - unsigned int i; - bool replay_required = false; - - int result = load_index_state(index->state, &replay_required); - if (result != UDS_SUCCESS) { - return result; - } - - if (replay_required && !allow_replay) { - return uds_log_error_strerror(UDS_INDEX_NOT_SAVED_CLEANLY, - "index not saved cleanly: open chapter missing"); - } - - last_checkpoint_chapter = - ((index->last_checkpoint != NO_LAST_CHECKPOINT) ? - index->last_checkpoint : - 0); - - uds_log_info("loaded index from chapter %llu through chapter %llu", - (unsigned long long) index->oldest_virtual_chapter, - (unsigned long long) last_checkpoint_chapter); - - if (replay_required) { - result = replay_index_from_checkpoint(index, - last_checkpoint_chapter); - if (result != UDS_SUCCESS) { - return result; - } - } - - for (i = 0; i < index->zone_count; i++) { - set_active_chapters(index->zones[i]); - } - - index->loaded_type = replay_required ? 
LOAD_REPLAY : LOAD_LOAD; - return UDS_SUCCESS; -} - -/**********************************************************************/ -static int rebuild_index(struct uds_index *index) -{ - // Find the volume chapter boundaries - int result; - unsigned int i; - uint64_t lowest_vcn, highest_vcn; - bool is_empty = false; - enum index_lookup_mode old_lookup_mode = index->volume->lookup_mode; - index->volume->lookup_mode = LOOKUP_FOR_REBUILD; - result = find_volume_chapter_boundaries(index->volume, &lowest_vcn, - &highest_vcn, &is_empty); - index->volume->lookup_mode = old_lookup_mode; - if (result != UDS_SUCCESS) { - return uds_log_fatal_strerror(result, - "cannot rebuild index: unknown volume chapter boundaries"); - } - if (lowest_vcn > highest_vcn) { - uds_log_fatal("cannot rebuild index: no valid chapters exist"); - return UDS_CORRUPT_COMPONENT; - } - - if (is_empty) { - index->newest_virtual_chapter = - index->oldest_virtual_chapter = 0; - } else { - unsigned int num_chapters = - index->volume->geometry->chapters_per_volume; - index->newest_virtual_chapter = highest_vcn + 1; - index->oldest_virtual_chapter = lowest_vcn; - if (index->newest_virtual_chapter == - (index->oldest_virtual_chapter + num_chapters)) { - // skip the chapter shadowed by the open chapter - index->oldest_virtual_chapter++; - } - } - - if ((index->newest_virtual_chapter - index->oldest_virtual_chapter) > - index->volume->geometry->chapters_per_volume) { - return uds_log_fatal_strerror(UDS_CORRUPT_COMPONENT, - "cannot rebuild index: volume chapter boundaries too large"); - } - - set_volume_index_open_chapter(index->volume_index, 0); - if (is_empty) { - index->loaded_type = LOAD_EMPTY; - return UDS_SUCCESS; - } - - result = replay_volume(index, index->oldest_virtual_chapter); - if (result != UDS_SUCCESS) { - return result; - } - - for (i = 0; i < index->zone_count; i++) { - set_active_chapters(index->zones[i]); - } - - index->loaded_type = LOAD_REBUILD; - return UDS_SUCCESS; -} - -/**********************************************************************/ -int allocate_index(struct index_layout *layout, - const struct configuration *config, - const struct uds_parameters *user_params, - unsigned int zone_count, - struct uds_index **new_index) -{ - struct uds_index *index; - int result; - unsigned int i; - unsigned int checkpoint_frequency = - user_params == NULL ? 
0 : user_params->checkpoint_frequency; - if (checkpoint_frequency >= config->geometry->chapters_per_volume) { - uds_log_error("checkpoint frequency too large"); - return -EINVAL; - } - - result = UDS_ALLOCATE_EXTENDED(struct uds_index, - zone_count, - struct uds_request_queue *, - "index", - &index); - if (result != UDS_SUCCESS) { - return result; - } - - index->loaded_type = LOAD_UNDEFINED; - - result = make_index_checkpoint(index); - if (result != UDS_SUCCESS) { - free_index(index); - return result; - } - set_index_checkpoint_frequency(index->checkpoint, - checkpoint_frequency); - - get_uds_index_layout(layout, &index->layout); - index->zone_count = zone_count; - - result = UDS_ALLOCATE(index->zone_count, struct index_zone *, "zones", - &index->zones); - if (result != UDS_SUCCESS) { - free_index(index); - return result; - } - - result = make_index_state(layout, index->zone_count, - MAX_COMPONENT_COUNT, &index->state); - if (result != UDS_SUCCESS) { - free_index(index); - return result; - } - - result = add_index_state_component(index->state, &INDEX_STATE_INFO, - index, NULL); - if (result != UDS_SUCCESS) { - free_index(index); - return result; - } - - result = make_volume(config, index->layout, - user_params, - VOLUME_CACHE_DEFAULT_MAX_QUEUED_READS, - index->zone_count, &index->volume); - if (result != UDS_SUCCESS) { - free_index(index); - return result; - } - index->volume->lookup_mode = LOOKUP_NORMAL; - - for (i = 0; i < index->zone_count; i++) { - result = make_index_zone(index, i); - if (result != UDS_SUCCESS) { - free_index(index); - return uds_log_error_strerror(result, - "Could not create index zone"); - } - } - - result = add_index_state_component(index->state, &OPEN_CHAPTER_INFO, - index, NULL); - if (result != UDS_SUCCESS) { - free_index(index); - return uds_log_error_strerror(result, - "Could not create open chapter"); - } - - *new_index = index; - return UDS_SUCCESS; -} - -/**********************************************************************/ -int make_index(struct index_layout *layout, - const struct configuration *config, - const struct uds_parameters *user_params, - enum load_type load_type, - struct index_load_context *load_context, - index_callback_t callback, - struct uds_index **new_index) -{ - struct uds_index *index; - uint64_t nonce; - unsigned int zone_count = get_zone_count(user_params); - int result = allocate_index(layout, config, user_params, zone_count, - &index); - if (result != UDS_SUCCESS) { - return uds_log_error_strerror(result, - "could not allocate index"); - } - - index->load_context = load_context; - index->callback = callback; - - result = initialize_index_queues(index, config->geometry); - if (result != UDS_SUCCESS) { - free_index(index); - return result; - } - - nonce = get_uds_volume_nonce(layout); - result = make_volume_index(config, zone_count, nonce, - &index->volume_index); - if (result != UDS_SUCCESS) { - free_index(index); - return uds_log_error_strerror(result, - "could not make volume index"); - } - - result = add_index_state_component(index->state, VOLUME_INDEX_INFO, - NULL, index->volume_index); - if (result != UDS_SUCCESS) { - free_index(index); - return result; - } - - result = add_index_state_component(index->state, - &INDEX_PAGE_MAP_INFO, - index->volume->index_page_map, - NULL); - if (result != UDS_SUCCESS) { - free_index(index); - return result; - } - - result = make_chapter_writer(index, &index->chapter_writer); - if (result != UDS_SUCCESS) { - free_index(index); - return result; - } - - if ((load_type == LOAD_LOAD) || 
(load_type == LOAD_REBUILD)) { - result = load_index(index, load_type == LOAD_REBUILD); - switch (result) { - case UDS_SUCCESS: - break; - case -ENOMEM: - // We should not try a rebuild for this error. - uds_log_error_strerror(result, - "index could not be loaded"); - break; - default: - uds_log_error_strerror(result, - "index could not be loaded"); - if (load_type == LOAD_REBUILD) { - result = rebuild_index(index); - if (result != UDS_SUCCESS) { - uds_log_error_strerror(result, - "index could not be rebuilt"); - } - } - break; - } - } else { - index->loaded_type = LOAD_CREATE; - discard_index_state_data(index->state); - } - - if (result != UDS_SUCCESS) { - free_index(index); - return uds_log_error_strerror(result, - "fatal error in make_index"); - } - - if (index->load_context != NULL) { - uds_lock_mutex(&index->load_context->mutex); - index->load_context->status = INDEX_READY; - // If we get here, suspend is meaningless, but notify any - // thread trying to suspend us so it doesn't hang. - uds_broadcast_cond(&index->load_context->cond); - uds_unlock_mutex(&index->load_context->mutex); - } - - index->has_saved_open_chapter = (index->loaded_type == LOAD_LOAD); - index->need_to_save = (index->loaded_type != LOAD_LOAD); - *new_index = index; - return UDS_SUCCESS; -} - -/**********************************************************************/ -void free_index(struct uds_index *index) -{ - unsigned int i; - - if (index == NULL) { - return; - } - - uds_request_queue_finish(index->triage_queue); - for (i = 0; i < index->zone_count; i++) { - uds_request_queue_finish(index->zone_queues[i]); - } - - free_chapter_writer(index->chapter_writer); - - if (index->volume_index != NULL) { - free_volume_index(index->volume_index); - } - - if (index->zones != NULL) { - for (i = 0; i < index->zone_count; i++) { - free_index_zone(index->zones[i]); - } - UDS_FREE(index->zones); - } - - free_volume(index->volume); - free_index_state(index->state); - free_index_checkpoint(index->checkpoint); - put_uds_index_layout(UDS_FORGET(index->layout)); - UDS_FREE(index); -} - -/**********************************************************************/ -int save_index(struct uds_index *index) -{ - int result; - - if (!index->need_to_save) { - return UDS_SUCCESS; - } - wait_for_idle_chapter_writer(index->chapter_writer); - result = finish_checkpointing(index); - if (result != UDS_SUCCESS) { - uds_log_info("save index failed"); - return result; - } - begin_save(index, false, index->newest_virtual_chapter); - - result = save_index_state(index->state); - if (result != UDS_SUCCESS) { - uds_log_info("save index failed"); - index->last_checkpoint = index->prev_checkpoint; - } else { - index->has_saved_open_chapter = true; - index->need_to_save = false; - uds_log_info("finished save (vcn %llu)", - (unsigned long long) index->last_checkpoint); - } - return result; -} - -/**********************************************************************/ -int replace_index_storage(struct uds_index *index, const char *path) -{ - return replace_volume_storage(index->volume, index->layout, path); -} - -/** - * Search an index zone. This function is only correct for LRU. - * - * @param zone The index zone to query. - * @param request The request originating the query. 
- * - * @return UDS_SUCCESS or an error code - **/ -static int search_index_zone(struct index_zone *zone, - struct uds_request *request) -{ - struct volume_index_record record; - bool overflow_record, found = false; - struct uds_chunk_data *metadata; - uint64_t chapter; - int result = get_volume_index_record(zone->index->volume_index, - &request->chunk_name, &record); - if (result != UDS_SUCCESS) { - return result; - } - - if (record.is_found) { - result = get_record_from_zone(zone, request, &found, - record.virtual_chapter); - if (result != UDS_SUCCESS) { - return result; - } - if (found) { - request->location = - compute_index_region(zone, - record.virtual_chapter); - } - } - - /* - * If a record has overflowed a chapter index in more than one chapter - * (or overflowed in one chapter and collided with an existing record), - * it will exist as a collision record in the volume index, but - * we won't find it in the volume. This case needs special handling. - */ - overflow_record = (record.is_found && record.is_collision && !found); - chapter = zone->newest_virtual_chapter; - if (found || overflow_record) { - if ((request->type == UDS_QUERY) && - (!request->update || overflow_record)) { - /* This is a query without update, or with nothing to - * update */ - return UDS_SUCCESS; - } - - if (record.virtual_chapter != chapter) { - /* - * Update the volume index to reference the new chapter - * for the block. If the record had been deleted or - * dropped from the chapter index, it will be back. - */ - result = set_volume_index_record_chapter(&record, - chapter); - } else if (request->type != UDS_UPDATE) { - /* The record is already in the open chapter, so we're - * done */ - return UDS_SUCCESS; - } - } else { - // The record wasn't in the volume index, so check whether the - // name is in a cached sparse chapter. - if (!is_volume_index_sample(zone->index->volume_index, - &request->chunk_name) && - is_sparse(zone->index->volume->geometry)) { - // Passing UINT64_MAX triggers a search of the entire - // sparse cache. - result = search_sparse_cache_in_zone(zone, request, - UINT64_MAX, - &found); - if (result != UDS_SUCCESS) { - return result; - } - - if (found) { - request->location = UDS_LOCATION_IN_SPARSE; - } - } - - if (request->type == UDS_QUERY) { - if (!found || !request->update) { - // This is a query without update or for a new - // record, so we're done. - return UDS_SUCCESS; - } - } - - /* - * Add a new entry to the volume index referencing the open - * chapter. This needs to be done both for new records, and for - * records from cached sparse chapters. - */ - result = put_volume_index_record(&record, chapter); - } - - if (result == UDS_OVERFLOW) { - /* - * The volume index encountered a delta list overflow. The - * condition was already logged. We will go on without adding - * the chunk to the open chapter. - */ - return UDS_SUCCESS; - } - - if (result != UDS_SUCCESS) { - return result; - } - - if (!found || (request->type == UDS_UPDATE)) { - // This is a new record or we're updating an existing record. - metadata = &request->new_metadata; - } else { - // This is a duplicate, so move the record to the open chapter - // (for LRU). 
- metadata = &request->old_metadata; - } - return put_record_in_zone(zone, request, metadata); -} - -/**********************************************************************/ -static int remove_from_index_zone(struct index_zone *zone, - struct uds_request *request) -{ - struct volume_index_record record; - int result = get_volume_index_record(zone->index->volume_index, - &request->chunk_name, &record); - if (result != UDS_SUCCESS) { - return result; - } - - if (!record.is_found) { - // The name does not exist in volume index, so there is nothing - // to remove. - return UDS_SUCCESS; - } - - if (!record.is_collision) { - // Non-collision records are hints, so resolve the name in the - // chapter. - bool found; - int result = get_record_from_zone(zone, request, &found, - record.virtual_chapter); - if (result != UDS_SUCCESS) { - return result; - } - - if (!found) { - // The name does not exist in the chapter, so there is - // nothing to remove. - return UDS_SUCCESS; - } - } - - request->location = compute_index_region(zone, record.virtual_chapter); - - /* - * Delete the volume index entry for the named record only. Note that a - * later search might later return stale advice if there is a colliding - * name in the same chapter, but it's a very rare case (1 in 2^21). - */ - result = remove_volume_index_record(&record); - if (result != UDS_SUCCESS) { - return result; - } - - // If the record is in the open chapter, we must remove it or mark it - // deleted to avoid trouble if the record is added again later. - if (request->location == UDS_LOCATION_IN_OPEN_CHAPTER) { - bool hash_exists = false; - remove_from_open_chapter(zone->open_chapter, - &request->chunk_name, - &hash_exists); - result = ASSERT(hash_exists, - "removing record not found in open chapter"); - if (result != UDS_SUCCESS) { - return result; - } - } - - return UDS_SUCCESS; -} - -/**********************************************************************/ -int dispatch_index_request(struct uds_index *index, - struct uds_request *request) -{ - int result; - struct index_zone *zone = get_request_zone(index, request); - - if (!request->requeued) { - // Single-zone sparse indexes don't have a triage queue to - // generate cache barrier requests, so see if we need to - // synthesize a barrier. 
- int result = - simulate_index_zone_barrier_message(zone, request); - if (result != UDS_SUCCESS) { - return result; - } - } - - request->location = UDS_LOCATION_UNKNOWN; - - switch (request->type) { - case UDS_POST: - case UDS_UPDATE: - case UDS_QUERY: - result = search_index_zone(zone, request); - break; - - case UDS_DELETE: - result = remove_from_index_zone(zone, request); - break; - - default: - result = uds_log_warning_strerror(UDS_INVALID_ARGUMENT, - "invalid request type: %d", - request->type); - break; - } - - if (request->location == UDS_LOCATION_UNKNOWN) { - request->location = UDS_LOCATION_UNAVAILABLE; - } - return result; -} - -/**********************************************************************/ -static int rebuild_index_page_map(struct uds_index *index, uint64_t vcn) -{ - struct geometry *geometry = index->volume->geometry; - unsigned int chapter = map_to_physical_chapter(geometry, vcn); - unsigned int expected_list_number = 0; - unsigned int index_page_number; - for (index_page_number = 0; - index_page_number < geometry->index_pages_per_chapter; - index_page_number++) { - unsigned int lowest_delta_list, highest_delta_list; - struct delta_index_page *chapter_index_page; - int result = get_volume_page(index->volume, - chapter, index_page_number, - CACHE_PROBE_INDEX_FIRST, NULL, - &chapter_index_page); - if (result != UDS_SUCCESS) { - return uds_log_error_strerror(result, - "failed to read index page %u in chapter %u", - index_page_number, - chapter); - } - lowest_delta_list = chapter_index_page->lowest_list_number; - highest_delta_list = chapter_index_page->highest_list_number; - if (lowest_delta_list != expected_list_number) { - return uds_log_error_strerror(UDS_CORRUPT_DATA, - "chapter %u index page %u is corrupt", - chapter, - index_page_number); - } - result = update_index_page_map(index->volume->index_page_map, - vcn, - chapter, - index_page_number, - highest_delta_list); - if (result != UDS_SUCCESS) { - return uds_log_error_strerror(result, - "failed to update chapter %u index page %u", - chapter, - index_page_number); - } - expected_list_number = highest_delta_list + 1; - } - return UDS_SUCCESS; -} - -/** - * Add an entry to the volume index when rebuilding. - * - * @param index The index to query. - * @param name The block name of interest. - * @param virtual_chapter The virtual chapter number to write to the - * volume index - * @param will_be_sparse_chapter True if this entry will be in the sparse - * portion of the index at the end of - * rebuilding - * - * @return UDS_SUCCESS or an error code - **/ -static int replay_record(struct uds_index *index, - const struct uds_chunk_name *name, - uint64_t virtual_chapter, - bool will_be_sparse_chapter) -{ - struct volume_index_record record; - bool update_record; - int result; - if (will_be_sparse_chapter && - !is_volume_index_sample(index->volume_index, name)) { - // This entry will be in a sparse chapter after the rebuild - // completes, and it is not a sample, so just skip over it. 
- return UDS_SUCCESS; - } - - result = get_volume_index_record(index->volume_index, name, &record); - if (result != UDS_SUCCESS) { - return result; - } - - if (record.is_found) { - if (record.is_collision) { - if (record.virtual_chapter == virtual_chapter) { - /* The record is already correct, so we don't - * need to do anything */ - return UDS_SUCCESS; - } - update_record = true; - } else if (record.virtual_chapter == virtual_chapter) { - /* - * There is a volume index entry pointing to the - * current chapter, but we don't know if it is for the - * same name as the one we are currently working on or - * not. For now, we're just going to assume that it - * isn't. This will create one extra collision record - * if there was a deleted record in the current - * chapter. - */ - update_record = false; - } else { - /* - * If we're rebuilding, we don't normally want to go to - * disk to see if the record exists, since we will - * likely have just read the record from disk (i.e. we - * know it's there). The exception to this is when we - * already find an entry in the volume index that has a - * different chapter. In this case, we need to search - * that chapter to determine if the volume index entry - * was for the same record or a different one. - */ - result = search_volume_page_cache(index->volume, - NULL, name, - record.virtual_chapter, - NULL, &update_record); - if (result != UDS_SUCCESS) { - return result; - } - } - } else { - update_record = false; - } - - if (update_record) { - /* - * Update the volume index to reference the new chapter for the - * block. If the record had been deleted or dropped from the - * chapter index, it will be back. - */ - result = set_volume_index_record_chapter(&record, - virtual_chapter); - } else { - /* - * Add a new entry to the volume index referencing the open - * chapter. This should be done regardless of whether we are a - * brand new record or a sparse record, i.e. one that doesn't - * exist in the index but does on disk, since for a sparse - * record, we would want to un-sparsify if it did exist. - */ - result = put_volume_index_record(&record, virtual_chapter); - } - - if ((result == UDS_DUPLICATE_NAME) || (result == UDS_OVERFLOW)) { - /* Ignore duplicate record and delta list overflow errors */ - return UDS_SUCCESS; - } - - return result; -} - -/**********************************************************************/ -void begin_save(struct uds_index *index, - bool checkpoint, - uint64_t open_chapter_number) -{ - index->prev_checkpoint = index->last_checkpoint; - index->last_checkpoint = - ((open_chapter_number == 0) ? NO_LAST_CHECKPOINT : - open_chapter_number - 1); - uds_log_info("beginning %s (vcn %llu)", - (checkpoint ? "checkpoint" : "save"), - (unsigned long long) index->last_checkpoint); -} - -/** - * Suspend the index if necessary and wait for a signal to resume. - * - * @param index The index to replay - * - * @return true if the replay should terminate - **/ -static bool check_for_suspend(struct uds_index *index) -{ - bool ret_val; - if (index->load_context == NULL) { - return false; - } - - uds_lock_mutex(&index->load_context->mutex); - if (index->load_context->status != INDEX_SUSPENDING) { - uds_unlock_mutex(&index->load_context->mutex); - return false; - } - - // Notify that we are suspended and wait for the resume. 
- index->load_context->status = INDEX_SUSPENDED; - uds_broadcast_cond(&index->load_context->cond); - - while ((index->load_context->status != INDEX_OPENING) && - (index->load_context->status != INDEX_FREEING)) { - uds_wait_cond(&index->load_context->cond, - &index->load_context->mutex); - } - - ret_val = (index->load_context->status == INDEX_FREEING); - uds_unlock_mutex(&index->load_context->mutex); - return ret_val; -} - -/**********************************************************************/ -int replay_volume(struct uds_index *index, uint64_t from_vcn) -{ - int result; - unsigned int j, k; - enum index_lookup_mode old_lookup_mode; - const struct geometry *geometry; - uint64_t old_ipm_update, new_ipm_update, vcn; - uint64_t upto_vcn = index->newest_virtual_chapter; - uds_log_info("Replaying volume from chapter %llu through chapter %llu", - (unsigned long long) from_vcn, - (unsigned long long) upto_vcn); - set_volume_index_open_chapter(index->volume_index, upto_vcn); - set_volume_index_open_chapter(index->volume_index, from_vcn); - - /* - * At least two cases to deal with here! - * - index loaded but replaying from last_checkpoint; maybe full, maybe - * not - * - index failed to load, full rebuild - * Starts empty, then dense-only, then dense-plus-sparse. - * Need to sparsify while processing individual chapters. - */ - old_lookup_mode = index->volume->lookup_mode; - index->volume->lookup_mode = LOOKUP_FOR_REBUILD; - /* - * Go through each record page of each chapter and add the records back - * to the volume index. This should not cause anything to be written - * to either the open chapter or on disk volume. Also skip the on disk - * chapter corresponding to upto, as this would have already been - * purged from the volume index when the chapter was opened. - * - * Also, go through each index page for each chapter and rebuild the - * index page map. 
- */ - geometry = index->volume->geometry; - old_ipm_update = get_last_update(index->volume->index_page_map); - for (vcn = from_vcn; vcn < upto_vcn; ++vcn) { - bool will_be_sparse_chapter; - unsigned int chapter; - if (check_for_suspend(index)) { - uds_log_info("Replay interrupted by index shutdown at chapter %llu", - (unsigned long long) vcn); - return -EBUSY; - } - - will_be_sparse_chapter = - is_chapter_sparse(geometry, from_vcn, upto_vcn, vcn); - chapter = map_to_physical_chapter(geometry, vcn); - prefetch_volume_pages(&index->volume->volume_store, - map_to_physical_page(geometry, chapter, 0), - geometry->pages_per_chapter); - set_volume_index_open_chapter(index->volume_index, vcn); - result = rebuild_index_page_map(index, vcn); - if (result != UDS_SUCCESS) { - index->volume->lookup_mode = old_lookup_mode; - return uds_log_error_strerror(result, - "could not rebuild index page map for chapter %u", - chapter); - } - - for (j = 0; j < geometry->record_pages_per_chapter; j++) { - byte *record_page; - unsigned int record_page_number = - geometry->index_pages_per_chapter + j; - result = get_volume_page(index->volume, chapter, - record_page_number, - CACHE_PROBE_RECORD_FIRST, - &record_page, NULL); - if (result != UDS_SUCCESS) { - index->volume->lookup_mode = old_lookup_mode; - return uds_log_error_strerror(result, - "could not get page %d", - record_page_number); - } - for (k = 0; k < geometry->records_per_page; k++) { - const byte *name_bytes = - record_page + (k * BYTES_PER_RECORD); - - struct uds_chunk_name name; - memcpy(&name.name, name_bytes, - UDS_CHUNK_NAME_SIZE); - - result = replay_record(index, &name, vcn, - will_be_sparse_chapter); - if (result != UDS_SUCCESS) { - char hex_name[(2 * UDS_CHUNK_NAME_SIZE) + - 1]; - if (chunk_name_to_hex(&name, hex_name, - sizeof(hex_name)) != - UDS_SUCCESS) { - strncpy(hex_name, "", - sizeof(hex_name)); - } - index->volume->lookup_mode = - old_lookup_mode; - return uds_log_error_strerror(result, - "could not find block %s during rebuild", - hex_name); - } - } - } - } - index->volume->lookup_mode = old_lookup_mode; - - // We also need to reap the chapter being replaced by the open chapter - set_volume_index_open_chapter(index->volume_index, upto_vcn); - - new_ipm_update = get_last_update(index->volume->index_page_map); - - if (new_ipm_update != old_ipm_update) { - uds_log_info("replay changed index page map update from %llu to %llu", - (unsigned long long) old_ipm_update, - (unsigned long long) new_ipm_update); - } - - return UDS_SUCCESS; -} - -/**********************************************************************/ -void get_index_stats(struct uds_index *index, struct uds_index_stats *counters) -{ - uint64_t cw_allocated = - get_chapter_writer_memory_allocated(index->chapter_writer); - // We're accessing the volume index while not on a zone thread, but - // that's safe to do when acquiring statistics. 
- struct volume_index_stats dense_stats, sparse_stats; - get_volume_index_stats(index->volume_index, &dense_stats, - &sparse_stats); - - counters->entries_indexed = - (dense_stats.record_count + sparse_stats.record_count); - counters->memory_used = - ((uint64_t) dense_stats.memory_allocated + - (uint64_t) sparse_stats.memory_allocated + - (uint64_t) get_cache_size(index->volume) + cw_allocated); - counters->collisions = - (dense_stats.collision_count + sparse_stats.collision_count); - counters->entries_discarded = - (dense_stats.discard_count + sparse_stats.discard_count); -} - -/**********************************************************************/ -void advance_active_chapters(struct uds_index *index) -{ - index->newest_virtual_chapter++; - index->oldest_virtual_chapter += - chapters_to_expire(index->volume->geometry, - index->newest_virtual_chapter); -} - -/**********************************************************************/ -struct uds_request_queue *select_index_queue(struct uds_index *index, - struct uds_request *request, - enum request_stage next_stage) -{ - switch (next_stage) { - case STAGE_TRIAGE: - // The triage queue is only needed for multi-zone sparse - // indexes and won't be allocated by the index if not needed, - // so simply check for NULL. - if (index->triage_queue != NULL) { - return index->triage_queue; - } - // Dense index or single zone, so route it directly to the zone - // queue. - fallthrough; - - case STAGE_INDEX: - request->zone_number = - get_volume_index_zone(index->volume_index, - &request->chunk_name); - fallthrough; - - case STAGE_MESSAGE: - return index->zone_queues[request->zone_number]; - - default: - ASSERT_LOG_ONLY(false, "invalid index stage: %d", next_stage); - } - - return NULL; -} diff --git a/uds/index.h b/uds/index.h deleted file mode 100644 index ad18a304..00000000 --- a/uds/index.h +++ /dev/null @@ -1,251 +0,0 @@ -/* - * Copyright Red Hat - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/uds-releases/krusty/src/uds/index.h#29 $ - */ - -#ifndef INDEX_H -#define INDEX_H - -#include "chapterWriter.h" -#include "indexLayout.h" -#include "indexSession.h" -#include "indexZone.h" -#include "loadType.h" -#include "volumeIndexOps.h" -#include "request.h" -#include "volume.h" - - -/** - * Index checkpoint state private to indexCheckpoint.c. - **/ -struct index_checkpoint; - -/** - * Callback after a query, update or remove request completes and fills in - * select fields in the request: status for all requests, oldMetadata and - * hashExists for query and update requests. 
- * - * @param request request object - **/ -typedef void (*index_callback_t)(struct uds_request *request); - -struct uds_index { - bool has_saved_open_chapter; - bool need_to_save; - enum load_type loaded_type; - struct index_load_context *load_context; - struct index_layout *layout; - struct index_state *state; - struct volume_index *volume_index; - struct volume *volume; - unsigned int zone_count; - struct index_zone **zones; - - /* - * ATTENTION!!! - * The meaning of the next two fields has changed. - * - * They now represent the oldest and newest chapters only at load time, - * and when the index is quiescent. At other times, they may lag - * individual zones' views of the index depending upon the progress - * made by the chapter writer. - */ - uint64_t oldest_virtual_chapter; - uint64_t newest_virtual_chapter; - - uint64_t last_checkpoint; - uint64_t prev_checkpoint; - struct chapter_writer *chapter_writer; - - // checkpoint state used by indexCheckpoint.c - struct index_checkpoint *checkpoint; - - index_callback_t callback; - struct uds_request_queue *triage_queue; - struct uds_request_queue *zone_queues[]; -}; - -/** - * Construct a new index from the given configuration. - * - * @param layout The index layout - * @param config The configuration to use - * @param user_params The index session parameters. If NULL, the default - * session parameters will be used. - * @param load_type How to create the index: it can be create only, allow - * loading from files, and allow rebuilding from the - * volume - * @param load_context The load context to use - * @param callback the function to invoke when a request completes - * @param new_index A pointer to hold a pointer to the new index - * - * @return UDS_SUCCESS or an error code - **/ -int __must_check make_index(struct index_layout *layout, - const struct configuration *config, - const struct uds_parameters *user_params, - enum load_type load_type, - struct index_load_context *load_context, - index_callback_t callback, - struct uds_index **new_index); - -/** - * Construct a new index from the given configuration. - * - * @param layout The index layout to use - * @param config The configuration to use - * @param user_params The index session parameters. If NULL, the default - * session parameters will be used. - * @param zone_count The number of zones for this index to use - * @param new_index A pointer to hold a pointer to the new index - * - * @return UDS_SUCCESS or an error code - **/ -int __must_check allocate_index(struct index_layout *layout, - const struct configuration *config, - const struct uds_parameters *user_params, - unsigned int zone_count, - struct uds_index **new_index); - -/** - * Save an index. The caller must ensure that there are no index requests in - * progress. - * - * @param index The index to save - * - * @return UDS_SUCCESS if successful - **/ -int __must_check save_index(struct uds_index *index); - -/** - * Clean up the index and its memory. - * - * @param index The index to destroy. - **/ -void free_index(struct uds_index *index); - -/** - * Replace the existing index backing store with a different one. - * - * @param index The index - * @param path The path to the new backing store - * - * @return UDS_SUCCESS or an error code - **/ -int __must_check replace_index_storage(struct uds_index *index, - const char *path); - -/** - * Perform the index operation specified by the type field of a UDS request. - * - * For UDS API requests, this searches the index for the chunk name in the - * request. 
If the chunk name is already present in the index, the location - * field of the request will be set to the uds_index_region where it was - * found. If the action is not DELETE, the old_metadata field of the request - * will also be filled in with the prior metadata for the name. - * - * If the API request type is: - * - * UDS_POST, a record will be added to the open chapter with the metadata - * in the request for new records, and the existing metadata for existing - * records. - * - * UDS_UPDATE, a record will be added to the open chapter with the metadata - * in the request. - * - * UDS_QUERY, if the update flag is set in the request, any record found - * will be moved to the open chapter. In all other cases the contents of - * the index will remain unchanged. - * - * UDS_DELETE, any entry with the name will removed from the index. - * - * @param index The index - * @param request The originating request - * - * @return UDS_SUCCESS, UDS_QUEUED, or an error code - **/ -int __must_check dispatch_index_request(struct uds_index *index, - struct uds_request *request); - -/** - * Internal helper to prepare the index for saving. - * - * @param index the index - * @param checkpoint whether the save is a checkpoint - * @param open_chapter_number the virtual chapter number of the open chapter - **/ -void begin_save(struct uds_index *index, - bool checkpoint, - uint64_t open_chapter_number); - -/** - * Replay the volume file to repopulate the volume index. - * - * @param index The index - * @param from_vcn The virtual chapter to start replaying - * - * @return UDS_SUCCESS if successful - **/ -int __must_check replay_volume(struct uds_index *index, uint64_t from_vcn); - -/** - * Gather statistics from the volume index, volume, and cache. - * - * @param index The index - * @param counters the statistic counters for the index - **/ -void get_index_stats(struct uds_index *index, - struct uds_index_stats *counters); - -/** - * Advance the newest virtual chapter. If this will overwrite the oldest - * virtual chapter, advance that also. - * - * @param index The index to advance - **/ -void advance_active_chapters(struct uds_index *index); - -/** - * Select and return the request queue responsible for executing the next - * index stage of a request, updating the request with any associated state - * (such as the zone number). - * - * @param index The index. - * @param request The request destined for the queue. - * @param next_stage The next request stage. - * - * @return the next index stage queue (the triage queue or the zone queue) - **/ -struct uds_request_queue *select_index_queue(struct uds_index *index, - struct uds_request *request, - enum request_stage next_stage); - -/** - * Wait for the index to finish all operations that access a local storage - * device. - * - * @param index The index - **/ -static INLINE void wait_for_idle_index(struct uds_index *index) -{ - wait_for_idle_chapter_writer(index->chapter_writer); -} - -#endif /* INDEX_H */ diff --git a/uds/indexCheckpoint.c b/uds/indexCheckpoint.c deleted file mode 100644 index 57077a8b..00000000 --- a/uds/indexCheckpoint.c +++ /dev/null @@ -1,397 +0,0 @@ -/* - * Copyright Red Hat - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. 
- * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/uds-releases/krusty/src/uds/indexCheckpoint.c#20 $ - */ - -#include "indexCheckpoint.h" - -#include "errors.h" -#include "logger.h" -#include "memoryAlloc.h" -#include "permassert.h" -#include "uds-threads.h" -#include "typeDefs.h" - -/** - * index checkpointState values - * - * @note The order of these values is significant, - * see indexState.c doIndexStateCheckpointInZone(). - **/ -enum checkpoint_state { - NOT_CHECKPOINTING, - CHECKPOINT_IN_PROGRESS, - CHECKPOINT_ABORTING -}; - -/** - * Private structure which tracks checkpointing. - **/ -struct index_checkpoint { - struct mutex mutex; // covers this group of fields - uint64_t chapter; // vcn of the starting chapter - enum checkpoint_state state; // is checkpoint in progress or aborting - unsigned int zones_busy; // count of zones not yet done - unsigned int frequency; // number of chapters between checkpoints - uint64_t checkpoints; // number of checkpoints this session -}; - -/** - * Enum return value of index checkpoint trigger function. - **/ -enum index_checkpoint_trigger_value { - ICTV_IDLE, //< no checkpointing right now - ICTV_START, //< start a new checkpoint now - ICTV_CONTINUE, //< continue checkpointing if needed - ICTV_FINISH, //< finish checkpointing, next time will start new cycle - ICTV_ABORT //< immediately abort checkpointing -}; - -typedef int checkpoint_function_t(struct uds_index *index, unsigned int zone); - -// These functions are called while holding the checkpoint->mutex but are -// expected to release it. 
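// The checkpoint_funcs table below is indexed by the trigger value computed
// in get_checkpoint_action(): ICTV_IDLE selects the NULL entry (nothing to
// do), and each other trigger value selects the matching do_checkpoint_*
// function. The trigger itself is modular arithmetic on the chapter number:
// with a hypothetical frequency of 4 and no abort in between, chapters 8, 9,
// 10 and 11 would yield ICTV_START, ICTV_CONTINUE, ICTV_CONTINUE and
// ICTV_FINISH in turn.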
-// -static checkpoint_function_t do_checkpoint_start; -static checkpoint_function_t do_checkpoint_process; -static checkpoint_function_t do_checkpoint_finish; -static checkpoint_function_t do_checkpoint_abort; - -checkpoint_function_t *const checkpoint_funcs[] = { - NULL, - do_checkpoint_start, - do_checkpoint_process, - do_checkpoint_finish, - do_checkpoint_abort -}; - -/**********************************************************************/ -int make_index_checkpoint(struct uds_index *index) -{ - struct index_checkpoint *checkpoint; - int result = UDS_ALLOCATE(1, - struct index_checkpoint, - "struct index_checkpoint", - &checkpoint); - if (result != UDS_SUCCESS) { - return result; - } - - result = uds_init_mutex(&checkpoint->mutex); - if (result != UDS_SUCCESS) { - UDS_FREE(checkpoint); - return result; - } - - checkpoint->checkpoints = 0; - - index->checkpoint = checkpoint; - return UDS_SUCCESS; -} - -/**********************************************************************/ -void free_index_checkpoint(struct index_checkpoint *checkpoint) -{ - if (checkpoint != NULL) { - uds_destroy_mutex(&checkpoint->mutex); - UDS_FREE(checkpoint); - } -} - -/**********************************************************************/ -unsigned int -get_index_checkpoint_frequency(struct index_checkpoint *checkpoint) -{ - unsigned int frequency; - uds_lock_mutex(&checkpoint->mutex); - frequency = checkpoint->frequency; - uds_unlock_mutex(&checkpoint->mutex); - return frequency; -} - -/**********************************************************************/ -unsigned int -set_index_checkpoint_frequency(struct index_checkpoint *checkpoint, - unsigned int frequency) -{ - unsigned int old_frequency; - uds_lock_mutex(&checkpoint->mutex); - old_frequency = checkpoint->frequency; - checkpoint->frequency = frequency; - uds_unlock_mutex(&checkpoint->mutex); - return old_frequency; -} - -/**********************************************************************/ -uint64_t get_checkpoint_count(struct index_checkpoint *checkpoint) -{ - return checkpoint->checkpoints; -} - -/**********************************************************************/ -static enum index_checkpoint_trigger_value -get_checkpoint_action(struct index_checkpoint *checkpoint, - uint64_t virtual_chapter) -{ - unsigned int value; - if (checkpoint->frequency == 0) { - return ICTV_IDLE; - } - value = virtual_chapter % checkpoint->frequency; - if (checkpoint->state == CHECKPOINT_ABORTING) { - return ICTV_ABORT; - } else if (checkpoint->state == CHECKPOINT_IN_PROGRESS) { - if (value == checkpoint->frequency - 1) { - return ICTV_FINISH; - } else { - return ICTV_CONTINUE; - } - } else { - if (value == 0) { - return ICTV_START; - } else { - return ICTV_IDLE; - } - } -} - -/**********************************************************************/ -int process_checkpointing(struct uds_index *index, - unsigned int zone, - uint64_t new_virtual_chapter) -{ - struct index_checkpoint *checkpoint = index->checkpoint; - checkpoint_function_t *func; - enum index_checkpoint_trigger_value ictv; - uds_lock_mutex(&checkpoint->mutex); - - ictv = get_checkpoint_action(checkpoint, new_virtual_chapter); - - if (ictv == ICTV_START) { - checkpoint->chapter = new_virtual_chapter; - } - - func = checkpoint_funcs[ictv]; - if (func == NULL) { - // nothing to do in idle state - uds_unlock_mutex(&checkpoint->mutex); - return UDS_SUCCESS; - } - - return (*func)(index, zone); -} - -/**********************************************************************/ -int 
process_chapter_writer_checkpoint_saves(struct uds_index *index) -{ - struct index_checkpoint *checkpoint = index->checkpoint; - - int result = UDS_SUCCESS; - - uds_lock_mutex(&checkpoint->mutex); - if (checkpoint->state == CHECKPOINT_IN_PROGRESS) { - result = perform_index_state_checkpoint_chapter_synchronized_saves(index->state); - - if (result != UDS_SUCCESS) { - checkpoint->state = CHECKPOINT_ABORTING; - uds_log_info("checkpoint failed"); - index->last_checkpoint = index->prev_checkpoint; - } - } - - uds_unlock_mutex(&checkpoint->mutex); - return result; -} - -/** - * Helper function used to abort checkpoint if an error has occurred. - * - * @param index the index - * @param result the error result - * - * @return result - **/ -static int abort_checkpointing(struct uds_index *index, int result) -{ - if (index->checkpoint->state != NOT_CHECKPOINTING) { - index->checkpoint->state = CHECKPOINT_ABORTING; - uds_log_info("checkpoint failed"); - index->last_checkpoint = index->prev_checkpoint; - } - return result; -} - -/**********************************************************************/ -int finish_checkpointing(struct uds_index *index) -{ - unsigned int z; - struct index_checkpoint *checkpoint = index->checkpoint; - - int result = process_chapter_writer_checkpoint_saves(index); - if (result != UDS_SUCCESS) { - return result; - } - - uds_lock_mutex(&checkpoint->mutex); - - for (z = 0; z < index->zone_count; ++z) { - if (checkpoint->state != CHECKPOINT_IN_PROGRESS) { - break; - } - result = do_checkpoint_finish(index, z); - // reacquire mutex released by do_checkpoint_finish - uds_lock_mutex(&checkpoint->mutex); - if (result != UDS_SUCCESS) { - break; - } - } - - if ((result == UDS_SUCCESS) && - (checkpoint->state == CHECKPOINT_IN_PROGRESS)) { - result = finish_index_state_checkpoint(index->state); - if (result == UDS_SUCCESS) { - checkpoint->state = NOT_CHECKPOINTING; - } - } - - uds_unlock_mutex(&checkpoint->mutex); - return result; -} - -/** - * Starts an incremental checkpoint. - * - * Called by the first zone to finish a chapter which starts a checkpoint. 
- * - * @param index the index - * @param zone the zone number - * - * @return UDS_SUCCESS or an error code - **/ -static int do_checkpoint_start(struct uds_index *index, unsigned int zone) -{ - int result; - struct index_checkpoint *checkpoint = index->checkpoint; - begin_save(index, true, checkpoint->chapter); - result = start_index_state_checkpoint(index->state); - if (result != UDS_SUCCESS) { - uds_log_error_strerror(result, - "cannot start index checkpoint"); - index->last_checkpoint = index->prev_checkpoint; - uds_unlock_mutex(&checkpoint->mutex); - return result; - } - - checkpoint->state = CHECKPOINT_IN_PROGRESS; - checkpoint->zones_busy = index->zone_count; - - return do_checkpoint_process(index, zone); -} - -/**********************************************************************/ -static int do_checkpoint_process(struct uds_index *index, unsigned int zone) -{ - struct index_checkpoint *checkpoint = index->checkpoint; - enum completion_status status = CS_NOT_COMPLETED; - int result; - uds_unlock_mutex(&checkpoint->mutex); - result = perform_index_state_checkpoint_in_zone(index->state, zone, - &status); - if (result != UDS_SUCCESS) { - uds_lock_mutex(&checkpoint->mutex); - uds_log_error_strerror(result, - "cannot continue index checkpoint"); - result = abort_checkpointing(index, result); - uds_unlock_mutex(&checkpoint->mutex); - } else if (status == CS_JUST_COMPLETED) { - uds_lock_mutex(&checkpoint->mutex); - if (--checkpoint->zones_busy == 0) { - checkpoint->checkpoints += 1; - uds_log_info("finished checkpoint"); - result = finish_index_state_checkpoint(index->state); - if (result != UDS_SUCCESS) { - uds_log_error_strerror(result, - "%s checkpoint finish failed", - __func__); - } - checkpoint->state = NOT_CHECKPOINTING; - } - uds_unlock_mutex(&checkpoint->mutex); - } - return result; -} - -/**********************************************************************/ -static int do_checkpoint_abort(struct uds_index *index, unsigned int zone) -{ - struct index_checkpoint *checkpoint = index->checkpoint; - enum completion_status status = CS_NOT_COMPLETED; - int result = abort_index_state_checkpoint_in_zone(index->state, zone, - &status); - if (result != UDS_SUCCESS) { - uds_log_error_strerror(result, - "cannot abort index checkpoint"); - } else if (status == CS_JUST_COMPLETED) { - if (--checkpoint->zones_busy == 0) { - uds_log_info("aborted checkpoint"); - result = abort_index_state_checkpoint(index->state); - if (result != UDS_SUCCESS) { - uds_log_error_strerror(result, - "checkpoint abort failed"); - } - checkpoint->state = NOT_CHECKPOINTING; - } - } - uds_unlock_mutex(&checkpoint->mutex); - - return result; -} - -/**********************************************************************/ -static int do_checkpoint_finish(struct uds_index *index, unsigned int zone) -{ - struct index_checkpoint *checkpoint = index->checkpoint; - enum completion_status status = CS_NOT_COMPLETED; - int result; - uds_unlock_mutex(&checkpoint->mutex); - result = finish_index_state_checkpoint_in_zone(index->state, zone, - &status); - if (result != UDS_SUCCESS) { - uds_log_error_strerror(result, - "cannot finish index checkpoint"); - uds_lock_mutex(&checkpoint->mutex); - result = abort_checkpointing(index, result); - uds_unlock_mutex(&checkpoint->mutex); - } else if (status == CS_JUST_COMPLETED) { - uds_lock_mutex(&checkpoint->mutex); - if (--checkpoint->zones_busy == 0) { - checkpoint->checkpoints += 1; - uds_log_info("finished checkpoint"); - result = finish_index_state_checkpoint(index->state); - if (result != 
UDS_SUCCESS) { - uds_log_error_strerror(result, - "%s checkpoint finish failed", - __func__); - } - checkpoint->state = NOT_CHECKPOINTING; - } - uds_unlock_mutex(&checkpoint->mutex); - } - return result; -} diff --git a/uds/indexCheckpoint.h b/uds/indexCheckpoint.h deleted file mode 100644 index a98c68ea..00000000 --- a/uds/indexCheckpoint.h +++ /dev/null @@ -1,121 +0,0 @@ -/* - * Copyright Red Hat - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/uds-releases/krusty/src/uds/indexCheckpoint.h#6 $ - */ - -#ifndef INDEX_CHECKPOINT_H -#define INDEX_CHECKPOINT_H - -#include "index.h" - -/** - * Construct and initialize the checkpoint sub-structure of an index. - * - * @param index the index receive the new checkpoint structure. - * - * @return UDS_SUCCESS or an error code - **/ -int __must_check make_index_checkpoint(struct uds_index *index); - -/** - * Free the checkpoint sub-structure of an index. - * - * @param checkpoint the structure to free - **/ -void free_index_checkpoint(struct index_checkpoint *checkpoint); - -/** - * Get the current checkpointing frequency of an index. - * - * @param checkpoint the checkpoint state of the index - * - * @return the number of chapters between checkpoints - **/ -unsigned int __must_check -get_index_checkpoint_frequency(struct index_checkpoint *checkpoint); - -/** - * Set checkpointing frequency for the index. - * - * @param checkpoint the checkpoint state of the index - * @param frequency The new checkpointing frequency - * - * @return the old checkpointing frequency - **/ -unsigned int -set_index_checkpoint_frequency(struct index_checkpoint *checkpoint, - unsigned int frequency); - -/** - * Gets the number of checkpoints completed during the lifetime of this index - * - * @param checkpoint the checkpoint state of the index - * - * @return the number of checkpoints completed - **/ -uint64_t __must_check -get_checkpoint_count(struct index_checkpoint *checkpoint); - -/** - * If incremental checkpointing is in progress, finish it. - * - * @param index The index - * - * @return UDS_SUCCESS or an error code - * - * @note This function is called automatically during normal operation; - * its presence here is for tests that expect checkpointing to - * have completed at some point in their logic. It is not an - * error to call this function if checkpointing is not in - * progress, it silently returns success. - **/ -int __must_check finish_checkpointing(struct uds_index *index); - -/** - * Process one zone's incremental checkpoint operation. Automatically - * starts, processes, and finishes a checkpoint over multiple invocations - * as successive chapters are closed and written. - * - * Uses its own mutex to serialize the starting and finishing or aborting, - * but allows parallel execution of the incremental progress. 
- * - * @param index The index to checkpoint - * @param zone The current zone number - * @param new_virtual_chapter The number of the chapter which the calling - * zone has just opened - * - * @return UDS_SUCCESS or an error code. - **/ -int __must_check process_checkpointing(struct uds_index *index, - unsigned int zone, - uint64_t new_virtual_chapter); - -/** - * Process saves done outside any zone by the chapter writer. - * - * Grabs the mutex associated with process_checkpointing(). - * - * @param index The index to process. - * - * @return UDS_SUCCESS or an error code. - **/ -int __must_check -process_chapter_writer_checkpoint_saves(struct uds_index *index); - -#endif // INDEX_CHECKPOINT_H diff --git a/uds/indexComponent.c b/uds/indexComponent.c deleted file mode 100644 index 83810f21..00000000 --- a/uds/indexComponent.c +++ /dev/null @@ -1,782 +0,0 @@ -/* - * Copyright Red Hat - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/uds-releases/krusty/src/uds/indexComponent.c#24 $ - */ - -#include "indexComponent.h" - -#include "compiler.h" -#include "errors.h" -#include "indexLayout.h" -#include "indexState.h" -#include "logger.h" -#include "memoryAlloc.h" -#include "permassert.h" -#include "typeDefs.h" - -/**********************************************************************/ -int make_index_component(struct index_state *state, - const struct index_component_info *info, - unsigned int zone_count, - void *data, - void *context, - struct index_component **component_ptr) -{ - struct index_component *component = NULL; - int result; - if ((info == NULL) || (info->name == NULL)) { - return uds_log_error_strerror(UDS_INVALID_ARGUMENT, - "invalid component or directory specified"); - } - if (info->loader == NULL) { - return uds_log_error_strerror(UDS_INVALID_ARGUMENT, - "no .loader function specified for component %s", - info->name); - } - if ((info->saver == NULL) && (info->incremental == NULL)) { - return uds_log_error_strerror(UDS_INVALID_ARGUMENT, - "neither .saver function nor .incremental function specified for component %s", - info->name); - } - - result = UDS_ALLOCATE(1, struct index_component, "index component", - &component); - if (result != UDS_SUCCESS) { - return result; - } - - component->component_data = data; - component->context = context; - component->info = info; - component->num_zones = info->multi_zone ? 
zone_count : 1; - component->state = state; - component->write_zones = NULL; - *component_ptr = component; - return UDS_SUCCESS; -} - -/**********************************************************************/ -static void free_write_zones(struct index_component *component) -{ - if (component->write_zones != NULL) { - unsigned int z; - for (z = 0; z < component->num_zones; ++z) { - struct write_zone *wz = component->write_zones[z]; - if (wz == NULL) { - continue; - } - free_buffered_writer(wz->writer); - UDS_FREE(wz); - } - UDS_FREE(component->write_zones); - component->write_zones = NULL; - } -} - -/**********************************************************************/ -void free_index_component(struct index_component *component) -{ - if (component == NULL) { - return; - } - - free_write_zones(component); - UDS_FREE(component); -} - -/** - * Destroy, deallocate, and expunge a read portal. - * - * @param read_portal the readzone array - **/ -static void free_read_portal(struct read_portal *read_portal) -{ - unsigned int z; - if (read_portal == NULL) { - return; - } - for (z = 0; z < read_portal->zones; ++z) { - if (read_portal->readers[z] != NULL) { - free_buffered_reader(read_portal->readers[z]); - } - } - UDS_FREE(read_portal->readers); - UDS_FREE(read_portal); -} - -/**********************************************************************/ -int get_buffered_reader_for_portal(struct read_portal *portal, - unsigned int part, - struct buffered_reader **reader_ptr) -{ - struct index_component *component; - if (part >= portal->zones) { - return uds_log_error_strerror(UDS_INVALID_ARGUMENT, - "%s: cannot access zone %u of %u", - __func__, - part, - portal->zones); - } - component = portal->component; - if (component->info->io_storage && (portal->readers[part] == NULL)) { - int result = open_state_buffered_reader(component->state, - component->info->kind, - part, - &portal->readers[part]); - if (result != UDS_SUCCESS) { - return uds_log_error_strerror(result, - "%s: cannot make buffered reader for zone %u", - __func__, - part); - } - } - *reader_ptr = portal->readers[part]; - return UDS_SUCCESS; -} - -/**********************************************************************/ -int read_index_component(struct index_component *component) -{ - struct read_portal *portal; - int read_zones, result; - result = UDS_ALLOCATE(1, struct read_portal, - "index component read portal", &portal); - if (result != UDS_SUCCESS) { - return result; - } - read_zones = component->state->load_zones; - result = UDS_ALLOCATE(read_zones, - struct buffered_reader *, - "read zone buffered readers", - &portal->readers); - if (result != UDS_SUCCESS) { - UDS_FREE(portal); - return result; - } - - portal->component = component; - portal->zones = read_zones; - result = (*component->info->loader)(portal); - free_read_portal(portal); - return result; -} - -/** - * Determine the write_zone structure for the specified component and zone. 
- * - * @param [in] component the index component - * @param [in] zone the zone number - * @param [out] write_zone_ptr the resulting write zone instance - * - * @return UDS_SUCCESS or an error code - **/ -static int resolve_write_zone(const struct index_component *component, - unsigned int zone, - struct write_zone **write_zone_ptr) -{ - int result = ASSERT(write_zone_ptr != NULL, "output parameter is null"); - if (result != UDS_SUCCESS) { - return result; - } - - if (component->write_zones == NULL) { - return uds_log_error_strerror(UDS_BAD_STATE, - "cannot resolve index component write zone: not allocated"); - } - - if (zone >= component->num_zones) { - return uds_log_error_strerror(UDS_INVALID_ARGUMENT, - "cannot resolve index component write zone: zone out of range"); - } - *write_zone_ptr = component->write_zones[zone]; - return UDS_SUCCESS; -} - -/** - * Non-incremental save function used to emulate a regular save - * using an incremental save function as a basis. - * - * @param component the index component - * @param writer the buffered writer - * @param zone the zone number - * - * @return UDS_SUCCESS or an error code - **/ -static int -index_component_saver_incremental_wrapper(struct index_component *component, - struct buffered_writer *writer, - unsigned int zone) -{ - incremental_writer_t incr_func = component->info->incremental; - bool completed = false; - - int result = (*incr_func)(component, writer, zone, IWC_START, - &completed); - if (result != UDS_SUCCESS) { - return result; - } - - if (!completed) { - result = (*incr_func)(component, writer, zone, IWC_FINISH, - &completed); - if (result != UDS_SUCCESS) { - return result; - } - } - - result = flush_buffered_writer(writer); - if (result != UDS_SUCCESS) { - return result; - } - - return UDS_SUCCESS; -} - -/** - * Specify that writing to a specific zone file has finished. - * - * If a syncer has been registered with the index component, the file - * descriptor will be enqueued upon it for fsyncing and closing. - * If not, or if the enqueue fails, the file will be fsynced and closed - * immediately. - * - * @param write_zone the index component write zone - * - * @return UDS_SUCCESS or an error code - **/ -static int done_with_zone(struct write_zone *write_zone) -{ - const struct index_component *component = write_zone->component; - if (write_zone->writer != NULL) { - int result = flush_buffered_writer(write_zone->writer); - if (result != UDS_SUCCESS) { - return uds_log_error_strerror(result, - "cannot flush buffered writer for %s component (zone %u)", - component->info->name, - write_zone->zone); - } - } - return UDS_SUCCESS; -} - -/** - * Construct the array of write_zone instances for this component. - * - * @param component the index component - * - * @return UDS_SUCCESS or an error code - * - * If this is a multizone component, each zone will be fully defined, - * otherwise zone 0 stands in for the single state file. 
- **/ -static int make_write_zones(struct index_component *component) -{ - unsigned int z; - int result; - - if (component->write_zones != NULL) { - // just reinitialize states - for (z = 0; z < component->num_zones; ++z) { - struct write_zone *wz = component->write_zones[z]; - wz->phase = IWC_IDLE; - } - return UDS_SUCCESS; - } - - result = UDS_ALLOCATE(component->num_zones, - struct write_zone *, - "index component write zones", - &component->write_zones); - if (result != UDS_SUCCESS) { - return result; - } - - for (z = 0; z < component->num_zones; ++z) { - result = UDS_ALLOCATE(1, - struct write_zone, - "plain write zone", - &component->write_zones[z]); - if (result != UDS_SUCCESS) { - free_write_zones(component); - return result; - } - *component->write_zones[z] = (struct write_zone){ - .component = component, - .phase = IWC_IDLE, - .zone = z, - }; - } - return UDS_SUCCESS; -} - -/**********************************************************************/ -static int open_buffered_writers(struct index_component *component) -{ - int result = UDS_SUCCESS; - struct write_zone **wzp; - for (wzp = component->write_zones; - wzp < component->write_zones + component->num_zones; - ++wzp) { - struct write_zone *wz = *wzp; - wz->phase = IWC_START; - - result = ASSERT(wz->writer == NULL, - "write zone writer already exists"); - if (result != UDS_SUCCESS) { - return result; - } - - if (component->info->io_storage) { - result = - open_state_buffered_writer(component->state, - component->info->kind, - wz->zone, - &wz->writer); - if (result != UDS_SUCCESS) { - return result; - } - } - } - return UDS_SUCCESS; -} - -/**********************************************************************/ -static int start_index_component_save(struct index_component *component) -{ - int result = make_write_zones(component); - if (result != UDS_SUCCESS) { - return result; - } - - result = open_buffered_writers(component); - if (result != UDS_SUCCESS) { - return result; - } - - return UDS_SUCCESS; -} - -/**********************************************************************/ -int start_index_component_incremental_save(struct index_component *component) -{ - return start_index_component_save(component); -} - -/**********************************************************************/ -int write_index_component(struct index_component *component) -{ - int result; - unsigned int z; - saver_t saver = component->info->saver; - if ((saver == NULL) && (component->info->incremental != NULL)) { - saver = index_component_saver_incremental_wrapper; - } - - result = start_index_component_save(component); - if (result != UDS_SUCCESS) { - return result; - } - - for (z = 0; z < component->num_zones; ++z) { - struct write_zone *write_zone = component->write_zones[z]; - - result = (*saver)(component, write_zone->writer, z); - if (result != UDS_SUCCESS) { - break; - } - - result = done_with_zone(write_zone); - if (result != UDS_SUCCESS) { - break; - } - - free_buffered_writer(write_zone->writer); - write_zone->writer = NULL; - } - - if (result != UDS_SUCCESS) { - free_write_zones(component); - return uds_log_error_strerror(result, - "index component write failed"); - } - - return UDS_SUCCESS; -} - -/** - * Close a specific buffered writer in a component write zone. 
- * - * @param write_zone the write zone - * - * @return UDS_SUCCESS or an error code - * - * @note closing a buffered writer causes its file descriptor to be - * passed to done_with_zone - **/ -static int close_buffered_writer(struct write_zone *write_zone) -{ - int result; - - if (write_zone->writer == NULL) { - return UDS_SUCCESS; - } - - result = done_with_zone(write_zone); - free_buffered_writer(write_zone->writer); - write_zone->writer = NULL; - - return result; -} - -/** - * Faux incremental saver function for index components which only define - * a simple saver. Conforms to incremental_writer_t signature. - * - * @param [in] component the index component - * @param [in] writer the buffered writer that does the output - * @param [in] zone the zone number - * @param [in] command the incremental writer command - * @param [out] completed if non-NULL, set to whether the save is complete - * - * @return UDS_SUCCESS or an error code - * - * @note This wrapper always calls the non-incremental saver when - * the IWC_START command is issued, and always reports that - * the save is complete unless the saver failed. - **/ -static int wrap_saver_as_incremental(struct index_component *component, - struct buffered_writer *writer, - unsigned int zone, - enum incremental_writer_command command, - bool *completed) -{ - int result = UDS_SUCCESS; - - if ((command >= IWC_START) && (command <= IWC_FINISH)) { - result = (*component->info->saver)(component, writer, zone); - if ((result == UDS_SUCCESS) && (writer != NULL)) { - note_buffered_writer_used(writer); - } - } - if ((result == UDS_SUCCESS) && (completed != NULL)) { - *completed = true; - } - return result; -} - -/** - * Return the appropriate incremental writer function depending on - * the component's type and whether this is the first zone. - * - * @param component the index component - * - * @return the correct incremental_writer_t function to use, or - * NULL signifying no progress can be made at this time. 
- **/ -static incremental_writer_t -get_incremental_writer(struct index_component *component) -{ - incremental_writer_t incr_func = component->info->incremental; - - if (incr_func == NULL) { - incr_func = &wrap_saver_as_incremental; - } - - return incr_func; -} - -/**********************************************************************/ -int perform_index_component_zone_save(struct index_component *component, - unsigned int zone, - enum completion_status *completed) -{ - enum completion_status comp = CS_NOT_COMPLETED; - - struct write_zone *wz = NULL; - int result = resolve_write_zone(component, zone, &wz); - if (result != UDS_SUCCESS) { - return result; - } - - if (wz->phase == IWC_IDLE) { - comp = CS_COMPLETED_PREVIOUSLY; - } else if (wz->phase == IWC_DONE) { - comp = CS_JUST_COMPLETED; - wz->phase = IWC_IDLE; - } else if (!component->info->chapter_sync) { - bool done = false; - incremental_writer_t incr_func = - get_incremental_writer(component); - int result = (*incr_func)(component, wz->writer, zone, - wz->phase, &done); - if (result != UDS_SUCCESS) { - if (wz->phase == IWC_ABORT) { - wz->phase = IWC_IDLE; - } else { - wz->phase = IWC_ABORT; - } - return result; - } - if (done) { - comp = CS_JUST_COMPLETED; - wz->phase = IWC_IDLE; - } else if (wz->phase == IWC_START) { - wz->phase = IWC_CONTINUE; - } - } - - if (completed != NULL) { - *completed = comp; - } - return UDS_SUCCESS; -} - -/**********************************************************************/ -int -perform_index_component_chapter_writer_save(struct index_component *component) -{ - struct write_zone *wz = NULL; - int result = resolve_write_zone(component, 0, &wz); - if (result != UDS_SUCCESS) { - return result; - } - - if ((wz->phase != IWC_IDLE) && (wz->phase != IWC_DONE)) { - bool done = false; - incremental_writer_t incr_func = - get_incremental_writer(component); - int result = ASSERT(incr_func != NULL, "no writer function"); - if (result != UDS_SUCCESS) { - return result; - } - result = (*incr_func)(component, wz->writer, 0, wz->phase, - &done); - if (result != UDS_SUCCESS) { - if (wz->phase == IWC_ABORT) { - wz->phase = IWC_IDLE; - } else { - wz->phase = IWC_ABORT; - } - return result; - } - if (done) { - wz->phase = IWC_DONE; - } else if (wz->phase == IWC_START) { - wz->phase = IWC_CONTINUE; - } - } - return UDS_SUCCESS; -} - -/**********************************************************************/ -int finish_index_component_zone_save(struct index_component *component, - unsigned int zone, - enum completion_status *completed) -{ - struct write_zone *wz = NULL; - enum completion_status comp; - incremental_writer_t incr_func; - int result = resolve_write_zone(component, zone, &wz); - if (result != UDS_SUCCESS) { - return result; - } - - switch (wz->phase) { - case IWC_IDLE: - comp = CS_COMPLETED_PREVIOUSLY; - break; - - case IWC_DONE: - comp = CS_JUST_COMPLETED; - break; - - default: - comp = CS_NOT_COMPLETED; - } - - incr_func = get_incremental_writer(component); - if ((wz->phase >= IWC_START) && (wz->phase < IWC_ABORT)) { - bool done = false; - int result = (*incr_func)(component, wz->writer, zone, - IWC_FINISH, &done); - if (result != UDS_SUCCESS) { - wz->phase = IWC_ABORT; - return result; - } - if (!done) { - uds_log_warning("finish incremental save did not complete for %s zone %u", - component->info->name, - zone); - return UDS_CHECKPOINT_INCOMPLETE; - } - wz->phase = IWC_IDLE; - comp = CS_JUST_COMPLETED; - } - - if (completed != NULL) { - *completed = comp; - } - return UDS_SUCCESS; -} - 
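/*
 * A minimal standalone sketch of the per-zone phase progression driven above,
 * using simplified stand-ins: 'struct mock_zone', 'save_step' and 'work_steps'
 * are illustrative names only, while the enum values mirror those declared in
 * indexComponent.h. One call to save_step() corresponds to one incremental
 * save step for a single write zone; the DONE and abort paths are omitted.
 */
#include <stdio.h>

enum incremental_writer_command {
	IWC_START,
	IWC_CONTINUE,
	IWC_FINISH,
	IWC_ABORT,
	IWC_IDLE = -1,
	IWC_DONE = -2,
};

enum completion_status {
	CS_NOT_COMPLETED,
	CS_JUST_COMPLETED,
	CS_COMPLETED_PREVIOUSLY,
};

struct mock_zone {
	enum incremental_writer_command phase;
	int work_steps;	/* stand-in for the remaining incremental work */
};

/* Advance one zone by one save step and report its completion status. */
static enum completion_status save_step(struct mock_zone *zone)
{
	if (zone->phase == IWC_IDLE)
		return CS_COMPLETED_PREVIOUSLY;

	if (--zone->work_steps <= 0) {
		/* The incremental writer reported completion. */
		zone->phase = IWC_IDLE;
		return CS_JUST_COMPLETED;
	}

	if (zone->phase == IWC_START)
		zone->phase = IWC_CONTINUE;
	return CS_NOT_COMPLETED;
}

int main(void)
{
	struct mock_zone zone = { .phase = IWC_START, .work_steps = 3 };
	enum completion_status status;

	do {
		status = save_step(&zone);
		printf("phase=%d status=%d\n", zone.phase, status);
	} while (status == CS_NOT_COMPLETED);

	return 0;
}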
-/**********************************************************************/ -int finish_index_component_incremental_save(struct index_component *component) -{ - unsigned int zone; - int result; - for (zone = 0; zone < component->num_zones; ++zone) { - struct write_zone *wz = component->write_zones[zone]; - incremental_writer_t incr_func = - get_incremental_writer(component); - if ((wz->phase != IWC_IDLE) && (wz->phase != IWC_DONE)) { - // Note: this is only safe if no other threads are - // currently processing this particular index - bool done = false; - int result = (*incr_func)(component, wz->writer, zone, - IWC_FINISH, &done); - if (result != UDS_SUCCESS) { - return result; - } - if (!done) { - uds_log_warning("finishing incremental save did not complete for %s zone %u", - component->info->name, - zone); - return UDS_UNEXPECTED_RESULT; - } - wz->phase = IWC_IDLE; - } - - if ((wz->writer != NULL) && - !was_buffered_writer_used(wz->writer)) { - return uds_log_error_strerror(UDS_CHECKPOINT_INCOMPLETE, - "component %s zone %u did not get written", - component->info->name, - zone); - } - - result = close_buffered_writer(wz); - if (result != UDS_SUCCESS) { - return result; - } - } - - return UDS_SUCCESS; -} - -/**********************************************************************/ -int abort_index_component_zone_save(struct index_component *component, - unsigned int zone, - enum completion_status *status) -{ - enum completion_status comp = CS_COMPLETED_PREVIOUSLY; - incremental_writer_t incr_func; - - struct write_zone *wz = NULL; - int result = resolve_write_zone(component, zone, &wz); - if (result != UDS_SUCCESS) { - return result; - } - - incr_func = get_incremental_writer(component); - if ((wz->phase != IWC_IDLE) && (wz->phase != IWC_DONE)) { - result = (*incr_func)(component, wz->writer, zone, IWC_ABORT, - NULL); - wz->phase = IWC_IDLE; - if (result != UDS_SUCCESS) { - return result; - } - comp = CS_JUST_COMPLETED; - } - - if (status != NULL) { - *status = comp; - } - return UDS_SUCCESS; -} - -/**********************************************************************/ -int abort_index_component_incremental_save(struct index_component *component) -{ - int result = UDS_SUCCESS; - unsigned int zone; - for (zone = 0; zone < component->num_zones; ++zone) { - struct write_zone *wz = component->write_zones[zone]; - incremental_writer_t incr_func = - get_incremental_writer(component); - if ((wz->phase != IWC_IDLE) && (wz->phase != IWC_DONE)) { - // Note: this is only safe if no other threads are - // currently processing this particular index - result = (*incr_func)(component, wz->writer, zone, - IWC_ABORT, NULL); - wz->phase = IWC_IDLE; - if (result != UDS_SUCCESS) { - return result; - } - } - - result = close_buffered_writer(wz); - if (result != UDS_SUCCESS) { - return result; - } - } - - return UDS_SUCCESS; -} - -/**********************************************************************/ -int discard_index_component(struct index_component *component) -{ - unsigned int num_zones = 0, save_slot = 0, old_save_slot, z; - int result; - if (!component->info->io_storage) { - return UDS_INVALID_ARGUMENT; - } - - result = find_latest_uds_index_save_slot(component->state->layout, - &num_zones, &save_slot); - if (result != UDS_SUCCESS) { - return result; - } - - old_save_slot = component->state->save_slot; - component->state->save_slot = save_slot; - - for (z = 0; z < num_zones; ++z) { - struct buffered_writer *writer; - int result = open_state_buffered_writer(component->state, - component->info->kind, - z, 
&writer); - if (result != UDS_SUCCESS) { - break; - } - result = - write_zeros_to_buffered_writer(writer, UDS_BLOCK_SIZE); - if (result != UDS_SUCCESS) { - break; - } - result = flush_buffered_writer(writer); - if (result != UDS_SUCCESS) { - break; - } - free_buffered_writer(writer); - } - - component->state->save_slot = old_save_slot; - return result; -} diff --git a/uds/indexComponent.h b/uds/indexComponent.h deleted file mode 100644 index 760ac3dc..00000000 --- a/uds/indexComponent.h +++ /dev/null @@ -1,370 +0,0 @@ -/* - * Copyright Red Hat - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/uds-releases/krusty/src/uds/indexComponent.h#14 $ - */ - -#ifndef INDEX_COMPONENT_H -#define INDEX_COMPONENT_H 1 - -#include "common.h" - -#include "bufferedReader.h" -#include "bufferedWriter.h" -#include "compiler.h" -#include "regionIdentifiers.h" - -enum completion_status { - CS_NOT_COMPLETED, // operation has not completed - CS_JUST_COMPLETED, // operation just completed - CS_COMPLETED_PREVIOUSLY // operation completed previously -}; - -struct read_portal { - struct index_component *component; - struct buffered_reader **readers; - unsigned int zones; -}; - -/** - * Prototype for functions which can load an index component from its - * saved state. - * - * @param portal A component portal which can be used to load the - * specified component. - * @return UDS_SUCCESS or an error code - **/ -typedef int (*loader_t)(struct read_portal *portal); - -/** - * Prototype for functions which can save an index component. - * - * @param component The index component. - * @param writer A buffered writer. - * @param zone The zone number. - * - * @return UDS_SUCCESS or an error code - **/ -typedef int (*saver_t)(struct index_component *component, - struct buffered_writer *writer, - unsigned int zone); - -/** - * Command code used by incremental_writer_t function protocol. - **/ -enum incremental_writer_command { - IWC_START, //< start an incremental save - IWC_CONTINUE, //< continue an incremental save - IWC_FINISH, //< force finish of incremental save - IWC_ABORT, //< abort incremental save - IWC_IDLE = -1, //< not a command, internally signifies not in progress - IWC_DONE = -2 //< not a command, internally signifies async completion -}; - -struct write_zone { - struct index_component *component; - enum incremental_writer_command phase; - struct buffered_writer *writer; - unsigned int zone; -}; - -/** - * @param [in] component The index component. - * @param [in] writer A buffered writer. - * @param [in] zone The zone number (0 for non-multi-zone). - * @param [in] command The incremental writer command. - * @param [out] completed If non-NULL, set to whether save is done. 
- * - * @return UDS_SUCCESS or an error code - **/ -typedef int (*incremental_writer_t)(struct index_component *component, - struct buffered_writer *writer, - unsigned int zone, - enum incremental_writer_command command, - bool *completed); - -/** - * The structure describing how to load or save an index component. - * At least one of saver or incremental must be specified. - **/ -struct index_component_info { - enum region_kind kind; // Region kind - const char *name; // The name of the component - // (for logging) - bool save_only; // Used for saves but not checkpoints - bool chapter_sync; // Saved by the chapter writer - bool multi_zone; // Does this component have multiple - // zones? - bool io_storage; // Do we do I/O directly to storage? - loader_t loader; // The function load this component - saver_t saver; // The function to store this - // component - incremental_writer_t incremental; // The function for incremental - // writing -}; - -/** - * The structure representing a savable (and loadable) part of an index. - **/ -struct index_component { - const struct index_component_info *info; // index_component_info - // specification - void *component_data; // The object to load or - // save - void *context; // The context used to - // load or save - struct index_state *state; // The index state - unsigned int num_zones; // Number of zones in - // write portal - struct write_zone **write_zones; // State for writing - // component -}; - -/** - * Make an index component - * - * @param state The index state in which this component instance - * shall reside. - * @param info The component info specification for this component. - * @param zone_count How many active zones are in use. - * @param data Component-specific data. - * @param context Component-specific context. - * @param component_ptr Where to store the resulting component. - * - * @return UDS_SUCCESS or an error code - **/ -int __must_check make_index_component(struct index_state *state, - const struct index_component_info *info, - unsigned int zone_count, - void *data, - void *context, - struct index_component **component_ptr); - -/** - * Destroy an index component. - * - * @param component The component to be freed - **/ -void free_index_component(struct index_component *component); - -/** - * Return the index component name for this component. - **/ -static INLINE const char * -index_component_name(struct index_component *component) -{ - return component->info->name; -} - -/** - * Return the index component data for this component. - **/ -static INLINE void *index_component_data(struct index_component *component) -{ - return component->component_data; -} - -/** - * Return the index component context for this component. - **/ -static INLINE void *index_component_context(struct index_component *component) -{ - return component->context; -} - -/** - * Determine whether this component may be skipped for a checkpoint. - * - * @param component the component, - * - * @return whether the component may be skipped - **/ -static INLINE bool -skip_index_component_on_checkpoint(struct index_component *component) -{ - return component->info->save_only; -} - -/** - * Determine whether actual saving during a checkpoint should be - * invoked by the chapter writer thread. - **/ -static INLINE bool -defer_index_component_checkpoint_to_chapter_writer(struct index_component *component) -{ - return component->info->chapter_sync; -} - -/** - * Determine whether a replay is required if component is missing. 
- * - * @param component the component - * - * @return whether the component is final (that is, contains shutdown state) - **/ -static INLINE bool -missing_index_component_requires_replay(struct index_component *component) -{ - return component->info->save_only; -} - -/** - * Read a component's state. - * - * @param component The component to read. - * - * @return UDS_SUCCESS, an error code from reading, or UDS_INVALID_ARGUMENT - * if the component is NULL. - **/ -int __must_check read_index_component(struct index_component *component); - -/** - * Write a state file. - * - * @param component The component to write - * - * @return UDS_SUCCESS, an error code from writing, or UDS_INVALID_ARGUMENT - * if the component is NULL. - **/ -int __must_check write_index_component(struct index_component *component); - -/** - * Start an incremental save for this component (all zones). - * - * @param [in] component The index component. - * - * @return UDS_SUCCESS or an error code. - **/ -int __must_check -start_index_component_incremental_save(struct index_component *component); - -/** - * Perform an incremental save for a component in a particular zone. - * - * @param [in] component The index component. - * @param [in] zone The zone number. - * @param [out] completed Pointer to hold completion status result. - * - * @return UDS_SUCCESS or an error code. - * - * @note If an incremental save is not supported, a regular - * save will be performed if this is the first call in zone 0. - **/ -int __must_check -perform_index_component_zone_save(struct index_component *component, - unsigned int zone, - enum completion_status *completed); - -/** - * Perform an incremental save for a non-multizone component synchronized - * with the chapter writer. - * - * @param component The index component. - **/ -int __must_check -perform_index_component_chapter_writer_save(struct index_component *component); - -/** - * Force the completion of an incremental save currently in progress in - * a particular zone. - * - * @param [in] component The index component. - * @param [in] zone The zone number. - * @param [out] completed Pointer to hold completion status result. - * - * @return UDS_SUCCESS or an error code. - **/ -int __must_check -finish_index_component_zone_save(struct index_component *component, - unsigned int zone, - enum completion_status *completed); - -/** - * Force the completion of an incremental save in all zones and complete - * the overal save. - * - * @param [in] component The index component. - * - * @return UDS_SUCCESS or an error code. - * - * @note If all zones call finish_index_component_zone_save first, only - * the common non-index-related completion code is required, - * which protects access to the index data structures from the - * invoking thread. - **/ -int __must_check -finish_index_component_incremental_save(struct index_component *component); - -/** - * Abort the incremental save currently in progress in a particular zone. - * - * @param [in] component The index component. - * @param [in] zone The zone number. - * @param [out] completed Pointer to hold completion status result. - * - * @return UDS_SUCCESS or an error code. - * - * @note "Completed" in this case means completed or aborted. - * Once any zone calls this function the entire save is - * useless unless every zone indicates CS_COMPLETED_PREVIOUSLY. 
- **/ -int __must_check -abort_index_component_zone_save(struct index_component *component, - unsigned int zone, - enum completion_status *completed); - -/** - * Abort an incremental save currently in progress - * - * @param [in] component The index component. - * - * @return UDS_SUCCESS or an error code. - * - * @note If all zones call abort_index_component_zone_save first, only - * the common non-index-related completion code is required, - * which protects access to the index data structures from the - * invoking thread. - **/ -int __must_check -abort_index_component_incremental_save(struct index_component *component); - -/** - * Remove or invalidate component state. - * - * @param component The component whose file is to be removed. If NULL - * no action is taken. - **/ -int __must_check discard_index_component(struct index_component *component); - -/** - * Get a buffered reader for the specified component part. - * - * @param [in] portal The component portal. - * @param [in] part The component ordinal number. - * @param [out] reader_ptr Where to put the buffered reader. - * - * @return UDS_SUCCESS or an error code. - * - * @note the reader is managed by the component portal - **/ -int __must_check -get_buffered_reader_for_portal(struct read_portal *portal, - unsigned int part, - struct buffered_reader **reader_ptr); - -#endif /* INDEX_COMPONENT_H */ diff --git a/uds/indexConfig.c b/uds/indexConfig.c deleted file mode 100644 index 154f5fb4..00000000 --- a/uds/indexConfig.c +++ /dev/null @@ -1,456 +0,0 @@ - /* - * Copyright Red Hat - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. 
- * - * $Id: //eng/uds-releases/krusty/src/uds/indexConfig.c#26 $ - */ - -#include "indexConfig.h" - -#include "buffer.h" -#include "logger.h" -#include "memoryAlloc.h" - -static const byte INDEX_CONFIG_MAGIC[] = "ALBIC"; -static const byte INDEX_CONFIG_VERSION_6_02[] = "06.02"; -static const byte INDEX_CONFIG_VERSION_8_02[] = "08.02"; - -enum { - INDEX_CONFIG_MAGIC_LENGTH = sizeof(INDEX_CONFIG_MAGIC) - 1, - INDEX_CONFIG_VERSION_LENGTH = sizeof(INDEX_CONFIG_VERSION_6_02) - 1 -}; - -/**********************************************************************/ -static int __must_check -decode_index_config_06_02(struct buffer *buffer, - struct uds_configuration *config) -{ - int result = - get_uint32_le_from_buffer(buffer, - &config->record_pages_per_chapter); - if (result != UDS_SUCCESS) { - return result; - } - result = - get_uint32_le_from_buffer(buffer, - &config->chapters_per_volume); - if (result != UDS_SUCCESS) { - return result; - } - result = - get_uint32_le_from_buffer(buffer, - &config->sparse_chapters_per_volume); - if (result != UDS_SUCCESS) { - return result; - } - result = get_uint32_le_from_buffer(buffer, &config->cache_chapters); - if (result != UDS_SUCCESS) { - return result; - } - result = get_uint32_le_from_buffer(buffer, - &config->checkpoint_frequency); - if (result != UDS_SUCCESS) { - return result; - } - result = get_uint32_le_from_buffer(buffer, - &config->volume_index_mean_delta); - if (result != UDS_SUCCESS) { - return result; - } - result = get_uint32_le_from_buffer(buffer, &config->bytes_per_page); - if (result != UDS_SUCCESS) { - return result; - } - result = - get_uint32_le_from_buffer(buffer, &config->sparse_sample_rate); - if (result != UDS_SUCCESS) { - return result; - } - result = get_uint64_le_from_buffer(buffer, &config->nonce); - if (result != UDS_SUCCESS) { - return result; - } - config->remapped_virtual = 0; - config->remapped_physical = 0; - if (ASSERT_LOG_ONLY(content_length(buffer) == 0, - "%zu bytes decoded of %zu expected", - buffer_length(buffer) - content_length(buffer), - buffer_length(buffer)) != UDS_SUCCESS) { - return UDS_CORRUPT_COMPONENT; - } - return result; -} - -/**********************************************************************/ -static int __must_check -decode_index_config_08_02(struct buffer *buffer, - struct uds_configuration *config) -{ - int result = - get_uint32_le_from_buffer(buffer, - &config->record_pages_per_chapter); - if (result != UDS_SUCCESS) { - return result; - } - result = - get_uint32_le_from_buffer(buffer, - &config->chapters_per_volume); - if (result != UDS_SUCCESS) { - return result; - } - result = - get_uint32_le_from_buffer(buffer, - &config->sparse_chapters_per_volume); - if (result != UDS_SUCCESS) { - return result; - } - result = get_uint32_le_from_buffer(buffer, &config->cache_chapters); - if (result != UDS_SUCCESS) { - return result; - } - result = get_uint32_le_from_buffer(buffer, - &config->checkpoint_frequency); - if (result != UDS_SUCCESS) { - return result; - } - result = get_uint32_le_from_buffer(buffer, - &config->volume_index_mean_delta); - if (result != UDS_SUCCESS) { - return result; - } - result = get_uint32_le_from_buffer(buffer, &config->bytes_per_page); - if (result != UDS_SUCCESS) { - return result; - } - result = - get_uint32_le_from_buffer(buffer, &config->sparse_sample_rate); - if (result != UDS_SUCCESS) { - return result; - } - result = get_uint64_le_from_buffer(buffer, &config->nonce); - if (result != UDS_SUCCESS) { - return result; - } - result = get_uint64_le_from_buffer(buffer, 
&config->remapped_virtual); - if (result != UDS_SUCCESS) { - return result; - } - result = get_uint64_le_from_buffer(buffer, &config->remapped_physical); - if (result != UDS_SUCCESS) { - return result; - } - if (ASSERT_LOG_ONLY(content_length(buffer) == 0, - "%zu bytes decoded of %zu expected", - buffer_length(buffer) - content_length(buffer), - buffer_length(buffer)) != UDS_SUCCESS) { - return UDS_CORRUPT_COMPONENT; - } - return result; -} - -/**********************************************************************/ -static int read_version(struct buffered_reader *reader, - struct uds_configuration *conf) -{ - byte version_buffer[INDEX_CONFIG_VERSION_LENGTH]; - int result = read_from_buffered_reader(reader, version_buffer, - INDEX_CONFIG_VERSION_LENGTH); - if (result != UDS_SUCCESS) { - return uds_log_error_strerror(result, - "cannot read index config version"); - } - if (memcmp(INDEX_CONFIG_VERSION_6_02, version_buffer, - INDEX_CONFIG_VERSION_LENGTH) == 0) { - struct buffer *buffer; - result = make_buffer(sizeof(struct uds_configuration_6_02), - &buffer); - if (result != UDS_SUCCESS) { - return result; - } - - result = read_from_buffered_reader(reader, - get_buffer_contents(buffer), - buffer_length(buffer)); - if (result != UDS_SUCCESS) { - free_buffer(UDS_FORGET(buffer)); - return uds_log_error_strerror(result, - "cannot read config data"); - } - - clear_buffer(buffer); - result = decode_index_config_06_02(buffer, conf); - free_buffer(UDS_FORGET(buffer)); - } else if (memcmp(INDEX_CONFIG_VERSION_8_02, version_buffer, - INDEX_CONFIG_VERSION_LENGTH) == 0) { - struct buffer *buffer; - result = make_buffer(sizeof(struct uds_configuration), &buffer); - if (result != UDS_SUCCESS) { - return result; - } - result = read_from_buffered_reader(reader, - get_buffer_contents(buffer), - buffer_length(buffer)); - if (result != UDS_SUCCESS) { - free_buffer(UDS_FORGET(buffer)); - return uds_log_error_strerror(result, - "cannot read config data"); - } - clear_buffer(buffer); - result = decode_index_config_08_02(buffer, conf); - free_buffer(UDS_FORGET(buffer)); - } else { - uds_log_error_strerror(result, - "unsupported configuration version: '%.*s'", - INDEX_CONFIG_VERSION_LENGTH, - version_buffer); - result = UDS_CORRUPT_COMPONENT; - } - return result; -} - -/**********************************************************************/ -int read_config_contents(struct buffered_reader *reader, - struct uds_configuration *config) -{ - int result = verify_buffered_data(reader, INDEX_CONFIG_MAGIC, - INDEX_CONFIG_MAGIC_LENGTH); - if (result != UDS_SUCCESS) { - return result; - } - - result = read_version(reader, config); - if (result != UDS_SUCCESS) { - uds_log_error_strerror(result, "Failed to read index config"); - } - return result; -} - -/**********************************************************************/ -static int __must_check -encode_index_config_06_02(struct buffer *buffer, - struct uds_configuration *config) -{ - int result = - put_uint32_le_into_buffer(buffer, - config->record_pages_per_chapter); - if (result != UDS_SUCCESS) { - return result; - } - result = - put_uint32_le_into_buffer(buffer, config->chapters_per_volume); - if (result != UDS_SUCCESS) { - return result; - } - result = put_uint32_le_into_buffer(buffer, - config->sparse_chapters_per_volume); - if (result != UDS_SUCCESS) { - return result; - } - result = put_uint32_le_into_buffer(buffer, config->cache_chapters); - if (result != UDS_SUCCESS) { - return result; - } - result = - put_uint32_le_into_buffer(buffer, - 
config->checkpoint_frequency); - if (result != UDS_SUCCESS) { - return result; - } - result = put_uint32_le_into_buffer(buffer, - config->volume_index_mean_delta); - if (result != UDS_SUCCESS) { - return result; - } - result = put_uint32_le_into_buffer(buffer, config->bytes_per_page); - if (result != UDS_SUCCESS) { - return result; - } - result = put_uint32_le_into_buffer(buffer, config->sparse_sample_rate); - if (result != UDS_SUCCESS) { - return result; - } - result = put_uint64_le_into_buffer(buffer, config->nonce); - if (result != UDS_SUCCESS) { - return result; - } - return ASSERT_LOG_ONLY(content_length(buffer) == - sizeof(struct uds_configuration_6_02), - "%zu bytes encoded, of %zu expected", - content_length(buffer), - sizeof(*config)); -} - -/**********************************************************************/ -static int __must_check -encode_index_config_08_02(struct buffer *buffer, - struct uds_configuration *config) -{ - int result = - put_uint32_le_into_buffer(buffer, - config->record_pages_per_chapter); - if (result != UDS_SUCCESS) { - return result; - } - result = - put_uint32_le_into_buffer(buffer, config->chapters_per_volume); - if (result != UDS_SUCCESS) { - return result; - } - result = put_uint32_le_into_buffer(buffer, - config->sparse_chapters_per_volume); - if (result != UDS_SUCCESS) { - return result; - } - result = put_uint32_le_into_buffer(buffer, config->cache_chapters); - if (result != UDS_SUCCESS) { - return result; - } - result = - put_uint32_le_into_buffer(buffer, - config->checkpoint_frequency); - if (result != UDS_SUCCESS) { - return result; - } - result = put_uint32_le_into_buffer(buffer, - config->volume_index_mean_delta); - if (result != UDS_SUCCESS) { - return result; - } - result = put_uint32_le_into_buffer(buffer, config->bytes_per_page); - if (result != UDS_SUCCESS) { - return result; - } - result = put_uint32_le_into_buffer(buffer, config->sparse_sample_rate); - if (result != UDS_SUCCESS) { - return result; - } - result = put_uint64_le_into_buffer(buffer, config->nonce); - if (result != UDS_SUCCESS) { - return result; - } - result = put_uint64_le_into_buffer(buffer, config->remapped_virtual); - if (result != UDS_SUCCESS) { - return result; - } - result = put_uint64_le_into_buffer(buffer, - config->remapped_physical); - if (result != UDS_SUCCESS) { - return result; - } - - return ASSERT_LOG_ONLY(content_length(buffer) == - sizeof(struct uds_configuration), - "%zu bytes encoded, of %zu expected", - content_length(buffer), - sizeof(*config)); -} - -/**********************************************************************/ -int write_config_contents(struct buffered_writer *writer, - struct uds_configuration *config, - uint32_t version) -{ - struct buffer *buffer; - int result = write_to_buffered_writer(writer, INDEX_CONFIG_MAGIC, - INDEX_CONFIG_MAGIC_LENGTH); - if (result != UDS_SUCCESS) { - return result; - } - /* - * If version is < 4, the index has not been reduced by a - * chapter so it must be written out as version 6.02 so that - * it is still compatible with older versions of UDS. 
- */ - if (version < 4) { - result = write_to_buffered_writer(writer, - INDEX_CONFIG_VERSION_6_02, - INDEX_CONFIG_VERSION_LENGTH); - if (result != UDS_SUCCESS) { - return result; - } - result = make_buffer(sizeof(struct uds_configuration_6_02), - &buffer); - if (result != UDS_SUCCESS) { - return result; - } - result = encode_index_config_06_02(buffer, config); - if (result != UDS_SUCCESS) { - free_buffer(UDS_FORGET(buffer)); - return result; - } - } else { - result = write_to_buffered_writer(writer, - INDEX_CONFIG_VERSION_8_02, - INDEX_CONFIG_VERSION_LENGTH); - if (result != UDS_SUCCESS) { - return result; - } - result = make_buffer(sizeof(struct uds_configuration), &buffer); - if (result != UDS_SUCCESS) { - return result; - } - result = encode_index_config_08_02(buffer, config); - if (result != UDS_SUCCESS) { - free_buffer(UDS_FORGET(buffer)); - return result; - } - } - result = write_to_buffered_writer(writer, get_buffer_contents(buffer), - content_length(buffer)); - free_buffer(UDS_FORGET(buffer)); - return result; -} - -/**********************************************************************/ -int make_configuration(const struct uds_configuration *conf, - struct configuration **config_ptr) -{ - struct configuration *config; - int result = UDS_ALLOCATE(1, struct configuration, "configuration", - &config); - if (result != UDS_SUCCESS) { - return result; - } - - result = make_geometry(conf->bytes_per_page, - conf->record_pages_per_chapter, - conf->chapters_per_volume, - conf->sparse_chapters_per_volume, - conf->remapped_virtual, - conf->remapped_physical, - &config->geometry); - if (result != UDS_SUCCESS) { - free_configuration(config); - return result; - } - - config->sparse_sample_rate = conf->sparse_sample_rate; - config->cache_chapters = conf->cache_chapters; - config->volume_index_mean_delta = conf->volume_index_mean_delta; - - *config_ptr = config; - return UDS_SUCCESS; -} - -/**********************************************************************/ -void free_configuration(struct configuration *config) -{ - if (config != NULL) { - free_geometry(config->geometry); - UDS_FREE(config); - } -} diff --git a/uds/indexConfig.h b/uds/indexConfig.h deleted file mode 100644 index 7f170cc1..00000000 --- a/uds/indexConfig.h +++ /dev/null @@ -1,52 +0,0 @@ -/* - * Copyright Red Hat - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/uds-releases/krusty/src/uds/indexConfig.h#5 $ - */ - -#ifndef INDEX_CONFIG_H -#define INDEX_CONFIG_H 1 - -#include "config.h" -#include "geometry.h" - -/** - * A set of configuration parameters for the indexer. 
- **/ -struct configuration { - /* Parameters for the volume */ - - /* The volume layout */ - struct geometry *geometry; - - /* - * Size of the page cache and sparse chapter index cache, in - * chapters - */ - unsigned int cache_chapters; - - /** Parameters for the volume index */ - - /* The mean delta for the volume index */ - unsigned int volume_index_mean_delta; - - /* Sampling rate for sparse indexing */ - unsigned int sparse_sample_rate; -}; - -#endif /* INDEX_CONFIG_H */ diff --git a/uds/indexLayout.c b/uds/indexLayout.c deleted file mode 100644 index 6e49a0bb..00000000 --- a/uds/indexLayout.c +++ /dev/null @@ -1,2723 +0,0 @@ -/* - * Copyright Red Hat - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/uds-releases/krusty/src/uds/indexLayout.c#84 $ - */ - -#include "indexLayout.h" - -#include "buffer.h" -#include "compiler.h" -#include "config.h" -#include "indexConfig.h" -#include "layoutRegion.h" -#include "logger.h" -#include "volumeIndexOps.h" -#include "memoryAlloc.h" -#include "nonce.h" -#include "openChapter.h" - -/* - * Overall layout of an index on disk: - * - * The layout is divided into a number of fixed-size regions, the sizes of - * which are computed when the index is created. Every header and region - * begins on 4K block boundary. Save regions are further sub-divided into - * regions of their own. - * - * Each region has a kind and an instance number. Some kinds only have - * one instance and therefore use RL_SOLE_INSTANCE (-1) as the - * instance number. The RL_KIND_INDEX used to use instances to - * represent sub-indices; now, however there is only ever one - * sub-index and therefore one instance. A save region can either hold - * a checkpoint or a clean shutdown (determined by the type). The - * instances determine which available save slot is used. The - * RL_KIND_VOLUME_INDEX uses instances to record which zone is being - * saved. - * - * +-+-+--------+--------+--------+-----+--- -+-+ - * | | | I N D E X 0 101, 0 | ... | | - * |H|C+--------+--------+--------+-----+--- -+S| - * |D|f| Volume | Save | Save | | |e| - * |R|g| Region | Region | Region | ... | ... |a| - * | | | 201 -1 | 202 0 | 202 1 | | |l| - * +-+-+--------+--------+--------+-----+--- -+-+ - * - * The header contains the encoded region layout table as well as the - * saved index configuration record. The sub-index region and its - * subdivisions are maintained in the same table. - * - * There are at least two save regions to preserve the old state - * should the saving of a state be incomplete. They are used in a - * round-robin fashion. - * - * Anatomy of a save region: - * - * +-+-----+------+------+-----+ -+-----+ - * |H| IPM | MI | MI | | | OC | - * |D| | zone | zone | ... 
| | | - * |R| 301 | 302 | 302 | | | 303 | - * | | -1 | 0 | 1 | | | -1 | - * +-+-----+------+------+-----+ -+-----+ - * - * Every region header has a type (and version). In save regions, - * the open chapter only appears in RL_TYPE_SAVE not RL_TYPE_CHECKPOINT, - * although the same space is reserved for both. - * - * The header contains the encoded region layout table as well as the - * index state record for that save or checkpoint. Each save or - * checkpoint has a unique generation number and nonce which is used - * to seed the checksums of those regions. - */ - -struct index_save_data { - uint64_t timestamp; // ms since epoch... - uint64_t nonce; - uint32_t version; // 1 - uint32_t unused__; -}; - -struct index_save_layout { - struct layout_region index_save; - struct layout_region header; - unsigned int num_zones; - struct layout_region index_page_map; - struct layout_region free_space; - struct layout_region *volume_index_zones; - struct layout_region *open_chapter; - enum index_save_type save_type; - struct index_save_data save_data; - struct buffer *index_state_buffer; - bool read; - bool written; -}; - -struct sub_index_layout { - struct layout_region sub_index; - uint64_t nonce; - struct layout_region volume; - struct index_save_layout *saves; -}; - -struct super_block_data { - byte magic_label[32]; - byte nonce_info[NONCE_INFO_SIZE]; - uint64_t nonce; - uint32_t version; // 2 or 3 for normal, 7 for converted - uint32_t block_size; // for verification - uint16_t num_indexes; // always 1 - uint16_t max_saves; - byte padding[4]; // pad to 64 bit boundary - uint64_t open_chapter_blocks; - uint64_t page_map_blocks; - uint64_t volume_offset; - uint64_t start_offset; -}; - -struct index_layout { - struct io_factory *factory; - off_t offset; - struct super_block_data super; - struct layout_region header; - struct layout_region config; - struct sub_index_layout index; - struct layout_region seal; - uint64_t total_blocks; - int ref_count; -}; - -/** - * Structure used to compute single file layout sizes. - * - * Note that the volume_index_blocks represent all zones and are sized for - * the maximum number of blocks that would be needed regardless of the number - * of zones (up to the maximum value) that are used at run time. - * - * Similarly, the number of saves is sized for the minimum safe value - * assuming checkpointing is enabled, since that is also a run-time parameter. - **/ -struct save_layout_sizes { - struct configuration config; // this is a captive copy - struct geometry geometry; // this is a captive copy - unsigned int num_saves; // per sub-index - size_t block_size; // in bytes - uint64_t volume_blocks; // per sub-index - uint64_t volume_index_blocks; // per save - uint64_t page_map_blocks; // per save - uint64_t open_chapter_blocks; // per save - uint64_t save_blocks; // per sub-index - uint64_t sub_index_blocks; // per sub-index - uint64_t total_blocks; // for whole layout -}; - -/* - * Version 3 is the normal version used from RHEL8.2 onwards. - * - * Versions 4 through 6 were incremental development versions and are not - * supported. - * - * Version 7 is used for volumes which have been reduced in size by one chapter - * in order to make room to prepend LVM metadata to an existing VDO without - * losing all deduplication. 
- */ -enum { - SUPER_VERSION_MINIMUM = 3, - SUPER_VERSION_CURRENT = 3, - SUPER_VERSION_MAXIMUM = 7, - INDEX_STATE_BUFFER_SIZE = 512, - MAX_SAVES = 5, -}; - -static const byte SINGLE_FILE_MAGIC_1[32] = "*ALBIREO*SINGLE*FILE*LAYOUT*001*"; -enum { SINGLE_FILE_MAGIC_1_LENGTH = sizeof(SINGLE_FILE_MAGIC_1) }; - -static int __must_check -reconstitute_single_file_layout(struct index_layout *layout, - struct region_table *table, - uint64_t first_block); -static int __must_check -write_index_save_layout(struct index_layout *layout, - struct index_save_layout *isl); - -/**********************************************************************/ -static INLINE bool is_converted_super_block(struct super_block_data *super) -{ - return (super->version == 7); -} - -/**********************************************************************/ -static INLINE uint64_t block_count(uint64_t bytes, uint32_t block_size) -{ - uint64_t blocks = bytes / block_size; - if (bytes % block_size > 0) { - ++blocks; - } - return blocks; -} - -/**********************************************************************/ -static int __must_check compute_sizes(struct save_layout_sizes *sls, - const struct uds_configuration *config, - size_t block_size) -{ - struct configuration *cfg = NULL; - int result; - - if (config->bytes_per_page % block_size != 0) { - return uds_log_error_strerror(UDS_INCORRECT_ALIGNMENT, - "page size not a multiple of block size"); - } - - result = make_configuration(config, &cfg); - if (result != UDS_SUCCESS) { - return uds_log_error_strerror(result, - "cannot compute layout size"); - } - - memset(sls, 0, sizeof(*sls)); - - // internalize the configuration and geometry... - - sls->geometry = *cfg->geometry; - sls->config = *cfg; - sls->config.geometry = &sls->geometry; - - free_configuration(cfg); - - sls->num_saves = 2; - sls->block_size = block_size; - sls->volume_blocks = sls->geometry.bytes_per_volume / block_size; - - result = compute_volume_index_save_blocks(&sls->config, block_size, - &sls->volume_index_blocks); - if (result != UDS_SUCCESS) { - return uds_log_error_strerror(result, - "cannot compute index save size"); - } - - sls->page_map_blocks = - block_count(compute_index_page_map_save_size(&sls->geometry), - block_size); - sls->open_chapter_blocks = - block_count(compute_saved_open_chapter_size(&sls->geometry), - block_size); - sls->save_blocks = - 1 + (sls->volume_index_blocks + sls->page_map_blocks + - sls->open_chapter_blocks); - sls->sub_index_blocks = - sls->volume_blocks + (sls->num_saves * sls->save_blocks); - sls->total_blocks = 3 + sls->sub_index_blocks; - - return UDS_SUCCESS; -} - -/**********************************************************************/ -int uds_compute_index_size(const struct uds_configuration *config, - uint64_t *index_size) -{ - struct save_layout_sizes sizes; - int result; - - if (index_size == NULL) { - uds_log_error("Missing output size pointer"); - return -EINVAL; - } - - result = compute_sizes(&sizes, config, UDS_BLOCK_SIZE); - if (result != UDS_SUCCESS) { - return uds_map_to_system_error(result); - } - - *index_size = sizes.total_blocks * sizes.block_size; - return UDS_SUCCESS; -} - -/**********************************************************************/ -static int __must_check -open_layout_reader(struct index_layout *layout, - struct layout_region *lr, - off_t offset, - struct buffered_reader **reader_ptr) -{ - off_t start = (lr->start_block + offset) * layout->super.block_size; - size_t size = lr->num_blocks * layout->super.block_size; - return 
open_uds_buffered_reader(layout->factory, start, size, - reader_ptr); -} - -/**********************************************************************/ -static int __must_check -open_layout_writer(struct index_layout *layout, - struct layout_region *lr, - off_t offset, - struct buffered_writer **writer_ptr) -{ - off_t start = (lr->start_block + offset) * layout->super.block_size; - size_t size = lr->num_blocks * layout->super.block_size; - return open_uds_buffered_writer(layout->factory, start, size, - writer_ptr); -} - -/**********************************************************************/ -static int __must_check -decode_index_save_data(struct buffer *buffer, - struct index_save_data *save_data) -{ - int result = get_uint64_le_from_buffer(buffer, &save_data->timestamp); - if (result != UDS_SUCCESS) { - return result; - } - result = get_uint64_le_from_buffer(buffer, &save_data->nonce); - if (result != UDS_SUCCESS) { - return result; - } - result = get_uint32_le_from_buffer(buffer, &save_data->version); - if (result != UDS_SUCCESS) { - return result; - } - result = get_uint32_le_from_buffer(buffer, &save_data->unused__); - if (result != UDS_SUCCESS) { - return result; - } - // The unused padding has to be zeroed for correct nonce calculation - if (save_data->unused__ != 0) { - return UDS_CORRUPT_COMPONENT; - } - result = ASSERT_LOG_ONLY(content_length(buffer) == 0, - "%zu bytes decoded of %zu expected", - buffer_length(buffer), - sizeof(*save_data)); - if (result != UDS_SUCCESS) { - return UDS_CORRUPT_COMPONENT; - } - return result; -} - -/**********************************************************************/ -static int __must_check -decode_region_header(struct buffer *buffer, struct region_header *header) -{ - int result = get_uint64_le_from_buffer(buffer, &header->magic); - if (result != UDS_SUCCESS) { - return result; - } - result = get_uint64_le_from_buffer(buffer, &header->region_blocks); - if (result != UDS_SUCCESS) { - return result; - } - result = get_uint16_le_from_buffer(buffer, &header->type); - if (result != UDS_SUCCESS) { - return result; - } - result = get_uint16_le_from_buffer(buffer, &header->version); - if (result != UDS_SUCCESS) { - return result; - } - result = get_uint16_le_from_buffer(buffer, &header->num_regions); - if (result != UDS_SUCCESS) { - return result; - } - result = get_uint16_le_from_buffer(buffer, &header->payload); - if (result != UDS_SUCCESS) { - return result; - } - result = ASSERT_LOG_ONLY(content_length(buffer) == 0, - "%zu bytes decoded of %zu expected", - buffer_length(buffer), - sizeof(*header)); - if (result != UDS_SUCCESS) { - return UDS_CORRUPT_COMPONENT; - } - return result; -} - -/**********************************************************************/ -static int __must_check -decode_layout_region(struct buffer *buffer, struct layout_region *region) -{ - size_t cl1 = content_length(buffer); - - int result = get_uint64_le_from_buffer(buffer, ®ion->start_block); - if (result != UDS_SUCCESS) { - return result; - } - result = get_uint64_le_from_buffer(buffer, ®ion->num_blocks); - if (result != UDS_SUCCESS) { - return result; - } - result = get_uint32_le_from_buffer(buffer, ®ion->checksum); - if (result != UDS_SUCCESS) { - return result; - } - result = get_uint16_le_from_buffer(buffer, ®ion->kind); - if (result != UDS_SUCCESS) { - return result; - } - result = get_uint16_le_from_buffer(buffer, ®ion->instance); - if (result != UDS_SUCCESS) { - return result; - } - result = ASSERT_LOG_ONLY(cl1 - content_length(buffer) == sizeof(*region), - "%zu 
bytes decoded, of %zu expected", - cl1 - content_length(buffer), - sizeof(*region)); - if (result != UDS_SUCCESS) { - return UDS_CORRUPT_COMPONENT; - } - return result; -} - -/**********************************************************************/ -static int __must_check load_region_table(struct buffered_reader *reader, - struct region_table **table_ptr) -{ - unsigned int i; - struct region_header header; - struct region_table *table; - struct buffer *buffer; - - int result = make_buffer(sizeof(struct region_header), &buffer); - if (result != UDS_SUCCESS) { - return result; - } - - result = read_from_buffered_reader(reader, - get_buffer_contents(buffer), - buffer_length(buffer)); - if (result != UDS_SUCCESS) { - free_buffer(UDS_FORGET(buffer)); - return uds_log_error_strerror(result, - "cannot read region table header"); - } - - result = reset_buffer_end(buffer, buffer_length(buffer)); - if (result != UDS_SUCCESS) { - free_buffer(UDS_FORGET(buffer)); - return result; - } - - result = decode_region_header(buffer, &header); - free_buffer(UDS_FORGET(buffer)); - if (result != UDS_SUCCESS) { - return result; - } - - if (header.magic != REGION_MAGIC) { - return UDS_NO_INDEX; - } - - if (header.version != 1) { - return uds_log_error_strerror(UDS_UNSUPPORTED_VERSION, - "unknown region table version %hu", - header.version); - } - - result = UDS_ALLOCATE_EXTENDED(struct region_table, - header.num_regions, - struct layout_region, - "single file layout region table", - &table); - if (result != UDS_SUCCESS) { - return result; - } - - table->header = header; - result = make_buffer(header.num_regions * sizeof(struct layout_region), - &buffer); - if (result != UDS_SUCCESS) { - UDS_FREE(table); - return result; - } - - result = read_from_buffered_reader(reader, - get_buffer_contents(buffer), - buffer_length(buffer)); - if (result != UDS_SUCCESS) { - UDS_FREE(table); - free_buffer(UDS_FORGET(buffer)); - return uds_log_error_strerror(UDS_CORRUPT_COMPONENT, - "cannot read region table layouts"); - } - - result = reset_buffer_end(buffer, buffer_length(buffer)); - if (result != UDS_SUCCESS) { - UDS_FREE(table); - free_buffer(UDS_FORGET(buffer)); - return result; - } - - for (i = 0; i < header.num_regions; i++) { - result = decode_layout_region(buffer, &table->regions[i]); - if (result != UDS_SUCCESS) { - UDS_FREE(table); - free_buffer(UDS_FORGET(buffer)); - return result; - } - } - - free_buffer(UDS_FORGET(buffer)); - *table_ptr = table; - return UDS_SUCCESS; -} - -/**********************************************************************/ -static int __must_check -decode_super_block_data(struct buffer *buffer, struct super_block_data *super) -{ - int result = get_bytes_from_buffer(buffer, 32, super->magic_label); - if (result != UDS_SUCCESS) { - return result; - } - result = get_bytes_from_buffer(buffer, NONCE_INFO_SIZE, - super->nonce_info); - if (result != UDS_SUCCESS) { - return result; - } - result = get_uint64_le_from_buffer(buffer, &super->nonce); - if (result != UDS_SUCCESS) { - return result; - } - result = get_uint32_le_from_buffer(buffer, &super->version); - if (result != UDS_SUCCESS) { - return result; - } - result = get_uint32_le_from_buffer(buffer, &super->block_size); - if (result != UDS_SUCCESS) { - return result; - } - result = get_uint16_le_from_buffer(buffer, &super->num_indexes); - if (result != UDS_SUCCESS) { - return result; - } - result = get_uint16_le_from_buffer(buffer, &super->max_saves); - if (result != UDS_SUCCESS) { - return result; - } - result = skip_forward(buffer, 4); // aligment 
- if (result != UDS_SUCCESS) { - return result; - } - result = get_uint64_le_from_buffer(buffer, &super->open_chapter_blocks); - if (result != UDS_SUCCESS) { - return result; - } - result = get_uint64_le_from_buffer(buffer, &super->page_map_blocks); - if (result != UDS_SUCCESS) { - return result; - } - if (is_converted_super_block(super)) { - result = get_uint64_le_from_buffer(buffer, - &super->volume_offset); - if (result != UDS_SUCCESS) { - return result; - } - result = get_uint64_le_from_buffer(buffer, - &super->start_offset); - if (result != UDS_SUCCESS) { - return result; - } - } else { - super->volume_offset = 0; - super->start_offset = 0; - } - result = ASSERT_LOG_ONLY(content_length(buffer) == 0, - "%zu bytes decoded of %zu expected", - buffer_length(buffer), - sizeof(*super)); - if (result != UDS_SUCCESS) { - return UDS_CORRUPT_COMPONENT; - } - return result; -} - -/**********************************************************************/ -static int __must_check read_super_block_data(struct buffered_reader *reader, - struct index_layout *layout, - size_t saved_size) -{ - struct super_block_data *super = &layout->super; - struct buffer *buffer; - int result; - - if (sizeof(super->magic_label) != SINGLE_FILE_MAGIC_1_LENGTH) { - return uds_log_error_strerror(UDS_CORRUPT_COMPONENT, - "super block magic label size incorrect"); - } - - result = make_buffer(saved_size, &buffer); - if (result != UDS_SUCCESS) { - return result; - } - - result = read_from_buffered_reader(reader, - get_buffer_contents(buffer), - buffer_length(buffer)); - if (result != UDS_SUCCESS) { - free_buffer(UDS_FORGET(buffer)); - return uds_log_error_strerror(result, - "cannot read region table header"); - } - - result = reset_buffer_end(buffer, buffer_length(buffer)); - if (result != UDS_SUCCESS) { - free_buffer(UDS_FORGET(buffer)); - return result; - } - - result = decode_super_block_data(buffer, super); - free_buffer(UDS_FORGET(buffer)); - if (result != UDS_SUCCESS) { - return uds_log_error_strerror(result, - "cannot read super block data"); - } - - if (memcmp(super->magic_label, - SINGLE_FILE_MAGIC_1, - SINGLE_FILE_MAGIC_1_LENGTH) != 0) { - return uds_log_error_strerror(UDS_CORRUPT_COMPONENT, - "unknown superblock magic label"); - } - - if ((super->version < SUPER_VERSION_MINIMUM) || - (super->version == 4) || - (super->version == 5) || - (super->version == 6) || - (super->version > SUPER_VERSION_MAXIMUM)) { - return uds_log_error_strerror(UDS_UNSUPPORTED_VERSION, - "unknown superblock version number %u", - super->version); - } - - if (super->volume_offset < super->start_offset) { - return uds_log_error_strerror(UDS_CORRUPT_COMPONENT, - "inconsistent offsets (start %llu, volume %llu)", - (unsigned long long) super->start_offset, - (unsigned long long) super->volume_offset); - } - - // We dropped the usage of multiple subindices before we ever ran UDS - // code in the kernel. We do not have code that will handle multiple - // subindices. 
- if (super->num_indexes != 1) { - return uds_log_error_strerror(UDS_CORRUPT_COMPONENT, - "invalid subindex count %u", - super->num_indexes); - } - - if (generate_primary_nonce(super->nonce_info, - sizeof(super->nonce_info)) != super->nonce) { - return uds_log_error_strerror(UDS_CORRUPT_COMPONENT, - "inconsistent superblock nonce"); - } - - return UDS_SUCCESS; -} - -/**********************************************************************/ -static int __must_check -allocate_single_file_parts(struct index_layout *layout) -{ - int result = UDS_ALLOCATE(layout->super.max_saves, - struct index_save_layout, - __func__, - &layout->index.saves); - if (result != UDS_SUCCESS) { - return result; - } - - return UDS_SUCCESS; -} - -/**********************************************************************/ -static int __must_check -load_super_block(struct index_layout *layout, - size_t block_size, - uint64_t first_block, - struct buffered_reader *reader) -{ - struct region_table *table = NULL; - struct super_block_data *super = &layout->super; - int result = load_region_table(reader, &table); - if (result != UDS_SUCCESS) { - return result; - } - - if (table->header.type != RH_TYPE_SUPER) { - UDS_FREE(table); - return uds_log_error_strerror(UDS_CORRUPT_COMPONENT, - "not a superblock region table"); - } - - result = read_super_block_data(reader, layout, table->header.payload); - if (result != UDS_SUCCESS) { - UDS_FREE(table); - return uds_log_error_strerror(result, - "unknown superblock format"); - } - - if (super->block_size != block_size) { - UDS_FREE(table); - return uds_log_error_strerror(UDS_CORRUPT_COMPONENT, - "superblock saved block_size %u differs from supplied block_size %zu", - super->block_size, - block_size); - } - - result = allocate_single_file_parts(layout); - if (result != UDS_SUCCESS) { - UDS_FREE(table); - return result; - } - - first_block -= (super->volume_offset - super->start_offset); - result = reconstitute_single_file_layout(layout, table, first_block); - UDS_FREE(table); - return result; -} - -/**********************************************************************/ -static int __must_check -read_index_save_data(struct buffered_reader *reader, - struct index_save_data *save_data, - size_t saved_size, - struct buffer **buffer_ptr) -{ - struct buffer *buffer = NULL; - int result = UDS_SUCCESS; - - if (saved_size == 0) { - memset(save_data, 0, sizeof(*save_data)); - } else { - if (saved_size < sizeof(struct index_save_data)) { - return uds_log_error_strerror(UDS_CORRUPT_COMPONENT, - "unexpected index save data size %zu", - saved_size); - } - - result = make_buffer(sizeof(*save_data), &buffer); - if (result != UDS_SUCCESS) { - return result; - } - - result = read_from_buffered_reader(reader, - get_buffer_contents(buffer), - buffer_length(buffer)); - if (result != UDS_SUCCESS) { - free_buffer(UDS_FORGET(buffer)); - return uds_log_error_strerror(result, - "cannot read index save data"); - } - - result = reset_buffer_end(buffer, buffer_length(buffer)); - if (result != UDS_SUCCESS) { - free_buffer(UDS_FORGET(buffer)); - return result; - } - - result = decode_index_save_data(buffer, save_data); - free_buffer(UDS_FORGET(buffer)); - if (result != UDS_SUCCESS) { - return result; - } - - saved_size -= sizeof(struct index_save_data); - - if (save_data->version > 1) { - return uds_log_error_strerror(UDS_UNSUPPORTED_VERSION, - "unknown index save version number %u", - save_data->version); - } - - if (saved_size > INDEX_STATE_BUFFER_SIZE) { - return uds_log_error_strerror(UDS_CORRUPT_COMPONENT, - 
"unexpected index state buffer size %zu", - saved_size); - } - } - - if (save_data->version != 0) { - result = make_buffer(INDEX_STATE_BUFFER_SIZE, &buffer); - if (result != UDS_SUCCESS) { - return result; - } - - if (saved_size > 0) { - result = read_from_buffered_reader(reader, - get_buffer_contents(buffer), - saved_size); - if (result != UDS_SUCCESS) { - free_buffer(UDS_FORGET(buffer)); - return result; - } - - result = reset_buffer_end(buffer, saved_size); - if (result != UDS_SUCCESS) { - free_buffer(UDS_FORGET(buffer)); - return result; - } - } - } - - *buffer_ptr = buffer; - return UDS_SUCCESS; -} - -/**********************************************************************/ - -struct region_iterator { - struct layout_region *next_region; - struct layout_region *last_region; - uint64_t next_block; - int result; -}; - -/**********************************************************************/ -__printf(2, 3) -static void iter_error(struct region_iterator *iter, const char *fmt, ...) -{ - int r; - va_list args; - va_start(args, fmt); - r = uds_vlog_strerror(UDS_LOG_ERR, UDS_UNEXPECTED_RESULT, NULL, fmt, args); - va_end(args); - if (iter->result == UDS_SUCCESS) { - iter->result = r; - } -} - -/** - * Set the next layout region in the layout according to a region table - * iterator, unless the iterator already contains an error - * - * @param expect whether to record an error or return false - * @param lr the layout region field to set - * @param iter the region iterator, which also holds the cumulative - * result - * @param num_blocks if non-zero, the expected number of blocks - * @param kind the expected kind of the region - * @param instance the expected instance number of the region - * - * @return true if we meet expectations, false if we do not - **/ -static bool expect_layout(bool expect, - struct layout_region *lr, - struct region_iterator *iter, - uint64_t num_blocks, - enum region_kind kind, - unsigned int instance) -{ - if (iter->result != UDS_SUCCESS) { - return false; - } - - if (iter->next_region == iter->last_region) { - if (expect) { - iter_error(iter, - "ran out of layout regions in region table"); - } - return false; - } - - if (iter->next_region->start_block != iter->next_block) { - iter_error(iter, "layout region not at expected offset"); - return false; - } - - if (iter->next_region->kind != kind) { - if (expect) { - iter_error(iter, "layout region has incorrect kind"); - } - return false; - } - - if (iter->next_region->instance != instance) { - iter_error(iter, "layout region has incorrect instance"); - return false; - } - - if (num_blocks > 0 && iter->next_region->num_blocks != num_blocks) { - iter_error(iter, "layout region size is incorrect"); - return false; - } - - if (lr != NULL) { - *lr = *iter->next_region; - } - - iter->next_block += iter->next_region->num_blocks; - iter->next_region++; - return true; -} - -/**********************************************************************/ -static void setup_layout(struct layout_region *lr, - uint64_t *next_addr_ptr, - uint64_t region_size, - unsigned int kind, - unsigned int instance) -{ - *lr = (struct layout_region){ - .start_block = *next_addr_ptr, - .num_blocks = region_size, - .checksum = 0, - .kind = kind, - .instance = instance, - }; - *next_addr_ptr += region_size; -} - -/**********************************************************************/ -static void populate_index_save_layout(struct index_save_layout *isl, - struct super_block_data *super, - unsigned int num_zones, - enum index_save_type save_type) -{ - 
uint64_t blocks_avail; - uint64_t next_block = isl->index_save.start_block; - - setup_layout(&isl->header, &next_block, 1, RL_KIND_HEADER, - RL_SOLE_INSTANCE); - setup_layout(&isl->index_page_map, - &next_block, - super->page_map_blocks, - RL_KIND_INDEX_PAGE_MAP, - RL_SOLE_INSTANCE); - - blocks_avail = (isl->index_save.num_blocks - - (next_block - isl->index_save.start_block) - - super->open_chapter_blocks); - - if (num_zones > 0) { - uint64_t mi_block_count = blocks_avail / num_zones; - unsigned int z; - for (z = 0; z < num_zones; ++z) { - struct layout_region *miz = &isl->volume_index_zones[z]; - setup_layout(miz, - &next_block, - mi_block_count, - RL_KIND_VOLUME_INDEX, - z); - } - } - if (save_type == IS_SAVE && isl->open_chapter != NULL) { - setup_layout(isl->open_chapter, - &next_block, - super->open_chapter_blocks, - RL_KIND_OPEN_CHAPTER, - RL_SOLE_INSTANCE); - } - setup_layout(&isl->free_space, - &next_block, - (isl->index_save.num_blocks - - (next_block - isl->index_save.start_block)), - RL_KIND_SCRATCH, - RL_SOLE_INSTANCE); -} - -/**********************************************************************/ -static int __must_check -reconstruct_index_save(struct index_save_layout *isl, - struct index_save_data *save_data, - struct super_block_data *super, - struct region_table *table) -{ - int result = UDS_SUCCESS; - unsigned int z, n = 0; - struct region_iterator iter, tmp_iter; - - isl->num_zones = 0; - isl->save_data = *save_data; - isl->read = false; - isl->written = false; - - if (table->header.type == RH_TYPE_SAVE) { - isl->save_type = IS_SAVE; - } else if (table->header.type == RH_TYPE_CHECKPOINT) { - isl->save_type = IS_CHECKPOINT; - } else { - isl->save_type = NO_SAVE; - } - - if ((table->header.num_regions == 0) || - ((table->header.num_regions == 1) && - (table->regions[0].kind == RL_KIND_SCRATCH))) { - populate_index_save_layout(isl, super, 0, NO_SAVE); - return UDS_SUCCESS; - } - - iter = (struct region_iterator) { - .next_region = table->regions, - .last_region = table->regions + table->header.num_regions, - .next_block = isl->index_save.start_block, - .result = UDS_SUCCESS, - }; - - expect_layout(true, - &isl->header, - &iter, - 1, - RL_KIND_HEADER, - RL_SOLE_INSTANCE); - expect_layout(true, - &isl->index_page_map, - &iter, - 0, - RL_KIND_INDEX_PAGE_MAP, - RL_SOLE_INSTANCE); - for (tmp_iter = iter; - expect_layout(false, NULL, &tmp_iter, 0, RL_KIND_VOLUME_INDEX, n); - ++n) - ; - isl->num_zones = n; - - if (isl->num_zones > 0) { - result = UDS_ALLOCATE(n, - struct layout_region, - "volume index layout regions", - &isl->volume_index_zones); - if (result != UDS_SUCCESS) { - return result; - } - } - - if (isl->save_type == IS_SAVE) { - result = UDS_ALLOCATE(1, - struct layout_region, - "open chapter layout region", - &isl->open_chapter); - if (result != UDS_SUCCESS) { - UDS_FREE(isl->volume_index_zones); - return result; - } - } - - for (z = 0; z < isl->num_zones; ++z) { - expect_layout(true, - &isl->volume_index_zones[z], - &iter, - 0, - RL_KIND_VOLUME_INDEX, - z); - } - if (isl->save_type == IS_SAVE) { - expect_layout(true, - isl->open_chapter, - &iter, - 0, - RL_KIND_OPEN_CHAPTER, - RL_SOLE_INSTANCE); - } - if (!expect_layout(false, - &isl->free_space, - &iter, - 0, - RL_KIND_SCRATCH, - RL_SOLE_INSTANCE)) { - isl->free_space = (struct layout_region){ - .start_block = iter.next_block, - .num_blocks = (isl->index_save.start_block + - isl->index_save.num_blocks) - - iter.next_block, - .checksum = 0, - .kind = RL_KIND_SCRATCH, - .instance = RL_SOLE_INSTANCE, - }; - 
iter.next_block = isl->free_space.start_block + - isl->free_space.num_blocks; - } - - if (iter.result != UDS_SUCCESS) { - return iter.result; - } - if (iter.next_region != iter.last_region) { - return uds_log_error_strerror(UDS_UNEXPECTED_RESULT, - "expected %ld additional regions", - iter.last_region - iter.next_region); - } - if (iter.next_block != - isl->index_save.start_block + isl->index_save.num_blocks) { - return uds_log_error_strerror(UDS_UNEXPECTED_RESULT, - "index save layout table incomplete"); - } - - return UDS_SUCCESS; -} - -/**********************************************************************/ -static int __must_check load_index_save(struct index_save_layout *isl, - struct super_block_data *super, - struct buffered_reader *reader, - unsigned int save_id) -{ - struct index_save_data index_data; - struct region_table *table = NULL; - int result = load_region_table(reader, &table); - if (result != UDS_SUCCESS) { - return uds_log_error_strerror(result, - "cannot read index 0 save %u header", - save_id); - } - - if (table->header.region_blocks != isl->index_save.num_blocks) { - uint64_t region_blocks = table->header.region_blocks; - UDS_FREE(table); - return uds_log_error_strerror(UDS_CORRUPT_COMPONENT, - "unexpected index 0 save %u " - "region block count %llu", - save_id, - (unsigned long long) region_blocks); - } - - if (table->header.type != RH_TYPE_SAVE && - table->header.type != RH_TYPE_CHECKPOINT && - table->header.type != RH_TYPE_UNSAVED) { - unsigned int type = table->header.type; - UDS_FREE(table); - return uds_log_error_strerror(UDS_CORRUPT_COMPONENT, - "unexpected index 0 save %u header type %u", - save_id, - type); - } - - result = read_index_save_data(reader, - &index_data, - table->header.payload, - &isl->index_state_buffer); - if (result != UDS_SUCCESS) { - UDS_FREE(table); - return uds_log_error_strerror(result, - "unknown index 0 save %u data format", - save_id); - } - - result = reconstruct_index_save(isl, &index_data, super, table); - UDS_FREE(table); - - if (result != UDS_SUCCESS) { - free_buffer(UDS_FORGET(isl->index_state_buffer)); - return uds_log_error_strerror(result, - "cannot reconstruct index 0 save %u", - save_id); - } - - isl->read = true; - return UDS_SUCCESS; -} - -/**********************************************************************/ -static int __must_check load_sub_index_regions(struct index_layout *layout) -{ - unsigned int j; - for (j = 0; j < layout->super.max_saves; ++j) { - struct index_save_layout *isl = &layout->index.saves[j]; - - struct buffered_reader *reader; - int result = - open_layout_reader(layout, - &isl->index_save, - -layout->super.start_offset, - &reader); - - if (result != UDS_SUCCESS) { - uds_log_error_strerror(result, - "cannot get reader for index 0 save %u", - j); - while (j-- > 0) { - struct index_save_layout *isl = - &layout->index.saves[j]; - UDS_FREE(isl->volume_index_zones); - UDS_FREE(isl->open_chapter); - free_buffer(UDS_FORGET(isl->index_state_buffer)); - } - - return result; - } - - result = load_index_save(isl, &layout->super, reader, j); - free_buffered_reader(reader); - if (result != UDS_SUCCESS) { - while (j-- > 0) { - struct index_save_layout *isl = - &layout->index.saves[j]; - UDS_FREE(isl->volume_index_zones); - UDS_FREE(isl->open_chapter); - free_buffer(UDS_FORGET(isl->index_state_buffer)); - } - return result; - } - } - - return UDS_SUCCESS; -} - -/**********************************************************************/ -static int load_index_layout(struct index_layout *layout) -{ - struct 
buffered_reader *reader; - int result = open_uds_buffered_reader(layout->factory, layout->offset, - UDS_BLOCK_SIZE, &reader); - if (result != UDS_SUCCESS) { - return uds_log_error_strerror(result, - "unable to read superblock"); - } - - result = load_super_block(layout, - UDS_BLOCK_SIZE, - layout->offset / UDS_BLOCK_SIZE, - reader); - free_buffered_reader(reader); - if (result != UDS_SUCCESS) { - UDS_FREE(layout->index.saves); - layout->index.saves = NULL; - return result; - } - - result = load_sub_index_regions(layout); - if (result != UDS_SUCCESS) { - UDS_FREE(layout->index.saves); - layout->index.saves = NULL; - return result; - } - return UDS_SUCCESS; -} - -/**********************************************************************/ -static void generate_super_block_data(size_t block_size, - unsigned int max_saves, - uint64_t open_chapter_blocks, - uint64_t page_map_blocks, - struct super_block_data *super) -{ - memset(super, 0, sizeof(*super)); - memcpy(super->magic_label, - SINGLE_FILE_MAGIC_1, - SINGLE_FILE_MAGIC_1_LENGTH); - create_unique_nonce_data(super->nonce_info); - - super->nonce = generate_primary_nonce(super->nonce_info, - sizeof(super->nonce_info)); - super->version = SUPER_VERSION_CURRENT; - super->block_size = block_size; - super->num_indexes = 1; - super->max_saves = max_saves; - super->open_chapter_blocks = open_chapter_blocks; - super->page_map_blocks = page_map_blocks; - super->volume_offset = 0; - super->start_offset = 0; -} - -/**********************************************************************/ -static int __must_check -reset_index_save_layout(struct index_save_layout *isl, - uint64_t *next_block_ptr, - uint64_t save_blocks, - uint64_t page_map_blocks, - unsigned int instance) -{ - uint64_t remaining; - uint64_t start_block = *next_block_ptr; - - if (isl->volume_index_zones) { - UDS_FREE(isl->volume_index_zones); - } - - if (isl->open_chapter) { - UDS_FREE(isl->open_chapter); - } - - if (isl->index_state_buffer) { - free_buffer(UDS_FORGET(isl->index_state_buffer)); - } - - memset(isl, 0, sizeof(*isl)); - isl->save_type = NO_SAVE; - setup_layout(&isl->index_save, - &start_block, - save_blocks, - RL_KIND_SAVE, - instance); - setup_layout(&isl->header, - next_block_ptr, - 1, - RL_KIND_HEADER, - RL_SOLE_INSTANCE); - setup_layout(&isl->index_page_map, - next_block_ptr, - page_map_blocks, - RL_KIND_INDEX_PAGE_MAP, - RL_SOLE_INSTANCE); - remaining = start_block - *next_block_ptr; - setup_layout(&isl->free_space, - next_block_ptr, - remaining, - RL_KIND_SCRATCH, - RL_SOLE_INSTANCE); - // number of zones is a save-time parameter - // presence of open chapter is a save-time parameter - return UDS_SUCCESS; -} - -/**********************************************************************/ -static void define_sub_index_nonce(struct index_layout *layout, - unsigned int index_id) -{ - struct sub_index_nonce_data { - uint64_t offset; - uint16_t index_id; - }; - - struct sub_index_layout *sil = &layout->index; - uint64_t primary_nonce = layout->super.nonce; - byte buffer[sizeof(struct sub_index_nonce_data)] = { 0 }; - size_t offset = 0; - encode_uint64_le(buffer, &offset, sil->sub_index.start_block); - encode_uint16_le(buffer, &offset, index_id); - sil->nonce = generate_secondary_nonce(primary_nonce, - buffer, - sizeof(buffer)); - if (sil->nonce == 0) { - sil->nonce = generate_secondary_nonce(~primary_nonce + 1, - buffer, - sizeof(buffer)); - } -} - -/**********************************************************************/ -static int __must_check setup_sub_index(struct index_layout 
*layout, - uint64_t *next_block_ptr, - struct save_layout_sizes *sls, - unsigned int instance) -{ - struct sub_index_layout *sil = &layout->index; - uint64_t start_block = *next_block_ptr; - unsigned int i; - - setup_layout(&sil->sub_index, - &start_block, - sls->sub_index_blocks, - RL_KIND_INDEX, - instance); - setup_layout(&sil->volume, - next_block_ptr, - sls->volume_blocks, - RL_KIND_VOLUME, - RL_SOLE_INSTANCE); - for (i = 0; i < sls->num_saves; ++i) { - int result = reset_index_save_layout(&sil->saves[i], - next_block_ptr, - sls->save_blocks, - sls->page_map_blocks, - i); - if (result != UDS_SUCCESS) { - return result; - } - } - - if (start_block != *next_block_ptr) { - return uds_log_error_strerror(UDS_UNEXPECTED_RESULT, - "sub index layout regions don't agree"); - } - - define_sub_index_nonce(layout, instance); - return UDS_SUCCESS; -} - -/**********************************************************************/ -/** - * Initialize a single file layout using the save layout sizes specified. - * - * @param layout the layout to initialize - * @param offset the offset in bytes from the start of the backing storage - * @param size the size in bytes of the backing storage - * @param sls a populated struct save_layout_sizes object - * - * @return UDS_SUCCESS or an error code - **/ -static int __must_check -init_single_file_layout(struct index_layout *layout, - uint64_t offset, - uint64_t size, - struct save_layout_sizes *sls) -{ - uint64_t next_block; - int result; - - layout->total_blocks = sls->total_blocks; - - if (size < sls->total_blocks * sls->block_size) { - uds_log_error("not enough space for index as configured"); - return -ENOSPC; - } - - generate_super_block_data(sls->block_size, - sls->num_saves, - sls->open_chapter_blocks, - sls->page_map_blocks, - &layout->super); - - result = allocate_single_file_parts(layout); - if (result != UDS_SUCCESS) { - return result; - } - - next_block = offset / sls->block_size; - - setup_layout(&layout->header, - &next_block, - 1, - RL_KIND_HEADER, - RL_SOLE_INSTANCE); - setup_layout(&layout->config, - &next_block, - 1, - RL_KIND_CONFIG, - RL_SOLE_INSTANCE); - result = setup_sub_index(layout, &next_block, sls, 0); - - if (result != UDS_SUCCESS) { - return result; - } - setup_layout(&layout->seal, &next_block, 1, RL_KIND_SEAL, - RL_SOLE_INSTANCE); - if (next_block * sls->block_size > offset + size) { - return uds_log_error_strerror(UDS_UNEXPECTED_RESULT, - "layout does not fit as expected"); - } - return UDS_SUCCESS; -} - -/**********************************************************************/ -static void expect_sub_index(struct index_layout *layout, - struct region_iterator *iter, - unsigned int instance) -{ - unsigned int i; - struct sub_index_layout *sil = &layout->index; - uint64_t start_block = iter->next_block; - uint64_t end_block; - if (iter->result != UDS_SUCCESS) { - return; - } - - expect_layout(true, &sil->sub_index, iter, 0, RL_KIND_INDEX, instance); - - end_block = iter->next_block; - iter->next_block = start_block; - - expect_layout(true, &sil->volume, iter, 0, RL_KIND_VOLUME, - RL_SOLE_INSTANCE); - - iter->next_block += layout->super.volume_offset; - end_block += layout->super.volume_offset; - - for (i = 0; i < layout->super.max_saves; ++i) { - struct index_save_layout *isl = &sil->saves[i]; - expect_layout(true, &isl->index_save, iter, 0, RL_KIND_SAVE, i); - } - - if (iter->next_block != end_block) { - iter_error(iter, "sub index region does not span all saves"); - } - - define_sub_index_nonce(layout, instance); -} - 
-/**********************************************************************/ - -/** - * Initialize a single file layout from the region table and super block data - * stored in stable storage. - * - * @param layout the layout to initialize - * @param table the region table read from the superblock - * @param first_block the first block number in the region - * - * @return UDS_SUCCESS or an error code - **/ -static int __must_check -reconstitute_single_file_layout(struct index_layout *layout, - struct region_table *table, - uint64_t first_block) -{ - struct region_iterator iter = { - .next_region = table->regions, - .last_region = table->regions + table->header.num_regions, - .next_block = first_block, - .result = UDS_SUCCESS - }; - - layout->total_blocks = table->header.region_blocks; - - expect_layout(true, - &layout->header, - &iter, - 1, - RL_KIND_HEADER, - RL_SOLE_INSTANCE); - expect_layout(true, - &layout->config, - &iter, - 1, - RL_KIND_CONFIG, - RL_SOLE_INSTANCE); - expect_sub_index(layout, &iter, 0); - expect_layout(true, &layout->seal, &iter, 1, RL_KIND_SEAL, - RL_SOLE_INSTANCE); - - if (iter.result != UDS_SUCCESS) { - return iter.result; - } - - if ((iter.next_block - layout->super.volume_offset) != - (first_block + layout->total_blocks)) { - return uds_log_error_strerror(UDS_UNEXPECTED_RESULT, - "layout table does not span total blocks"); - } - return UDS_SUCCESS; -} - -/**********************************************************************/ -static int __must_check save_sub_index_regions(struct index_layout *layout) -{ - struct sub_index_layout *sil = &layout->index; - unsigned int j; - for (j = 0; j < layout->super.max_saves; ++j) { - struct index_save_layout *isl = &sil->saves[j]; - int result = write_index_save_layout(layout, isl); - if (result != UDS_SUCCESS) { - return uds_log_error_strerror(result, - "unable to format index %u save 0 layout", - j); - } - } - return UDS_SUCCESS; -} - -/**********************************************************************/ -static int __must_check -make_single_file_region_table(struct index_layout *layout, - unsigned int *num_regions_ptr, - struct region_table **table_ptr) -{ - unsigned int num_regions = 1 + // header - 1 + // config - 1 + // index - 1 + // volume - layout->super.max_saves + // saves - 1; // seal - - struct region_table *table; - struct sub_index_layout *sil; - unsigned int j; - struct layout_region *lr; - int result = UDS_ALLOCATE_EXTENDED(struct region_table, - num_regions, - struct layout_region, - "layout region table", - &table); - if (result != UDS_SUCCESS) { - return result; - } - - lr = &table->regions[0]; - *lr++ = layout->header; - *lr++ = layout->config; - sil = &layout->index; - *lr++ = sil->sub_index; - *lr++ = sil->volume; - - for (j = 0; j < layout->super.max_saves; ++j) { - *lr++ = sil->saves[j].index_save; - } - *lr++ = layout->seal; - - result = ASSERT((lr == &table->regions[num_regions]), - "incorrect number of regions"); - if (result != UDS_SUCCESS) { - return result; - } - - *num_regions_ptr = num_regions; - *table_ptr = table; - return UDS_SUCCESS; -} - -/**********************************************************************/ -static int __must_check -encode_index_save_data(struct buffer *buffer, - struct index_save_data *save_data) -{ - int result = put_uint64_le_into_buffer(buffer, save_data->timestamp); - if (result != UDS_SUCCESS) { - return result; - } - result = put_uint64_le_into_buffer(buffer, save_data->nonce); - if (result != UDS_SUCCESS) { - return result; - } - result = 
put_uint32_le_into_buffer(buffer, save_data->version); - if (result != UDS_SUCCESS) { - return result; - } - result = zero_bytes(buffer, 4); /* padding */ - if (result != UDS_SUCCESS) { - return result; - } - result = ASSERT_LOG_ONLY(content_length(buffer) == sizeof *save_data, - "%zu bytes encoded of %zu expected", - content_length(buffer), - sizeof(*save_data)); - return result; -} - -/**********************************************************************/ -static int __must_check -encode_region_header(struct buffer *buffer, struct region_header *header) -{ - size_t starting_length = content_length(buffer); - int result = put_uint64_le_into_buffer(buffer, REGION_MAGIC); - if (result != UDS_SUCCESS) { - return result; - } - result = put_uint64_le_into_buffer(buffer, header->region_blocks); - if (result != UDS_SUCCESS) { - return result; - } - result = put_uint16_le_into_buffer(buffer, header->type); - if (result != UDS_SUCCESS) { - return result; - } - result = put_uint16_le_into_buffer(buffer, header->version); - if (result != UDS_SUCCESS) { - return result; - } - result = put_uint16_le_into_buffer(buffer, header->num_regions); - if (result != UDS_SUCCESS) { - return result; - } - result = put_uint16_le_into_buffer(buffer, header->payload); - if (result != UDS_SUCCESS) { - return result; - } - result = ASSERT_LOG_ONLY(content_length(buffer) - starting_length == - sizeof(*header), - "%zu bytes encoded, of %zu expected", - content_length(buffer) - starting_length, - sizeof(*header)); - return result; -} - -/**********************************************************************/ -static int __must_check -encode_layout_region(struct buffer *buffer, struct layout_region *region) -{ - size_t starting_length = content_length(buffer); - int result = put_uint64_le_into_buffer(buffer, region->start_block); - if (result != UDS_SUCCESS) { - return result; - } - result = put_uint64_le_into_buffer(buffer, region->num_blocks); - if (result != UDS_SUCCESS) { - return result; - } - result = put_uint32_le_into_buffer(buffer, region->checksum); - if (result != UDS_SUCCESS) { - return result; - } - result = put_uint16_le_into_buffer(buffer, region->kind); - if (result != UDS_SUCCESS) { - return result; - } - result = put_uint16_le_into_buffer(buffer, region->instance); - if (result != UDS_SUCCESS) { - return result; - } - result = ASSERT_LOG_ONLY(content_length(buffer) - starting_length == - sizeof(*region), - "%zu bytes encoded, of %zu expected", - content_length(buffer) - starting_length, - sizeof(*region)); - return result; -} - -/**********************************************************************/ -static int __must_check encode_super_block_data(struct buffer *buffer, - struct super_block_data *super) -{ - int result = put_bytes(buffer, 32, &super->magic_label); - if (result != UDS_SUCCESS) { - return result; - } - result = put_bytes(buffer, NONCE_INFO_SIZE, &super->nonce_info); - if (result != UDS_SUCCESS) { - return result; - } - result = put_uint64_le_into_buffer(buffer, super->nonce); - if (result != UDS_SUCCESS) { - return result; - } - result = put_uint32_le_into_buffer(buffer, super->version); - if (result != UDS_SUCCESS) { - return result; - } - result = put_uint32_le_into_buffer(buffer, super->block_size); - if (result != UDS_SUCCESS) { - return result; - } - result = put_uint16_le_into_buffer(buffer, super->num_indexes); - if (result != UDS_SUCCESS) { - return result; - } - result = put_uint16_le_into_buffer(buffer, super->max_saves); - if (result != UDS_SUCCESS) { - return result; - } - 
result = zero_bytes(buffer, 4); // aligment - if (result != UDS_SUCCESS) { - return result; - } - result = put_uint64_le_into_buffer(buffer, super->open_chapter_blocks); - if (result != UDS_SUCCESS) { - return result; - } - result = put_uint64_le_into_buffer(buffer, super->page_map_blocks); - if (result != UDS_SUCCESS) { - return result; - } - if (is_converted_super_block(super)) { - result = put_uint64_le_into_buffer(buffer, - super->volume_offset); - if (result != UDS_SUCCESS) { - return result; - } - result = put_uint64_le_into_buffer(buffer, - super->start_offset); - if (result != UDS_SUCCESS) { - return result; - } - } - return ASSERT_LOG_ONLY(content_length(buffer) == buffer_length(buffer), - "%zu bytes encoded, of %zu expected", - content_length(buffer), buffer_length(buffer)); -} - -/**********************************************************************/ -static int __must_check -write_single_file_header(struct index_layout *layout, - struct region_table *table, - unsigned int num_regions, - struct buffered_writer *writer) -{ - unsigned int i; - uint16_t payload; - size_t table_size = sizeof(struct region_table) + num_regions * - sizeof(struct layout_region); - struct buffer *buffer; - int result; - - if (is_converted_super_block(&layout->super)) { - payload = sizeof(struct super_block_data); - } else { - payload = sizeof(struct super_block_data) - - sizeof(layout->super.volume_offset) - - sizeof(layout->super.start_offset); - } - - table->header = (struct region_header) { - .magic = REGION_MAGIC, - .region_blocks = layout->total_blocks, - .type = RH_TYPE_SUPER, - .version = 1, - .num_regions = num_regions, - .payload = payload, - }; - - result = make_buffer(table_size, &buffer); - if (result != UDS_SUCCESS) { - return result; - } - - result = encode_region_header(buffer, &table->header); - - for (i = 0; i < num_regions; i++) { - if (result == UDS_SUCCESS) { - result = encode_layout_region(buffer, - &table->regions[i]); - } - } - - if (result == UDS_SUCCESS) { - result = write_to_buffered_writer(writer, - get_buffer_contents(buffer), - content_length(buffer)); - } - - free_buffer(UDS_FORGET(buffer)); - if (result != UDS_SUCCESS) { - return result; - } - - result = make_buffer(payload, &buffer); - if (result != UDS_SUCCESS) { - return result; - } - - result = encode_super_block_data(buffer, &layout->super); - if (result != UDS_SUCCESS) { - free_buffer(UDS_FORGET(buffer)); - return result; - } - - result = write_to_buffered_writer(writer, get_buffer_contents(buffer), - content_length(buffer)); - free_buffer(UDS_FORGET(buffer)); - if (result != UDS_SUCCESS) { - return result; - } - return flush_buffered_writer(writer); -} - -/** - * Save an index layout table to persistent storage using the io_factory in - * the layout. 
- * - * @param layout The layout to save - * @param offset A block offset to apply when writing the layout - * - * @return UDS_SUCCESS or an error code - */ -static int __must_check -save_single_file_layout(struct index_layout *layout, off_t offset) -{ - struct buffered_writer *writer = NULL; - struct region_table *table; - unsigned int num_regions; - - int result = make_single_file_region_table(layout, - &num_regions, - &table); - if (result != UDS_SUCCESS) { - return result; - } - - result = open_layout_writer(layout, &layout->header, offset, &writer); - if (result != UDS_SUCCESS) { - UDS_FREE(table); - return result; - } - - result = write_single_file_header(layout, table, num_regions, writer); - UDS_FREE(table); - free_buffered_writer(writer); - - return result; -} - -/**********************************************************************/ -void put_uds_index_layout(struct index_layout *layout) -{ - struct sub_index_layout *sil; - - if ((layout == NULL) || (--layout->ref_count > 0)) { - return; - } - - sil = &layout->index; - if (sil->saves != NULL) { - unsigned int j; - for (j = 0; j < layout->super.max_saves; ++j) { - struct index_save_layout *isl = &sil->saves[j]; - UDS_FREE(isl->volume_index_zones); - UDS_FREE(isl->open_chapter); - free_buffer(UDS_FORGET(isl->index_state_buffer)); - } - } - - UDS_FREE(sil->saves); - - if (layout->factory != NULL) { - put_uds_io_factory(layout->factory); - } - UDS_FREE(layout); -} - -/**********************************************************************/ -void get_uds_index_layout(struct index_layout *layout, - struct index_layout **layout_ptr) -{ - ++layout->ref_count; - *layout_ptr = layout; -} - -/**********************************************************************/ -int write_uds_index_config(struct index_layout *layout, - struct uds_configuration *config, - off_t offset) -{ - struct buffered_writer *writer = NULL; - int result = open_layout_writer(layout, &layout->config, - offset, &writer); - if (result != UDS_SUCCESS) { - return uds_log_error_strerror(result, - "failed to open config region"); - } - - result = write_config_contents(writer, config, layout->super.version); - if (result != UDS_SUCCESS) { - free_buffered_writer(writer); - return uds_log_error_strerror(result, - "failed to write config region"); - } - result = flush_buffered_writer(writer); - if (result != UDS_SUCCESS) { - free_buffered_writer(writer); - return uds_log_error_strerror(result, - "cannot flush config writer"); - } - free_buffered_writer(writer); - return UDS_SUCCESS; -} - -/**********************************************************************/ -int verify_uds_index_config(struct index_layout *layout, - struct uds_configuration *config) -{ - struct buffered_reader *reader = NULL; - struct uds_configuration stored_config; - uint64_t offset = layout->super.volume_offset - - layout->super.start_offset; - int result = open_layout_reader(layout, &layout->config, - offset, - &reader); - if (result != UDS_SUCCESS) { - return uds_log_error_strerror(result, - "failed to open config reader"); - } - - result = read_config_contents(reader, &stored_config); - if (result != UDS_SUCCESS) { - free_buffered_reader(reader); - return uds_log_error_strerror(result, - "failed to read config region"); - } - free_buffered_reader(reader); - - if (!are_uds_configurations_equal(&stored_config, config)) { - uds_log_warning("Supplied configuration does not match save"); - return UDS_NO_INDEX; - } - - *config = stored_config; - return UDS_SUCCESS; -} - 
-/**********************************************************************/ -int replace_index_layout_storage(struct index_layout *layout, - const char *name) -{ - return replace_uds_storage(layout->factory, name); -} - -/**********************************************************************/ -int open_uds_volume_bufio(struct index_layout *layout, - size_t block_size, - unsigned int reserved_buffers, - struct dm_bufio_client **client_ptr) -{ - off_t offset = (layout->index.volume.start_block + - layout->super.volume_offset - - layout->super.start_offset) * - layout->super.block_size; - return make_uds_bufio(layout->factory, - offset, - block_size, - reserved_buffers, - client_ptr); -} - -/**********************************************************************/ -uint64_t get_uds_volume_nonce(struct index_layout *layout) -{ - return layout->index.nonce; -} - -/**********************************************************************/ -static uint64_t generate_index_save_nonce(uint64_t volume_nonce, - struct index_save_layout *isl) -{ - struct SaveNonceData { - struct index_save_data data; - uint64_t offset; - } nonce_data; - byte buffer[sizeof(nonce_data)]; - size_t offset = 0; - - nonce_data.data = isl->save_data; - nonce_data.data.nonce = 0; - nonce_data.offset = isl->index_save.start_block; - - encode_uint64_le(buffer, &offset, nonce_data.data.timestamp); - encode_uint64_le(buffer, &offset, nonce_data.data.nonce); - encode_uint32_le(buffer, &offset, nonce_data.data.version); - encode_uint32_le(buffer, &offset, 0U); // padding - encode_uint64_le(buffer, &offset, nonce_data.offset); - ASSERT_LOG_ONLY(offset == sizeof(nonce_data), - "%zu bytes encoded of %zu expected", - offset, - sizeof(nonce_data)); - return generate_secondary_nonce(volume_nonce, buffer, sizeof(buffer)); -} - -/**********************************************************************/ -static int validate_index_save_layout(struct index_save_layout *isl, - uint64_t volume_nonce, - uint64_t *save_time_ptr) -{ - if (isl->save_type == NO_SAVE || isl->num_zones == 0 || - isl->save_data.timestamp == 0) { - return UDS_BAD_STATE; - } - if (isl->save_data.nonce != - generate_index_save_nonce(volume_nonce, isl)) { - return UDS_BAD_STATE; - } - if (save_time_ptr != NULL) { - *save_time_ptr = isl->save_data.timestamp; - } - return UDS_SUCCESS; -} - -/**********************************************************************/ -static int __must_check -select_oldest_index_save_layout(struct sub_index_layout *sil, - unsigned int max_saves, - struct index_save_layout **isl_ptr) -{ - struct index_save_layout *oldest = NULL; - uint64_t oldest_time = 0; - int result; - - // find the oldest valid or first invalid slot - struct index_save_layout *isl; - for (isl = sil->saves; isl < sil->saves + max_saves; ++isl) { - uint64_t save_time = 0; - int result = - validate_index_save_layout(isl, sil->nonce, &save_time); - if (result != UDS_SUCCESS) { - save_time = 0; - } - if (oldest == NULL || save_time < oldest_time) { - oldest = isl; - oldest_time = save_time; - } - } - - result = ASSERT((oldest != NULL), "no oldest or free save slot"); - if (result != UDS_SUCCESS) { - return result; - } - *isl_ptr = oldest; - return UDS_SUCCESS; -} - -/**********************************************************************/ -static int __must_check -select_latest_index_save_layout(struct sub_index_layout *sil, - unsigned int max_saves, - struct index_save_layout **isl_ptr) -{ - struct index_save_layout *latest = NULL; - uint64_t latest_time = 0; - - // find the latest valid 
save slot - struct index_save_layout *isl; - for (isl = sil->saves; isl < sil->saves + max_saves; ++isl) { - uint64_t save_time = 0; - int result = - validate_index_save_layout(isl, sil->nonce, &save_time); - if (result != UDS_SUCCESS) { - continue; - } - if (save_time > latest_time) { - latest = isl; - latest_time = save_time; - } - } - - if (latest == NULL) { - uds_log_error("No valid index save found"); - return UDS_INDEX_NOT_SAVED_CLEANLY; - } - *isl_ptr = latest; - return UDS_SUCCESS; -} - -/**********************************************************************/ -static int __must_check -instantiate_index_save_layout(struct index_save_layout *isl, - struct super_block_data *super, - uint64_t volume_nonce, - unsigned int num_zones, - enum index_save_type save_type) -{ - int result = UDS_SUCCESS; - if (isl->open_chapter && save_type == IS_CHECKPOINT) { - UDS_FREE(isl->open_chapter); - isl->open_chapter = NULL; - } else if (isl->open_chapter == NULL && save_type == IS_SAVE) { - result = UDS_ALLOCATE(1, - struct layout_region, - "open chapter layout", - &isl->open_chapter); - if (result != UDS_SUCCESS) { - return result; - } - } - if (num_zones != isl->num_zones) { - if (isl->volume_index_zones != NULL) { - UDS_FREE(isl->volume_index_zones); - } - result = UDS_ALLOCATE(num_zones, - struct layout_region, - "volume index zone layouts", - &isl->volume_index_zones); - if (result != UDS_SUCCESS) { - return result; - } - isl->num_zones = num_zones; - } - - populate_index_save_layout(isl, super, num_zones, save_type); - - result = make_buffer(INDEX_STATE_BUFFER_SIZE, &isl->index_state_buffer); - if (result != UDS_SUCCESS) { - return result; - } - - isl->read = isl->written = false; - isl->save_type = save_type; - memset(&isl->save_data, 0, sizeof(isl->save_data)); - isl->save_data.timestamp = - ktime_to_ms(current_time_ns(CLOCK_REALTIME)); - isl->save_data.version = 1; - isl->save_data.nonce = generate_index_save_nonce(volume_nonce, isl); - - return UDS_SUCCESS; -} - -/**********************************************************************/ -static int __must_check -invalidate_old_save(struct index_layout *layout, struct index_save_layout *isl) -{ - uint64_t start_block = isl->index_save.start_block; - uint64_t save_blocks = isl->index_save.num_blocks; - unsigned int save = isl->index_save.instance; - - int result = reset_index_save_layout(isl, - &start_block, - save_blocks, - layout->super.page_map_blocks, - save); - if (result != UDS_SUCCESS) { - return result; - } - - return write_index_save_layout(layout, isl); -} - -/**********************************************************************/ -int setup_uds_index_save_slot(struct index_layout *layout, - unsigned int num_zones, - enum index_save_type save_type, - unsigned int *save_slot_ptr) -{ - struct sub_index_layout *sil = &layout->index; - - struct index_save_layout *isl = NULL; - int result = select_oldest_index_save_layout(sil, - layout->super.max_saves, - &isl); - if (result != UDS_SUCCESS) { - return result; - } - - result = invalidate_old_save(layout, isl); - if (result != UDS_SUCCESS) { - return result; - } - - result = instantiate_index_save_layout(isl, &layout->super, sil->nonce, - num_zones, save_type); - if (result != UDS_SUCCESS) { - return result; - } - - *save_slot_ptr = isl - sil->saves; - return UDS_SUCCESS; -} - -/**********************************************************************/ -int find_latest_uds_index_save_slot(struct index_layout *layout, - unsigned int *num_zones_ptr, - unsigned int *slot_ptr) -{ - struct 
sub_index_layout *sil = &layout->index; - - struct index_save_layout *isl = NULL; - int result = select_latest_index_save_layout(sil, - layout->super.max_saves, - &isl); - if (result != UDS_SUCCESS) { - return result; - } - - if (num_zones_ptr != NULL) { - *num_zones_ptr = isl->num_zones; - } - if (slot_ptr != NULL) { - *slot_ptr = isl - sil->saves; - } - return UDS_SUCCESS; -} - -/**********************************************************************/ -static int __must_check -make_index_save_region_table(struct index_save_layout *isl, - unsigned int *num_regions_ptr, - struct region_table **table_ptr) -{ - unsigned int z; - struct region_table *table; - struct layout_region *lr; - int result; - unsigned int num_regions = 1 + // header - 1 + // index page map - isl->num_zones + // volume index zones - (bool) isl->open_chapter; // open chapter if - // needed - - if (isl->free_space.num_blocks > 0) { - num_regions++; - } - - result = UDS_ALLOCATE_EXTENDED(struct region_table, - num_regions, - struct layout_region, - "layout region table for ISL", - &table); - if (result != UDS_SUCCESS) { - return result; - } - - lr = &table->regions[0]; - *lr++ = isl->header; - *lr++ = isl->index_page_map; - for (z = 0; z < isl->num_zones; ++z) { - *lr++ = isl->volume_index_zones[z]; - } - if (isl->open_chapter) { - *lr++ = *isl->open_chapter; - } - if (isl->free_space.num_blocks > 0) { - *lr++ = isl->free_space; - } - - result = ASSERT((lr == &table->regions[num_regions]), - "incorrect number of ISL regions"); - if (result != UDS_SUCCESS) { - return result; - } - - *num_regions_ptr = num_regions; - *table_ptr = table; - return UDS_SUCCESS; -} - -/**********************************************************************/ -static unsigned int region_type_for_save_type(enum index_save_type save_type) -{ - switch (save_type) { - case IS_SAVE: - return RH_TYPE_SAVE; - - case IS_CHECKPOINT: - return RH_TYPE_CHECKPOINT; - - default: - break; - } - - return RH_TYPE_UNSAVED; -} - -/**********************************************************************/ -static int __must_check -write_index_save_header(struct index_save_layout *isl, - struct region_table *table, - unsigned int num_regions, - struct buffered_writer *writer) -{ - unsigned int i; - struct buffer *buffer; - int result; - size_t payload = sizeof(isl->save_data); - size_t table_size = sizeof(struct region_table) + - num_regions * sizeof(struct layout_region); - - if (isl->index_state_buffer != NULL) { - payload += content_length(isl->index_state_buffer); - } - - table->header = (struct region_header) { - .magic = REGION_MAGIC, - .region_blocks = isl->index_save.num_blocks, - .type = region_type_for_save_type(isl->save_type), - .version = 1, - .num_regions = num_regions, - .payload = payload, - }; - - result = make_buffer(table_size, &buffer); - if (result != UDS_SUCCESS) { - return result; - } - - result = encode_region_header(buffer, &table->header); - if (result != UDS_SUCCESS) { - free_buffer(UDS_FORGET(buffer)); - return result; - } - - for (i = 0; i < num_regions; i++) { - result = encode_layout_region(buffer, &table->regions[i]); - if (result != UDS_SUCCESS) { - free_buffer(UDS_FORGET(buffer)); - return result; - } - } - result = ASSERT_LOG_ONLY(content_length(buffer) == table_size, - "%zu bytes encoded of %zu expected", - content_length(buffer), - table_size); - if (result != UDS_SUCCESS) { - free_buffer(UDS_FORGET(buffer)); - return result; - } - - result = write_to_buffered_writer(writer, get_buffer_contents(buffer), - content_length(buffer)); - 
free_buffer(UDS_FORGET(buffer)); - if (result != UDS_SUCCESS) { - return result; - } - - result = make_buffer(sizeof(isl->save_data), &buffer); - if (result != UDS_SUCCESS) { - return result; - } - - result = encode_index_save_data(buffer, &isl->save_data); - if (result != UDS_SUCCESS) { - free_buffer(UDS_FORGET(buffer)); - return result; - } - - result = write_to_buffered_writer(writer, get_buffer_contents(buffer), - content_length(buffer)); - free_buffer(UDS_FORGET(buffer)); - if (result != UDS_SUCCESS) { - return result; - } - - if (isl->index_state_buffer != NULL) { - result = write_to_buffered_writer(writer, - get_buffer_contents(isl->index_state_buffer), - content_length(isl->index_state_buffer)); - if (result != UDS_SUCCESS) { - return result; - } - } - - return flush_buffered_writer(writer); -} - -/**********************************************************************/ -static int write_index_save_layout(struct index_layout *layout, - struct index_save_layout *isl) -{ - unsigned int num_regions; - struct region_table *table; - struct buffered_writer *writer = NULL; - int result = make_index_save_region_table(isl, &num_regions, &table); - if (result != UDS_SUCCESS) { - return result; - } - - result = open_layout_writer(layout, &isl->header, - -layout->super.start_offset, - &writer); - if (result != UDS_SUCCESS) { - UDS_FREE(table); - return result; - } - - result = write_index_save_header(isl, table, num_regions, writer); - UDS_FREE(table); - free_buffered_writer(writer); - - isl->written = true; - return result; -} - -/**********************************************************************/ -int commit_uds_index_save(struct index_layout *layout, unsigned int save_slot) -{ - struct index_save_layout *isl; - int result = ASSERT((save_slot < layout->super.max_saves), - "save slot out of range"); - if (result != UDS_SUCCESS) { - return result; - } - - isl = &layout->index.saves[save_slot]; - - if (buffer_used(isl->index_state_buffer) == 0) { - return uds_log_error_strerror(UDS_UNEXPECTED_RESULT, - "%s: no index state data saved", - __func__); - } - - return write_index_save_layout(layout, isl); -} - -/**********************************************************************/ - -static void mutilate_index_save_info(struct index_save_layout *isl) -{ - memset(&isl->save_data, 0, sizeof(isl->save_data)); - isl->read = isl->written = 0; - isl->save_type = NO_SAVE; - isl->num_zones = 0; - free_buffer(UDS_FORGET(isl->index_state_buffer)); -} - -/**********************************************************************/ -int cancel_uds_index_save(struct index_layout *layout, unsigned int save_slot) -{ - int result = ASSERT((save_slot < layout->super.max_saves), - "save slot out of range"); - if (result != UDS_SUCCESS) { - return result; - } - - mutilate_index_save_info(&layout->index.saves[save_slot]); - - return UDS_SUCCESS; -} - -/**********************************************************************/ -int discard_uds_index_saves(struct index_layout *layout, bool all) -{ - int result = UDS_SUCCESS; - struct sub_index_layout *sil = &layout->index; - - if (all) { - unsigned int i; - for (i = 0; i < layout->super.max_saves; ++i) { - struct index_save_layout *isl = &sil->saves[i]; - result = first_error(result, - invalidate_old_save(layout, isl)); - } - } else { - struct index_save_layout *isl; - uint16_t max_saves = layout->super.max_saves; - result = select_latest_index_save_layout(sil, - max_saves, - &isl); - if (result == UDS_SUCCESS) { - result = invalidate_old_save(layout, isl); - } - } - - 
return result; -} - -/**********************************************************************/ -static int create_index_layout(struct index_layout *layout, - uint64_t size, - const struct uds_configuration *config) -{ - struct save_layout_sizes sizes; - uint64_t index_size; - int result; - - if (config == NULL) { - uds_log_error("missing index configuration"); - return -EINVAL; - } - - result = compute_sizes(&sizes, config, UDS_BLOCK_SIZE); - if (result != UDS_SUCCESS) { - return result; - } - // XXX This should include offset in the calculation (size + - // offset is checked later so insufficient space will be - // caught eventually, but better to catch it early) - index_size = sizes.total_blocks * sizes.block_size; - if (size < index_size) { - uds_log_error("layout requires at least %llu bytes", - (unsigned long long) index_size); - return -ENOSPC; - } - - result = init_single_file_layout(layout, layout->offset, size, &sizes); - if (result != UDS_SUCCESS) { - return result; - } - - result = save_sub_index_regions(layout); - if (result != UDS_SUCCESS) { - return result; - } - - result = save_single_file_layout(layout, 0); - if (result != UDS_SUCCESS) { - return result; - } - return UDS_SUCCESS; -} - -/**********************************************************************/ -struct buffer *get_uds_index_state_buffer(struct index_layout *layout, - unsigned int slot) -{ - return layout->index.saves[slot].index_state_buffer; -} - -/**********************************************************************/ -static int find_layout_region(struct index_layout *layout, - unsigned int slot, - const char *operation, - enum region_kind kind, - unsigned int zone, - struct layout_region **lr_ptr) -{ - struct index_save_layout *isl = &layout->index.saves[slot]; - struct layout_region *lr = NULL; - int result = ASSERT((slot < layout->super.max_saves), "%s not started", - operation); - if (result != UDS_SUCCESS) { - return result; - } - - switch (kind) { - case RL_KIND_INDEX_PAGE_MAP: - lr = &isl->index_page_map; - break; - - case RL_KIND_OPEN_CHAPTER: - if (isl->open_chapter == NULL) { - return uds_log_error_strerror(UDS_UNEXPECTED_RESULT, - "%s: %s has no open chapter", - __func__, - operation); - } - lr = isl->open_chapter; - break; - - case RL_KIND_VOLUME_INDEX: - if (isl->volume_index_zones == NULL || zone >= isl->num_zones) { - return uds_log_error_strerror(UDS_UNEXPECTED_RESULT, - "%s: %s has no volume index zone %u", - __func__, - operation, - zone); - } - lr = &isl->volume_index_zones[zone]; - break; - - default: - return uds_log_error_strerror(UDS_INVALID_ARGUMENT, - "%s: unexpected kind %u", - __func__, - kind); - } - - *lr_ptr = lr; - return UDS_SUCCESS; -} - -/**********************************************************************/ -int open_uds_index_buffered_reader(struct index_layout *layout, - unsigned int slot, - enum region_kind kind, - unsigned int zone, - struct buffered_reader **reader_ptr) -{ - struct layout_region *lr = NULL; - int result = find_layout_region(layout, slot, "load", kind, zone, &lr); - if (result != UDS_SUCCESS) { - return result; - } - return open_layout_reader(layout, lr, -layout->super.start_offset, - reader_ptr); -} - -/**********************************************************************/ -int open_uds_index_buffered_writer(struct index_layout *layout, - unsigned int slot, - enum region_kind kind, - unsigned int zone, - struct buffered_writer **writer_ptr) -{ - struct layout_region *lr = NULL; - int result = find_layout_region(layout, slot, "save", kind, zone, &lr); 
- if (result != UDS_SUCCESS) { - return result; - } - return open_layout_writer(layout, lr, -layout->super.start_offset, - writer_ptr); -} - -/**********************************************************************/ -int make_uds_index_layout_from_factory(struct io_factory *factory, - off_t offset, - uint64_t named_size, - bool new_layout, - const struct uds_configuration *config, - struct index_layout **layout_ptr) -{ - struct index_layout *layout = NULL; - uint64_t config_size; - int result; - // Get the device size and round it down to a multiple of - // UDS_BLOCK_SIZE. - size_t size = get_uds_writable_size(factory) & -UDS_BLOCK_SIZE; - if (named_size > size) { - uds_log_error("index storage (%zu) is smaller than the requested size %llu", - size, - (unsigned long long) named_size); - return -ENOSPC; - } - if ((named_size > 0) && (named_size < size)) { - size = named_size; - } - - // Get the index size according the the config - result = uds_compute_index_size(config, &config_size); - if (result != UDS_SUCCESS) { - return result; - } - if (size < config_size) { - uds_log_error("index storage (%zu) is smaller than the required size %llu", - size, - (unsigned long long) config_size); - return -ENOSPC; - } - size = config_size; - - result = UDS_ALLOCATE(1, struct index_layout, __func__, &layout); - if (result != UDS_SUCCESS) { - return result; - } - layout->ref_count = 1; - get_uds_io_factory(factory); - layout->factory = factory; - layout->offset = offset; - - if (new_layout) { - // Populate the layout from the UDS configuration - result = create_index_layout(layout, size, config); - } else { - // Populate the layout from the saved index. - result = load_index_layout(layout); - } - if (result != UDS_SUCCESS) { - put_uds_index_layout(layout); - return result; - } - - *layout_ptr = layout; - return UDS_SUCCESS; -} - -/**********************************************************************/ -int update_uds_layout(struct index_layout *layout, - struct uds_configuration *config, - off_t lvm_offset, - off_t offset) -{ - int result = UDS_SUCCESS; - off_t offset_blocks = offset / UDS_BLOCK_SIZE; - off_t lvm_blocks = lvm_offset / UDS_BLOCK_SIZE; - struct super_block_data super = layout->super; - struct sub_index_layout index = layout->index; - layout->super.start_offset = lvm_blocks; - layout->super.volume_offset = offset_blocks; - layout->index.sub_index.num_blocks -= offset_blocks; - layout->index.volume.num_blocks -= offset_blocks; - layout->total_blocks -= offset_blocks; - layout->super.version = 7; - result = save_single_file_layout(layout, offset_blocks); - if (result == UDS_SUCCESS) { - result = write_uds_index_config(layout, config, offset_blocks); - } - layout->index = index; - layout->super = super; - return result; -} diff --git a/uds/indexLayout.h b/uds/indexLayout.h deleted file mode 100644 index 87cc78b1..00000000 --- a/uds/indexLayout.h +++ /dev/null @@ -1,263 +0,0 @@ -/* - * Copyright Red Hat - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. 
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
- * 02110-1301, USA.
- *
- * $Id: //eng/uds-releases/krusty/src/uds/indexLayout.h#35 $
- */
-
-#ifndef INDEX_LAYOUT_H
-#define INDEX_LAYOUT_H
-
-#include "buffer.h"
-#include "indexConfig.h"
-#include "indexState.h"
-#include "ioFactory.h"
-#include "uds.h"
-
-struct index_layout;
-
-/**
- * Construct an index layout. This is a platform specific function that uses
- * the name string, a flag that indicates old vs. new indices, and a
- * UDS configuration (for new indices) to make an IO factory and invoke
- * make_uds_index_layout_from_factory.
- *
- * @param name        String naming the index. Each platform will use its own
- *                    conventions to interpret the string, but in general it is
- *                    a space-separated sequence of param=value settings. For
- *                    backward compatibility a string without an equals is
- *                    treated as a platform-specific default parameter value.
- * @param new_layout  Whether this is a new layout.
- * @param config      The UDS configuration required for a new layout.
- * @param layout_ptr  Where to store the new index layout
- *
- * @return UDS_SUCCESS or an error code.
- **/
-int __must_check make_uds_index_layout(const char *name,
-                                       bool new_layout,
-                                       const struct uds_configuration *config,
-                                       struct index_layout **layout_ptr);
-
-/**
- * Construct an index layout using an IO factory. This method is
- * common to all platforms.
- *
- * @param factory     The IO factory for the block storage containing the
- *                    index.
- * @param offset      The offset of the start of the index within the block
- *                    storage address space.
- * @param named_size  The size in bytes of the space within the block storage
- *                    address space, as specified in the name string.
- * @param new_layout  Whether this is a new layout.
- * @param config      The UDS configuration required for a new layout.
- * @param layout_ptr  Where to store the new index layout
- *
- * @return UDS_SUCCESS or an error code.
- **/
-int __must_check
-make_uds_index_layout_from_factory(struct io_factory *factory,
-                                   off_t offset,
-                                   uint64_t named_size,
-                                   bool new_layout,
-                                   const struct uds_configuration *config,
-                                   struct index_layout **layout_ptr);
-
-/**
- * Decrement the use count of an index layout. If the count goes to zero, free
- * the index layout.
- *
- * @param layout  The layout to release or free
- **/
-void put_uds_index_layout(struct index_layout *layout);
-
-/**
- * Replace the backing store for the layout.
- *
- * @param layout  The layout
- * @param name    A name describing the new backing store
- *
- * @return UDS_SUCCESS or an error code
- **/
-int __must_check replace_index_layout_storage(struct index_layout *layout,
-                                              const char *name);
-
-/**********************************************************************/
-int __must_check cancel_uds_index_save(struct index_layout *layout,
-                                       unsigned int save_slot);
-
-/**********************************************************************/
-int __must_check commit_uds_index_save(struct index_layout *layout,
-                                       unsigned int save_slot);
-
-/**********************************************************************/
-int __must_check discard_uds_index_saves(struct index_layout *layout,
-                                         bool all);
-
-/**
- * Find the latest index save slot.
- *
- * @param [in]  layout         The single file layout.
- * @param [out] num_zones_ptr  Where to store the actual number of zones
- *                             that were saved.
- * @param [out] slot_ptr       Where to store the slot number we found.
- *
- * @return UDS_SUCCESS or an error code.
- **/
-int __must_check find_latest_uds_index_save_slot(struct index_layout *layout,
-                                                 unsigned int *num_zones_ptr,
-                                                 unsigned int *slot_ptr);
-
-/**
- * Get another reference to an index layout, incrementing its use count.
- *
- * @param layout      The index layout.
- * @param layout_ptr  Where the new layout pointer is being stored.
- **/
-void get_uds_index_layout(struct index_layout *layout,
-                          struct index_layout **layout_ptr);
-
-/**
- * Open a buffered reader for a specified state, kind, and zone.
- *
- * @param layout      The index layout
- * @param slot        The save slot
- * @param kind        The kind of index save region to open.
- * @param zone        The zone number for the region.
- * @param reader_ptr  Where to store the buffered reader.
- *
- * @return UDS_SUCCESS or an error code.
- **/
-int __must_check
-open_uds_index_buffered_reader(struct index_layout *layout,
-                               unsigned int slot,
-                               enum region_kind kind,
-                               unsigned int zone,
-                               struct buffered_reader **reader_ptr);
-
-/**
- * Open a buffered writer for a specified state, kind, and zone.
- *
- * @param layout      The index layout
- * @param slot        The save slot
- * @param kind        The kind of index save region to open.
- * @param zone        The zone number for the region.
- * @param writer_ptr  Where to store the buffered writer.
- *
- * @return UDS_SUCCESS or an error code.
- **/
-int __must_check
-open_uds_index_buffered_writer(struct index_layout *layout,
-                               unsigned int slot,
-                               enum region_kind kind,
-                               unsigned int zone,
-                               struct buffered_writer **writer_ptr);
-
-/**
- * Obtain the nonce to be used to store or validate the loading of volume index
- * pages.
- *
- * @param [in] layout  The index layout.
- *
- * @return The nonce to use.
- **/
-uint64_t __must_check get_uds_volume_nonce(struct index_layout *layout);
-
-/**
- * Obtain a dm_bufio_client for the specified index volume.
- *
- * @param [in]  layout            The index layout.
- * @param [in]  block_size        The size of a volume page
- * @param [in]  reserved_buffers  The count of reserved buffers
- * @param [out] client_ptr        Where to put the new dm_bufio_client
- *
- * @return UDS_SUCCESS or an error code.
- **/
-int __must_check open_uds_volume_bufio(struct index_layout *layout,
-                                       size_t block_size,
-                                       unsigned int reserved_buffers,
-                                       struct dm_bufio_client **client_ptr);
-
-/**
- * Read the index configuration, and verify that it matches the given
- * configuration.
- *
- * @param layout  the generic index layout
- * @param config  the index configuration
- *
- * @return UDS_SUCCESS or an error code
- **/
-int __must_check verify_uds_index_config(struct index_layout *layout,
-                                         struct uds_configuration *config);
-
-/**
- * Determine which index save slot to use for a new index save.
- *
- * Also allocates the volume index regions and, if needed, the openChapter
- * region.
- *
- * @param [in]  layout         The index layout.
- * @param [in]  num_zones      Actual number of zones currently in use.
- * @param [in]  save_type      The index save type.
- * @param [out] save_slot_ptr  Where to store the save slot number.
- *
- * @return UDS_SUCCESS or an error code
- **/
-int __must_check setup_uds_index_save_slot(struct index_layout *layout,
-                                           unsigned int num_zones,
-                                           enum index_save_type save_type,
-                                           unsigned int *save_slot_ptr);
-
-/**
- * Write the index configuration.
- *
- * @param layout  the generic index layout
- * @param config  the index configuration to write
- * @param offset  A block offset to apply when writing the configuration
- *
- * @return UDS_SUCCESS or an error code
- **/
-int __must_check write_uds_index_config(struct index_layout *layout,
-                                        struct uds_configuration *config,
-                                        off_t offset);
-
-/**
- * Get the index state buffer
- *
- * @param layout  the index layout
- * @param slot    the save slot
- *
- * @return  UDS_SUCCESS or an error code
- **/
-struct buffer *__must_check
-get_uds_index_state_buffer(struct index_layout *layout, unsigned int slot);
-
-/**
- * Update and write out an index layout and configuration with a block offset
- *
- * @param layout      The index_layout to be reconfigured
- * @param config      The configuration to be written with the layout
- * @param lvm_offset  The adjustment for lvm space, in bytes
- * @param offset      The offset in bytes to move the index
- *
- * @return UDS_SUCCESS or a error code
- */
-int update_uds_layout(struct index_layout *layout,
-                      struct uds_configuration *config,
-                      off_t lvm_offset,
-                      off_t offset);
-
-#endif // INDEX_LAYOUT_H
diff --git a/uds/indexLayoutLinuxKernel.c b/uds/indexLayoutLinuxKernel.c
deleted file mode 100644
index c33442c4..00000000
--- a/uds/indexLayoutLinuxKernel.c
+++ /dev/null
@@ -1,73 +0,0 @@
-/*
- * Copyright Red Hat
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version 2
- * of the License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
- * 02110-1301, USA.
- * - * $Id: //eng/uds-releases/krusty/kernelLinux/uds/indexLayoutLinuxKernel.c#16 $ - */ - -#include "indexLayout.h" -#include "indexLayoutParser.h" -#include "memoryAlloc.h" - -/**********************************************************************/ -int make_uds_index_layout(const char *name, - bool new_layout, - const struct uds_configuration *config, - struct index_layout **layout_ptr) -{ - char *dev = NULL; - uint64_t offset = 0; - uint64_t size = 0; - char *params = NULL; - struct io_factory *factory = NULL; - struct index_layout *layout; - int result; - - struct layout_parameter parameter_table[] = { - { "dev", LP_STRING | LP_DEFAULT, { .str = &dev } }, - { "offset", LP_UINT64, { .num = &offset } }, - { "size", LP_UINT64, { .num = &size } }, - LP_NULL_PARAMETER, - }; - - result = uds_duplicate_string(name, "make_uds_index_layout parameters", - ¶ms); - if (result != UDS_SUCCESS) { - return result; - } - - // note dev will be set to memory owned by params - result = parse_layout_string(params, parameter_table); - if (result != UDS_SUCCESS) { - UDS_FREE(params); - return result; - } - - result = make_uds_io_factory(dev, &factory); - UDS_FREE(params); - if (result != UDS_SUCCESS) { - return result; - } - result = make_uds_index_layout_from_factory( - factory, offset, size, new_layout, config, &layout); - put_uds_io_factory(factory); - if (result != UDS_SUCCESS) { - return result; - } - *layout_ptr = layout; - return UDS_SUCCESS; -} diff --git a/uds/indexLayoutParser.c b/uds/indexLayoutParser.c deleted file mode 100644 index f6d0df58..00000000 --- a/uds/indexLayoutParser.c +++ /dev/null @@ -1,103 +0,0 @@ -/* - * Copyright Red Hat - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. 
- * - * $Id: //eng/uds-releases/krusty/src/uds/indexLayoutParser.c#14 $ - */ - -#include "indexLayoutParser.h" - -#include "errors.h" -#include "logger.h" -#include "permassert.h" -#include "stringUtils.h" -#include "typeDefs.h" -#include "uds.h" - -/**********************************************************************/ -static int __must_check set_parameter_value(struct layout_parameter *lp, - char *data) -{ - if ((lp->type & LP_TYPE_MASK) == LP_UINT64) { - int result = uds_parse_uint64(data, lp->value.num); - if (result != UDS_SUCCESS) { - uds_log_error("bad numeric value %s", data); - return -EINVAL; - } - } else if ((lp->type & LP_TYPE_MASK) == LP_STRING) { - *lp->value.str = data; - } else { - uds_log_error("unknown layout parameter type code %x", - (lp->type & LP_TYPE_MASK)); - return -EINVAL; - } - return UDS_SUCCESS; -} - -/**********************************************************************/ -int parse_layout_string(char *info, struct layout_parameter *params) -{ - if (!strchr(info, '=')) { - struct layout_parameter *lp; - for (lp = params; lp->type != LP_NULL; ++lp) { - if (lp->type & LP_DEFAULT) { - int result = set_parameter_value(lp, info); - if (result != UDS_SUCCESS) { - return result; - } - break; - } - } - } else { - char *data = NULL; - char *token; - for (token = uds_next_token(info, " ", &data); token; - token = uds_next_token(NULL, " ", &data)) { - int result; - char *equal = strchr(token, '='); - struct layout_parameter *lp; - for (lp = params; lp->type != LP_NULL; ++lp) { - if (!equal && (lp->type & LP_DEFAULT)) { - break; - } else if (strncmp(token, - lp->name, - equal - token) == 0 && - strlen(lp->name) == - (size_t)(equal - token)) { - break; - } - } - if (lp->type == LP_NULL) { - uds_log_error("unknown index parameter %s", - token); - return -EINVAL; - } - if (lp->seen) { - uds_log_error("duplicate index parameter %s", - token); - return -EINVAL; - } - lp->seen = true; - result = set_parameter_value( - lp, equal ? equal + 1 : token); - if (result != UDS_SUCCESS) { - return result; - } - } - } - return UDS_SUCCESS; -} diff --git a/uds/indexLayoutParser.h b/uds/indexLayoutParser.h deleted file mode 100644 index 53a1d823..00000000 --- a/uds/indexLayoutParser.h +++ /dev/null @@ -1,75 +0,0 @@ -/* - * Copyright Red Hat - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. 
- * - * $Id: //eng/uds-releases/krusty/src/uds/indexLayoutParser.h#9 $ - */ - -#ifndef INDEX_LAYOUT_PARSER_H -#define INDEX_LAYOUT_PARSER_H - -#include "compiler.h" -#include "typeDefs.h" - -enum lp_type { - LP_NULL = 0x000, - LP_STRING = 0x001, - LP_UINT64 = 0x002, - LP_TYPE_MASK = 0x0FF, - LP_DEFAULT = 0x100, -}; - -#define LP_NULL_PARAMETER { NULL, LP_NULL, { .num = NULL }, 0 } - -struct layout_parameter { - const char *name; - enum lp_type type; - union { - char **str; - uint64_t *num; - } value; - bool seen; -}; - -/** - * Function to parse an index layout specification. - * - * This parser treats the specification as a set of name=value - * parameters or, in the absence of an '=' character, a single value - * for a default parameter. The list of acceptable parameters is - * specified as an array of struct layout_parameter entries. Each such - * parameter contains the address of the variable in which the value - * is to be stored. - * - * @param info A copy of the index layout specification that - * will be altered by the parser to insert null - * characters after each value. Note that string - * parameter values will point into the memory of - * this string, so this specification cannot be - * deallocated until all uses of the parameter - * values are over. - * @param params The table of parameters the caller expects to - * find in the ``info'' string. Currently this - * parser can handle string and uint64_t values. - * Must be terminated by a LP_NULL_PARAMETER. - * - * @return UDS_SUCCESS or -EINVAL for parsing errors - **/ -int __must_check -parse_layout_string(char *info, struct layout_parameter *params); - -#endif // INDEX_LAYOUT_PARSER_H diff --git a/uds/indexPageMap.c b/uds/indexPageMap.c deleted file mode 100644 index 6165662d..00000000 --- a/uds/indexPageMap.c +++ /dev/null @@ -1,391 +0,0 @@ -/* - * Copyright Red Hat - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. 
- * - * $Id: //eng/uds-releases/krusty/src/uds/indexPageMap.c#32 $ - */ - -#include "indexPageMap.h" - -#include "buffer.h" -#include "bufferedWriter.h" -#include "compiler.h" -#include "errors.h" -#include "hashUtils.h" -#include "indexComponent.h" -#include "logger.h" -#include "memoryAlloc.h" -#include "permassert.h" -#include "stringUtils.h" -#include "uds-threads.h" -#include "uds.h" - -static int read_index_page_map(struct read_portal *portal); -static int write_index_page_map(struct index_component *component, - struct buffered_writer *writer, - unsigned int zone); - -static const byte INDEX_PAGE_MAP_MAGIC[] = "ALBIPM02"; -enum { - INDEX_PAGE_MAP_MAGIC_LENGTH = sizeof(INDEX_PAGE_MAP_MAGIC) - 1, -}; - -const struct index_component_info INDEX_PAGE_MAP_INFO = { - .kind = RL_KIND_INDEX_PAGE_MAP, - .name = "index page map", - .save_only = false, - .chapter_sync = true, - .multi_zone = false, - .io_storage = true, - .loader = read_index_page_map, - .saver = write_index_page_map, - .incremental = NULL, -}; - -/**********************************************************************/ -static INLINE size_t num_entries(const struct geometry *geometry) -{ - return geometry->chapters_per_volume * - (geometry->index_pages_per_chapter - 1); -} - -/**********************************************************************/ -int make_index_page_map(const struct geometry *geometry, - struct index_page_map **map_ptr) -{ - struct index_page_map *map; - unsigned int delta_lists_per_chapter = - geometry->delta_lists_per_chapter; - int result = ASSERT_WITH_ERROR_CODE(((delta_lists_per_chapter - 1) <= - UINT16_MAX), - UDS_BAD_STATE, - "delta lists per chapter (%u) is too large", - delta_lists_per_chapter); - if (result != UDS_SUCCESS) { - return result; - } - - result = UDS_ALLOCATE(1, struct index_page_map, "Index Page Map", &map); - if (result != UDS_SUCCESS) { - return result; - } - - map->geometry = geometry; - - result = UDS_ALLOCATE(num_entries(geometry), - index_page_map_entry_t, - "Index Page Map Entries", - &map->entries); - if (result != UDS_SUCCESS) { - free_index_page_map(map); - return result; - } - - *map_ptr = map; - return UDS_SUCCESS; -} - -/**********************************************************************/ -void free_index_page_map(struct index_page_map *map) -{ - if (map != NULL) { - UDS_FREE(map->entries); - UDS_FREE(map); - } -} - -/**********************************************************************/ -uint64_t get_last_update(const struct index_page_map *map) -{ - return map->last_update; -} - -/**********************************************************************/ -int update_index_page_map(struct index_page_map *map, - uint64_t virtual_chapter_number, - unsigned int chapter_number, - unsigned int index_page_number, - unsigned int delta_list_number) -{ - size_t slot; - const struct geometry *geometry = map->geometry; - if ((virtual_chapter_number < map->last_update) || - (virtual_chapter_number > map->last_update + 1)) { - // if the last_update is 0, this is likely to be normal because - // we are replaying the volume - if (map->last_update != 0) { - uds_log_warning("unexpected index page map update, jumping from %llu to %llu", - (unsigned long long) map->last_update, - (unsigned long long) virtual_chapter_number); - } - } - map->last_update = virtual_chapter_number; - - if (chapter_number >= geometry->chapters_per_volume) { - return uds_log_error_strerror(UDS_INVALID_ARGUMENT, - "chapter number %u exceeds maximum %u", - chapter_number, - geometry->chapters_per_volume - 1); - } - 
if (index_page_number >= geometry->index_pages_per_chapter) { - return uds_log_error_strerror(UDS_INVALID_ARGUMENT, - "index page number %u exceeds maximum %u", - index_page_number, - geometry->index_pages_per_chapter - 1); - } - if (delta_list_number >= geometry->delta_lists_per_chapter) { - return uds_log_error_strerror(UDS_INVALID_ARGUMENT, - "delta list number %u exceeds maximum %u", - delta_list_number, - geometry->delta_lists_per_chapter - 1); - } - - if (index_page_number == (geometry->index_pages_per_chapter - 1)) { - /* - * There is no entry for the last index page of a chapter since - * its entry would always be - * geometry->delta_lists_per_chapter - 1. - */ - return UDS_SUCCESS; - } - - slot = (chapter_number * (geometry->index_pages_per_chapter - 1)) + - index_page_number; - map->entries[slot] = (index_page_map_entry_t) delta_list_number; - return UDS_SUCCESS; -} - -/**********************************************************************/ -int find_index_page_number(const struct index_page_map *map, - const struct uds_chunk_name *name, - unsigned int chapter_number, - unsigned int *index_page_number_ptr) -{ - int result; - unsigned int delta_list_number, slot, limit, index_page_number = 0; - const struct geometry *geometry = map->geometry; - if (chapter_number >= geometry->chapters_per_volume) { - return uds_log_error_strerror(UDS_INVALID_ARGUMENT, - "chapter number %u exceeds maximum %u", - chapter_number, - geometry->chapters_per_volume - 1); - } - - delta_list_number = hash_to_chapter_delta_list(name, geometry); - slot = (chapter_number * (geometry->index_pages_per_chapter - 1)); - limit = slot + (geometry->index_pages_per_chapter - 1); - for (; slot < limit; index_page_number++, slot++) { - if (delta_list_number <= map->entries[slot]) { - break; - } - } - - // This should be a clear post-condition of the loop above, but just in - // case it's not obvious, the check is cheap. - result = - ASSERT((index_page_number < geometry->index_pages_per_chapter), - "index page number too large"); - if (result != UDS_SUCCESS) { - return result; - } - - *index_page_number_ptr = index_page_number; - return UDS_SUCCESS; -} - -/**********************************************************************/ -int get_list_number_bounds(const struct index_page_map *map, - unsigned int chapter_number, - unsigned int index_page_number, - struct index_page_bounds *bounds) -{ - unsigned int slot; - const struct geometry *geometry = map->geometry; - int result = ASSERT((chapter_number < geometry->chapters_per_volume), - "chapter number is valid"); - if (result != UDS_SUCCESS) { - return result; - } - - result = ASSERT((index_page_number < geometry->index_pages_per_chapter), - "index page number is valid"); - if (result != UDS_SUCCESS) { - return result; - } - - slot = chapter_number * (geometry->index_pages_per_chapter - 1); - bounds->lowest_list = - ((index_page_number == 0) ? - 0 : - map->entries[slot + index_page_number - 1] + 1); - bounds->highest_list = - ((index_page_number == geometry->index_pages_per_chapter - 1) ? 
- geometry->delta_lists_per_chapter - 1 : - map->entries[slot + index_page_number]); - - return UDS_SUCCESS; -} - -/**********************************************************************/ -size_t index_page_map_size(const struct geometry *geometry) -{ - return sizeof(index_page_map_entry_t) * num_entries(geometry); -} - -/**********************************************************************/ -static int write_index_page_map(struct index_component *component, - struct buffered_writer *writer, - unsigned int zone) -{ - struct index_page_map *map; - struct buffer *buffer; - - int result = ASSERT((zone == 0), "unimplemented zone %d", zone); - if (result != UDS_SUCCESS) { - return result; - } - - map = index_component_data(component); - - result = make_buffer(INDEX_PAGE_MAP_MAGIC_LENGTH + - sizeof(map->last_update), - &buffer); - if (result != UDS_SUCCESS) { - return result; - } - - result = put_bytes(buffer, INDEX_PAGE_MAP_MAGIC_LENGTH, - INDEX_PAGE_MAP_MAGIC); - if (result != UDS_SUCCESS) { - free_buffer(UDS_FORGET(buffer)); - return result; - } - - result = put_uint64_le_into_buffer(buffer, map->last_update); - if (result != UDS_SUCCESS) { - free_buffer(UDS_FORGET(buffer)); - return result; - } - - result = write_to_buffered_writer(writer, get_buffer_contents(buffer), - content_length(buffer)); - free_buffer(UDS_FORGET(buffer)); - if (result != UDS_SUCCESS) { - return uds_log_error_strerror(result, - "cannot write index page map header"); - } - - result = make_buffer(index_page_map_size(map->geometry), &buffer); - if (result != UDS_SUCCESS) { - return result; - } - - result = put_uint16_les_into_buffer(buffer, num_entries(map->geometry), - map->entries); - if (result != UDS_SUCCESS) { - free_buffer(UDS_FORGET(buffer)); - return result; - } - - result = write_to_buffered_writer(writer, get_buffer_contents(buffer), - content_length(buffer)); - free_buffer(UDS_FORGET(buffer)); - if (result != UDS_SUCCESS) { - return uds_log_error_strerror(result, - "cannot write index page map data"); - } - - return UDS_SUCCESS; -} - -/**********************************************************************/ -uint64_t compute_index_page_map_save_size(const struct geometry *geometry) -{ - return index_page_map_size(geometry) + INDEX_PAGE_MAP_MAGIC_LENGTH + - sizeof(((struct index_page_map *) 0)->last_update); -} - -/**********************************************************************/ -static int __must_check decode_index_page_map(struct buffer *buffer, - struct index_page_map *map) -{ - int result = get_uint64_le_from_buffer(buffer, &map->last_update); - if (result != UDS_SUCCESS) { - return result; - } - result = get_uint16_les_from_buffer(buffer, num_entries(map->geometry), - map->entries); - if (result != UDS_SUCCESS) { - return result; - } - result = ASSERT_LOG_ONLY(content_length(buffer) == 0, - "%zu bytes decoded of %zu expected", - buffer_length(buffer) - - content_length(buffer), - buffer_length(buffer)); - return result; -} - -/**********************************************************************/ -static int read_index_page_map(struct read_portal *portal) -{ - struct index_page_map *map = index_component_data(portal->component); - struct buffer *buffer; - struct buffered_reader *reader = NULL; - - int result = get_buffered_reader_for_portal(portal, 0, &reader); - if (result != UDS_SUCCESS) { - return result; - } - - result = verify_buffered_data(reader, INDEX_PAGE_MAP_MAGIC, - INDEX_PAGE_MAP_MAGIC_LENGTH); - if (result != UDS_SUCCESS) { - return uds_log_error_strerror(result, - "bad index page map 
saved magic"); - } - - result = make_buffer(sizeof(map->last_update) + - index_page_map_size(map->geometry), - &buffer); - if (result != UDS_SUCCESS) { - return result; - } - result = read_from_buffered_reader(reader, get_buffer_contents(buffer), - buffer_length(buffer)); - if (result != UDS_SUCCESS) { - free_buffer(UDS_FORGET(buffer)); - uds_log_error_strerror(result, - "cannot read index page map data"); - return result; - } - - result = reset_buffer_end(buffer, buffer_length(buffer)); - if (result != UDS_SUCCESS) { - free_buffer(UDS_FORGET(buffer)); - return result; - } - - result = decode_index_page_map(buffer, map); - free_buffer(UDS_FORGET(buffer)); - if (result != UDS_SUCCESS) { - return result; - } - uds_log_debug("read index page map, last update %llu", - (unsigned long long) map->last_update); - return UDS_SUCCESS; -} diff --git a/uds/indexPageMap.h b/uds/indexPageMap.h deleted file mode 100644 index c2d68860..00000000 --- a/uds/indexPageMap.h +++ /dev/null @@ -1,156 +0,0 @@ -/* - * Copyright Red Hat - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/uds-releases/krusty/src/uds/indexPageMap.h#10 $ - */ - -#ifndef INDEX_PAGE_MAP_H -#define INDEX_PAGE_MAP_H 1 - -#include "common.h" -#include "geometry.h" -#include "indexComponent.h" - -extern const struct index_component_info INDEX_PAGE_MAP_INFO; - -struct index_page_bounds { - unsigned int lowest_list; - unsigned int highest_list; -}; - -/* - * Notes on struct index_page_map - * - * Each volume maintains an index page map which records how the chapter delta - * lists are distributed among the index pages for that chapter. - * - * The map is conceptually a two-dimensional array indexed by chapter number - * and index page number within the chapter. Each entry contains the number - * of the last delta list on that index page. In order to save memory, the - * information for the last page in each chapter is not recorded, as it is - * known from the geometry. - */ - -typedef uint16_t index_page_map_entry_t; - -struct index_page_map { - const struct geometry *geometry; - uint64_t last_update; - index_page_map_entry_t *entries; -}; - -/** - * Create an index page map. - * - * @param geometry The geometry governing the index. - * @param map_ptr A pointer to hold the new map. - * - * @return A success or error code. - **/ -int __must_check make_index_page_map(const struct geometry *geometry, - struct index_page_map **map_ptr); - -/** - * Free an index page map. - * - * @param map The index page map to destroy. - **/ -void free_index_page_map(struct index_page_map *map); - -/** - * Get the virtual chapter number of the last update to the index page map. 
- * - * @param map The index page map - * - * @return the virtual chapter number of the last chapter updated - **/ -uint64_t get_last_update(const struct index_page_map *map); - -/** - * Update an index page map entry. - * - * @param map The map to update - * @param virtual_chapter_number The virtual chapter number being updated. - * @param chapter_number The chapter of the entry to update - * @param index_page_number The index page of the entry to update - * @param delta_list_number The value of the new entry - * - * @return UDS_SUCCESS or an error code - **/ -int __must_check update_index_page_map(struct index_page_map *map, - uint64_t virtual_chapter_number, - unsigned int chapter_number, - unsigned int index_page_number, - unsigned int delta_list_number); - -/** - * Find the page number of the index page in a chapter that will contain the - * chapter index entry for a given chunk name, if it exists. - * - * @param [in] map The map to search - * @param [in] name The chunk name - * @param [in] chapter_number The chapter containing the index page - * @param [out] index_page_number_ptr A pointer to hold the result, guaranteed - * to be a valid index page number on - * UDS_SUCCESS - * - * @return UDS_SUCCESS, or UDS_INVALID_ARGUMENT if the chapter number - * is out of range - **/ -int __must_check find_index_page_number(const struct index_page_map *map, - const struct uds_chunk_name *name, - unsigned int chapter_number, - unsigned int *index_page_number_ptr); - -/** - * Get the lowest and highest numbered delta lists for the given immutable - * chapter index page from the index page map. - * - * @param map The index page map - * @param chapter_number The chapter containing the delta list - * @param index_page_number The index page number within the chapter - * @param bounds A structure to hold the list number bounds - * for the given page - * - * @return UDS_SUCCESS or an error code - **/ -int __must_check get_list_number_bounds(const struct index_page_map *map, - unsigned int chapter_number, - unsigned int index_page_number, - struct index_page_bounds *bounds); - -/** - * Compute the size of the index page map save image, including all headers. - * - * @param geometry The index geometry. - * - * @return The number of bytes required to save the index page map. - **/ -uint64_t compute_index_page_map_save_size(const struct geometry *geometry); - -/** - * Escaped for testing.... - * - * @param geometry The index geometry. - * - * @return The number of bytes required for the page map data, - * exclusive of headers. - **/ -size_t __must_check index_page_map_size(const struct geometry *geometry); - -#endif // INDEX_PAGE_MAP_H diff --git a/uds/indexSession.c b/uds/indexSession.c deleted file mode 100644 index 4ca137b0..00000000 --- a/uds/indexSession.c +++ /dev/null @@ -1,609 +0,0 @@ -/* - * Copyright Red Hat - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. 
- * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/uds-releases/krusty/src/uds/indexSession.c#57 $ - */ - -#include "indexSession.h" - -#include "indexCheckpoint.h" -#include "logger.h" -#include "memoryAlloc.h" -#include "requestQueue.h" -#include "timeUtils.h" - -/**********************************************************************/ -static void collect_stats(const struct uds_index_session *index_session, - struct uds_index_stats *stats) -{ - const struct session_stats *session_stats = &index_session->stats; - - stats->current_time = - ktime_to_seconds(current_time_ns(CLOCK_REALTIME)); - stats->posts_found = READ_ONCE(session_stats->posts_found); - stats->in_memory_posts_found = - READ_ONCE(session_stats->posts_found_open_chapter); - stats->dense_posts_found = READ_ONCE(session_stats->posts_found_dense); - stats->sparse_posts_found = - READ_ONCE(session_stats->posts_found_sparse); - stats->posts_not_found = READ_ONCE(session_stats->posts_not_found); - stats->updates_found = READ_ONCE(session_stats->updates_found); - stats->updates_not_found = READ_ONCE(session_stats->updates_not_found); - stats->deletions_found = READ_ONCE(session_stats->deletions_found); - stats->deletions_not_found = - READ_ONCE(session_stats->deletions_not_found); - stats->queries_found = READ_ONCE(session_stats->queries_found); - stats->queries_not_found = READ_ONCE(session_stats->queries_not_found); - stats->requests = READ_ONCE(session_stats->requests); -} - -/**********************************************************************/ -static void handle_callbacks(struct uds_request *request) -{ - if (request->status == UDS_SUCCESS) { - // Measure the turnaround time of this request and include that - // time, along with the rest of the request, in the context's - // stat counters. - update_request_context_stats(request); - } - - if (request->callback != NULL) { - // The request has specified its own callback and does not - // expect to be freed. - struct uds_index_session *index_session = request->session; - request->found = - (request->location != UDS_LOCATION_UNAVAILABLE); - request->callback((struct uds_request *) request); - // We do this release after the callback because of the - // contract of the uds_flush_index_session method. 
- release_index_session(index_session); - } -} - -/**********************************************************************/ -int check_index_session(struct uds_index_session *index_session) -{ - unsigned int state; - uds_lock_mutex(&index_session->request_mutex); - state = index_session->state; - uds_unlock_mutex(&index_session->request_mutex); - - if (state == IS_FLAG_LOADED) { - return UDS_SUCCESS; - } else if (state & IS_FLAG_DISABLED) { - return UDS_DISABLED; - } else if ((state & IS_FLAG_LOADING) || (state & IS_FLAG_SUSPENDED) || - (state & IS_FLAG_WAITING)) { - return -EBUSY; - } - - return UDS_NO_INDEX; -} - -/**********************************************************************/ -int get_index_session(struct uds_index_session *index_session) -{ - int result; - uds_lock_mutex(&index_session->request_mutex); - index_session->request_count++; - uds_unlock_mutex(&index_session->request_mutex); - - result = check_index_session(index_session); - if (result != UDS_SUCCESS) { - release_index_session(index_session); - return result; - } - return UDS_SUCCESS; -} - -/**********************************************************************/ -void release_index_session(struct uds_index_session *index_session) -{ - uds_lock_mutex(&index_session->request_mutex); - if (--index_session->request_count == 0) { - uds_broadcast_cond(&index_session->request_cond); - } - uds_unlock_mutex(&index_session->request_mutex); -} - -/**********************************************************************/ -int start_loading_index_session(struct uds_index_session *index_session) -{ - int result; - uds_lock_mutex(&index_session->request_mutex); - if (index_session->state & IS_FLAG_SUSPENDED) { - uds_log_info("Index session is suspended"); - result = -EBUSY; - } else if (index_session->state != 0) { - uds_log_info("Index is already loaded"); - result = -EBUSY; - } else { - index_session->state |= IS_FLAG_LOADING; - result = UDS_SUCCESS; - } - uds_unlock_mutex(&index_session->request_mutex); - return result; -} - -/**********************************************************************/ -void finish_loading_index_session(struct uds_index_session *index_session, - int result) -{ - uds_lock_mutex(&index_session->request_mutex); - index_session->state &= ~IS_FLAG_LOADING; - if (result == UDS_SUCCESS) { - index_session->state |= IS_FLAG_LOADED; - } - uds_broadcast_cond(&index_session->request_cond); - uds_unlock_mutex(&index_session->request_mutex); -} - -/**********************************************************************/ -void disable_index_session(struct uds_index_session *index_session) -{ - uds_lock_mutex(&index_session->request_mutex); - index_session->state |= IS_FLAG_DISABLED; - uds_unlock_mutex(&index_session->request_mutex); -} - -/**********************************************************************/ -int make_empty_index_session(struct uds_index_session **index_session_ptr) -{ - struct uds_index_session *session; - int result = UDS_ALLOCATE(1, struct uds_index_session, __func__, &session); - if (result != UDS_SUCCESS) { - return result; - } - - result = uds_init_mutex(&session->request_mutex); - if (result != UDS_SUCCESS) { - UDS_FREE(session); - return result; - } - - result = uds_init_cond(&session->request_cond); - if (result != UDS_SUCCESS) { - uds_destroy_mutex(&session->request_mutex); - UDS_FREE(session); - return result; - } - - result = uds_init_mutex(&session->load_context.mutex); - if (result != UDS_SUCCESS) { - uds_destroy_cond(&session->request_cond); - 
uds_destroy_mutex(&session->request_mutex); - UDS_FREE(session); - return result; - } - - result = uds_init_cond(&session->load_context.cond); - if (result != UDS_SUCCESS) { - uds_destroy_mutex(&session->load_context.mutex); - uds_destroy_cond(&session->request_cond); - uds_destroy_mutex(&session->request_mutex); - UDS_FREE(session); - return result; - } - - result = make_uds_request_queue("callbackW", &handle_callbacks, - &session->callback_queue); - if (result != UDS_SUCCESS) { - uds_destroy_cond(&session->load_context.cond); - uds_destroy_mutex(&session->load_context.mutex); - uds_destroy_cond(&session->request_cond); - uds_destroy_mutex(&session->request_mutex); - UDS_FREE(session); - return result; - } - - *index_session_ptr = session; - return UDS_SUCCESS; -} - -/**********************************************************************/ -static void -wait_for_no_requests_in_progress(struct uds_index_session *index_session) -{ - uds_lock_mutex(&index_session->request_mutex); - while (index_session->request_count > 0) { - uds_wait_cond(&index_session->request_cond, - &index_session->request_mutex); - } - uds_unlock_mutex(&index_session->request_mutex); -} - -/**********************************************************************/ -static int __must_check uds_save_index(struct uds_index_session *index_session) -{ - wait_for_no_requests_in_progress(index_session); - // save_index waits for open chapter writes to complete - return save_index(index_session->index); -} - -/**********************************************************************/ -int uds_suspend_index_session(struct uds_index_session *session, bool save) -{ - int result; - bool flush_index = false; - bool save_index = false; - bool suspend_index = false; - uds_lock_mutex(&session->request_mutex); - // Wait for any pending close operation to complete. 
- while (session->state & IS_FLAG_CLOSING) { - uds_wait_cond(&session->request_cond, &session->request_mutex); - } - if ((session->state & IS_FLAG_WAITING) || - (session->state & IS_FLAG_DESTROYING)) { - uds_log_info("Index session is already changing state"); - result = -EBUSY; - } else if (session->state & IS_FLAG_SUSPENDED) { - result = UDS_SUCCESS; - } else if (session->state & IS_FLAG_LOADING) { - session->state |= IS_FLAG_WAITING; - suspend_index = true; - result = UDS_SUCCESS; - } else if (!(session->state & IS_FLAG_LOADED)) { - if (session->index != NULL) { - flush_index = true; - session->state |= IS_FLAG_WAITING; - } else { - session->state |= IS_FLAG_SUSPENDED; - uds_broadcast_cond(&session->request_cond); - } - result = UDS_SUCCESS; - } else { - save_index = save; - if (save_index) { - session->state |= IS_FLAG_WAITING; - } else if (session->index != NULL) { - flush_index = true; - session->state |= IS_FLAG_WAITING; - } else { - session->state |= IS_FLAG_SUSPENDED; - uds_broadcast_cond(&session->request_cond); - } - result = UDS_SUCCESS; - } - uds_unlock_mutex(&session->request_mutex); - - if (!save_index && !suspend_index && !flush_index) { - return uds_map_to_system_error(result); - } - - if (flush_index) { - result = uds_flush_index_session(session); - uds_lock_mutex(&session->request_mutex); - session->state &= ~IS_FLAG_WAITING; - session->state |= IS_FLAG_SUSPENDED; - uds_broadcast_cond(&session->request_cond); - uds_unlock_mutex(&session->request_mutex); - return uds_map_to_system_error(result); - } - - if (save_index) { - result = uds_save_index(session); - uds_lock_mutex(&session->request_mutex); - session->state &= ~IS_FLAG_WAITING; - session->state |= IS_FLAG_SUSPENDED; - uds_broadcast_cond(&session->request_cond); - uds_unlock_mutex(&session->request_mutex); - return uds_map_to_system_error(result); - } - - uds_lock_mutex(&session->load_context.mutex); - switch (session->load_context.status) { - case INDEX_OPENING: - session->load_context.status = INDEX_SUSPENDING; - - // Wait until the index indicates that it is not replaying. - while ((session->load_context.status != INDEX_SUSPENDED) && - (session->load_context.status != INDEX_READY)) { - uds_wait_cond(&session->load_context.cond, - &session->load_context.mutex); - } - break; - - case INDEX_READY: - // Index load does not need to be suspended. - break; - - case INDEX_SUSPENDED: - case INDEX_SUSPENDING: - case INDEX_FREEING: - default: - // These cases should not happen. 
- ASSERT_LOG_ONLY(false, - "Bad load context state %u", - session->load_context.status); - break; - } - uds_unlock_mutex(&session->load_context.mutex); - - uds_lock_mutex(&session->request_mutex); - session->state &= ~IS_FLAG_WAITING; - session->state |= IS_FLAG_SUSPENDED; - uds_broadcast_cond(&session->request_cond); - uds_unlock_mutex(&session->request_mutex); - return UDS_SUCCESS; -} - -/**********************************************************************/ -int uds_resume_index_session(struct uds_index_session *session, - const char *name) -{ - int result = UDS_SUCCESS; - bool no_work = false; - bool resume_replay = false; - - uds_lock_mutex(&session->request_mutex); - if (session->state & IS_FLAG_WAITING) { - uds_log_info("Index session is already changing state"); - no_work = true; - result = -EBUSY; - } else if (!(session->state & IS_FLAG_SUSPENDED)) { - /* If not suspended, just succeed */ - no_work = true; - result = UDS_SUCCESS; - } else { - session->state |= IS_FLAG_WAITING; - if (session->state & IS_FLAG_LOADING) { - resume_replay = true; - } - } - uds_unlock_mutex(&session->request_mutex); - - if (no_work) { - return result; - } - - if ((name != NULL) && (session->index != NULL)) { - result = replace_index_storage(session->index, name); - if (result != UDS_SUCCESS) { - uds_lock_mutex(&session->request_mutex); - session->state &= ~IS_FLAG_WAITING; - uds_broadcast_cond(&session->request_cond); - uds_unlock_mutex(&session->request_mutex); - return uds_map_to_system_error(result); - } - } - - if (resume_replay) { - uds_lock_mutex(&session->load_context.mutex); - switch (session->load_context.status) { - case INDEX_SUSPENDED: - session->load_context.status = INDEX_OPENING; - // Notify the index to start replaying again. - uds_broadcast_cond(&session->load_context.cond); - break; - - case INDEX_READY: - // There is no index rebuild to resume. - break; - - case INDEX_OPENING: - case INDEX_SUSPENDING: - case INDEX_FREEING: - default: - // These cases should not happen; do nothing. - ASSERT_LOG_ONLY(false, - "Bad load context state %u", - session->load_context.status); - break; - } - uds_unlock_mutex(&session->load_context.mutex); - } - - uds_lock_mutex(&session->request_mutex); - session->state &= ~IS_FLAG_WAITING; - session->state &= ~IS_FLAG_SUSPENDED; - uds_broadcast_cond(&session->request_cond); - uds_unlock_mutex(&session->request_mutex); - return UDS_SUCCESS; -} - -/**********************************************************************/ -int save_and_free_index(struct uds_index_session *index_session) -{ - int result = UDS_SUCCESS; - bool suspended; - struct uds_index *index = index_session->index; - if (index != NULL) { - uds_lock_mutex(&index_session->request_mutex); - suspended = (index_session->state & IS_FLAG_SUSPENDED); - uds_unlock_mutex(&index_session->request_mutex); - if (!suspended) { - result = save_index(index); - if (result != UDS_SUCCESS) { - uds_log_warning_strerror(result, - "ignoring error from save_index"); - } - } - free_index(index); - index_session->index = NULL; - - // Reset all index state that happens to be in the index - // session, so it doesn't affect any future index. - uds_lock_mutex(&index_session->load_context.mutex); - index_session->load_context.status = INDEX_OPENING; - uds_unlock_mutex(&index_session->load_context.mutex); - - uds_lock_mutex(&index_session->request_mutex); - // Only the suspend bit will remain relevant. 
- index_session->state &= IS_FLAG_SUSPENDED; - uds_unlock_mutex(&index_session->request_mutex); - } - - uds_log_debug("Closed index"); - return result; -} - -/**********************************************************************/ -int uds_close_index(struct uds_index_session *index_session) -{ - int result = UDS_SUCCESS; - uds_lock_mutex(&index_session->request_mutex); - - // Wait for any pending suspend, resume or close operations to - // complete. - while ((index_session->state & IS_FLAG_WAITING) || - (index_session->state & IS_FLAG_CLOSING)) { - uds_wait_cond(&index_session->request_cond, - &index_session->request_mutex); - } - - if (index_session->state & IS_FLAG_SUSPENDED) { - uds_log_info("Index session is suspended"); - result = -EBUSY; - } else if ((index_session->state & IS_FLAG_DESTROYING) || - !(index_session->state & IS_FLAG_LOADED)) { - // The index doesn't exist, hasn't finished loading, or is - // being destroyed. - result = UDS_NO_INDEX; - } else { - index_session->state |= IS_FLAG_CLOSING; - } - uds_unlock_mutex(&index_session->request_mutex); - if (result != UDS_SUCCESS) { - return uds_map_to_system_error(result); - } - - uds_log_debug("Closing index"); - wait_for_no_requests_in_progress(index_session); - result = save_and_free_index(index_session); - - uds_lock_mutex(&index_session->request_mutex); - index_session->state &= ~IS_FLAG_CLOSING; - uds_broadcast_cond(&index_session->request_cond); - uds_unlock_mutex(&index_session->request_mutex); - return uds_map_to_system_error(result); -} - -/**********************************************************************/ -int uds_destroy_index_session(struct uds_index_session *index_session) -{ - int result; - bool load_pending = false; - uds_log_debug("Destroying index session"); - - uds_lock_mutex(&index_session->request_mutex); - - // Wait for any pending suspend, resume, or close operations to - // complete. - while ((index_session->state & IS_FLAG_WAITING) || - (index_session->state & IS_FLAG_CLOSING)) { - uds_wait_cond(&index_session->request_cond, - &index_session->request_mutex); - } - - if (index_session->state & IS_FLAG_DESTROYING) { - uds_unlock_mutex(&index_session->request_mutex); - uds_log_info("Index session is already closing"); - return -EBUSY; - } - - index_session->state |= IS_FLAG_DESTROYING; - load_pending = ((index_session->state & IS_FLAG_LOADING) && - (index_session->state & IS_FLAG_SUSPENDED)); - uds_unlock_mutex(&index_session->request_mutex); - - if (load_pending) { - // Tell the index to terminate the rebuild. - uds_lock_mutex(&index_session->load_context.mutex); - if (index_session->load_context.status == INDEX_SUSPENDED) { - index_session->load_context.status = INDEX_FREEING; - uds_broadcast_cond(&index_session->load_context.cond); - } - uds_unlock_mutex(&index_session->load_context.mutex); - - // Wait until the load exits before proceeding. 
- uds_lock_mutex(&index_session->request_mutex); - while (index_session->state & IS_FLAG_LOADING) { - uds_wait_cond(&index_session->request_cond, - &index_session->request_mutex); - } - uds_unlock_mutex(&index_session->request_mutex); - } - - wait_for_no_requests_in_progress(index_session); - result = save_and_free_index(index_session); - uds_request_queue_finish(index_session->callback_queue); - index_session->callback_queue = NULL; - uds_destroy_cond(&index_session->load_context.cond); - uds_destroy_mutex(&index_session->load_context.mutex); - uds_destroy_cond(&index_session->request_cond); - uds_destroy_mutex(&index_session->request_mutex); - uds_log_debug("Destroyed index session"); - UDS_FREE(index_session); - return uds_map_to_system_error(result); -} - -/**********************************************************************/ -int uds_flush_index_session(struct uds_index_session *index_session) -{ - wait_for_no_requests_in_progress(index_session); - // Wait until any open chapter writes are complete - wait_for_idle_index(index_session->index); - return UDS_SUCCESS; -} - -/**********************************************************************/ -int uds_set_checkpoint_frequency(struct uds_index_session *index_session, - unsigned int frequency) -{ - set_index_checkpoint_frequency(index_session->index->checkpoint, - frequency); - return UDS_SUCCESS; -} - -/**********************************************************************/ -int uds_get_index_configuration(struct uds_index_session *index_session, - struct uds_configuration **conf) -{ - int result; - if (conf == NULL) { - uds_log_error("received a NULL config pointer"); - return -EINVAL; - } - result = UDS_ALLOCATE(1, struct uds_configuration, __func__, conf); - if (result == UDS_SUCCESS) { - **conf = index_session->user_config; - } - return uds_map_to_system_error(result); -} - -/**********************************************************************/ -int uds_get_index_stats(struct uds_index_session *index_session, - struct uds_index_stats *stats) -{ - if (stats == NULL) { - uds_log_error("received a NULL index stats pointer"); - return -EINVAL; - } - - collect_stats(index_session, stats); - if (index_session->index != NULL) { - get_index_stats(index_session->index, stats); - } else { - stats->entries_indexed = 0; - stats->memory_used = 0; - stats->collisions = 0; - stats->entries_discarded = 0; - } - - return UDS_SUCCESS; -} diff --git a/uds/indexSession.h b/uds/indexSession.h deleted file mode 100644 index c55fe53a..00000000 --- a/uds/indexSession.h +++ /dev/null @@ -1,227 +0,0 @@ -/* - * Copyright Red Hat - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. 
- * - * $Id: //eng/uds-releases/krusty/src/uds/indexSession.h#22 $ - */ - -#ifndef INDEX_SESSION_H -#define INDEX_SESSION_H - -#include - -#include "config.h" -#include "cpu.h" -#include "uds-threads.h" -#include "uds.h" - -/** - * The bit position of flags used to indicate index session states. - **/ -enum index_session_flag_bit { - IS_FLAG_BIT_START = 8, - /** Flag indicating that the session is loading */ - IS_FLAG_BIT_LOADING = IS_FLAG_BIT_START, - /** Flag indicating that that the session has been loaded */ - IS_FLAG_BIT_LOADED, - /** Flag indicating that the session is disabled permanently */ - IS_FLAG_BIT_DISABLED, - /** Flag indicating that the session is suspended */ - IS_FLAG_BIT_SUSPENDED, - /** Flag indicating that the session is waiting for an index state - change */ - IS_FLAG_BIT_WAITING, - /** Flag indicating that that the session is closing */ - IS_FLAG_BIT_CLOSING, - /** Flag indicating that that the session is being destroyed */ - IS_FLAG_BIT_DESTROYING, -}; - -/** - * The index session state flags. - **/ -enum index_session_flag { - IS_FLAG_LOADED = (1 << IS_FLAG_BIT_LOADED), - IS_FLAG_LOADING = (1 << IS_FLAG_BIT_LOADING), - IS_FLAG_DISABLED = (1 << IS_FLAG_BIT_DISABLED), - IS_FLAG_SUSPENDED = (1 << IS_FLAG_BIT_SUSPENDED), - IS_FLAG_WAITING = (1 << IS_FLAG_BIT_WAITING), - IS_FLAG_CLOSING = (1 << IS_FLAG_BIT_CLOSING), - IS_FLAG_DESTROYING = (1 << IS_FLAG_BIT_DESTROYING), -}; - -struct __attribute__((aligned(CACHE_LINE_BYTES))) session_stats { - uint64_t posts_found; /* Posts that found an entry */ - uint64_t posts_found_open_chapter; /* Posts found in the open - chapter */ - uint64_t posts_found_dense; /* Posts found in the dense index */ - uint64_t posts_found_sparse; /* Posts found in the sparse - index */ - uint64_t posts_not_found; /* Posts that did not find an - entry */ - uint64_t updates_found; /* Updates that found an entry */ - uint64_t updates_not_found; /* Updates that did not find an - entry */ - uint64_t deletions_found; /* Deletes that found an entry */ - uint64_t deletions_not_found; /* Deletes that did not find an - entry */ - uint64_t queries_found; /* Queries that found an entry */ - uint64_t queries_not_found; /* Queries that did not find an - entry */ - uint64_t requests; /* Total number of requests */ -}; - -/** - * States used in the index load context, reflecting the state of the index. - **/ -enum index_suspend_status { - /** The index has not been loaded or rebuilt completely */ - INDEX_OPENING = 0, - /** The index is able to handle requests */ - INDEX_READY, - /** The index has a pending request to suspend */ - INDEX_SUSPENDING, - /** The index is suspended in the midst of a rebuild */ - INDEX_SUSPENDED, - /** The index is being shut down while suspended */ - INDEX_FREEING, -}; - -/** - * The cond_var here must be notified when the status changes to - * INDEX_SUSPENDED, in order to wake up the waiting uds_suspend_index_session() - * call. It must also be notified when the status changes away from - * INDEX_SUSPENDED, to resume rebuild the index from check_for_suspend() in the - * index. - **/ -struct index_load_context { - struct mutex mutex; - struct cond_var cond; - enum index_suspend_status status; // Covered by - // index_load_context.mutex. -}; - -/** - * The request cond_var here must be notified when IS_FLAG_WAITING is cleared, - * in case uds_close_index() or uds_destroy_index_session() is waiting on that - * flag. 
It must also be notified when IS_FLAG_CLOSING is cleared, in case - * uds_suspend_index_session(), uds_close_index() or - * uds_destroy_index_session() is waiting on that flag. Finally, it must also - * be notified when IS_FLAG_LOADING is cleared, to inform - * uds_destroy_index_session() that the index session can be safely freed. - **/ -struct uds_index_session { - unsigned int state; // Covered by request_mutex. - struct uds_index *index; - struct uds_request_queue *callback_queue; - struct uds_configuration user_config; - struct index_load_context load_context; - // Asynchronous request synchronization - struct mutex request_mutex; - struct cond_var request_cond; - int request_count; - // Request statistics, all owned by the callback thread - struct session_stats stats; -}; - -/** - * Check that the index session is usable. - * - * @param index_session the session to query - * - * @return UDS_SUCCESS or an error code - **/ -int __must_check check_index_session(struct uds_index_session *index_session); - -/** - * Make sure that the index_session is allowed to load an index, and if so, set - * its state to indicate that the load has started. - * - * @param index_session the session to load with - * - * @return UDS_SUCCESS, or an error code if an index already exists. - **/ -int __must_check -start_loading_index_session(struct uds_index_session *index_session); - -/** - * Update the index_session state after attempting to load an index, to - * indicate that the load has completed, and whether or not it succeeded. - * - * @param index_session the session that was loading - * @param result the result of the load operation - **/ -void finish_loading_index_session(struct uds_index_session *index_session, - int result); - -/** - * Disable an index session due to an error. - * - * @param index_session the session to be disabled - **/ -void disable_index_session(struct uds_index_session *index_session); - -/** - * Acquire the index session for an asynchronous index request. - * - * The pointer must eventually be released with a corresponding call to - * release_index_session(). - * - * @param index_session The index session - * - * @return UDS_SUCCESS or an error code - **/ -int __must_check get_index_session(struct uds_index_session *index_session); - -/** - * Release a pointer to an index session. - * - * @param index_session The session to release - **/ -void release_index_session(struct uds_index_session *index_session); - -/** - * Construct a new, empty index session. - * - * @param index_session_ptr The pointer to receive the new session - * - * @return UDS_SUCCESS or an error code - **/ -int __must_check -make_empty_index_session(struct uds_index_session **index_session_ptr); - -/** - * Close the index by saving the underlying index. - * - * @param index_session The index session to be shut down and freed - **/ -int save_and_free_index(struct uds_index_session *index_session); - -/** - * Set the checkpoint frequency of the grid. - * - * @param session The index session to be modified. - * @param frequency New checkpoint frequency. - * - * @return Either UDS_SUCCESS or an error code. 
- * - **/ -int __must_check -uds_set_checkpoint_frequency(struct uds_index_session *session, - unsigned int frequency); - -#endif /* INDEX_SESSION_H */ diff --git a/uds/indexState.c b/uds/indexState.c deleted file mode 100644 index 6da3f46f..00000000 --- a/uds/indexState.c +++ /dev/null @@ -1,536 +0,0 @@ -/* - * Copyright Red Hat - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/uds-releases/krusty/src/uds/indexState.c#28 $ - */ - -#include "indexState.h" - -#include "errors.h" -#include "indexComponent.h" -#include "indexLayout.h" -#include "logger.h" -#include "memoryAlloc.h" - - -/**********************************************************************/ -int make_index_state(struct index_layout *layout, - unsigned int num_zones, - unsigned int max_components, - struct index_state **state_ptr) -{ - struct index_state *state = NULL; - int result; - - if (max_components == 0) { - return uds_log_error_strerror(UDS_INVALID_ARGUMENT, - "cannot make index state with max_components 0"); - } - - result = UDS_ALLOCATE_EXTENDED(struct index_state, - max_components, - struct index_component *, - "index state", - &state); - if (result != UDS_SUCCESS) { - return result; - } - - state->count = 0; - state->layout = layout; - state->length = max_components; - state->load_zones = 0; - state->load_slot = UINT_MAX; - state->save_slot = UINT_MAX; - state->saving = false; - state->zone_count = num_zones; - - *state_ptr = state; - return UDS_SUCCESS; -} - -/**********************************************************************/ -void free_index_state(struct index_state *state) -{ - unsigned int i; - - if (state == NULL) { - return; - } - - for (i = 0; i < state->count; ++i) { - free_index_component(UDS_FORGET(state->entries[i])); - } - UDS_FREE(state); -} - -/**********************************************************************/ -/** - * Add a component to the index state. - * - * @param state The index state. - * @param component The index component. - * - * @return UDS_SUCCESS or an error code. 
- **/ -static int add_component_to_index_state(struct index_state *state, - struct index_component *component) -{ - if (find_index_component(state, component->info) != NULL) { - return uds_log_error_strerror(UDS_INVALID_ARGUMENT, - "cannot add state component %s: already present", - component->info->name); - } - - if (state->count >= state->length) { - return uds_log_error_strerror(UDS_RESOURCE_LIMIT_EXCEEDED, - "cannot add state component %s, %u components already added", - component->info->name, - state->count); - } - - state->entries[state->count] = component; - ++state->count; - return UDS_SUCCESS; -} - -/**********************************************************************/ -int add_index_state_component(struct index_state *state, - const struct index_component_info *info, - void *data, - void *context) -{ - struct index_component *component = NULL; - int result = make_index_component(state, info, state->zone_count, data, - context, &component); - if (result != UDS_SUCCESS) { - return uds_log_error_strerror(result, - "cannot make region index component"); - } - - result = add_component_to_index_state(state, component); - if (result != UDS_SUCCESS) { - free_index_component(component); - return result; - } - return UDS_SUCCESS; -} - -/**********************************************************************/ -struct index_component * -find_index_component(const struct index_state *state, - const struct index_component_info *info) -{ - unsigned int i; - for (i = 0; i < state->count; ++i) { - struct index_component *component = state->entries[i]; - if (info == component->info) { - return component; - } - } - return NULL; -} - -/**********************************************************************/ -static const char *index_save_type_name(enum index_save_type save_type) -{ - return save_type == IS_SAVE ? "save" : "checkpoint"; -} - -/**********************************************************************/ -int load_index_state(struct index_state *state, bool *replay_ptr) -{ - bool replay_required = false; - unsigned int i; - int result = find_latest_uds_index_save_slot(state->layout, - &state->load_zones, - &state->load_slot); - if (result != UDS_SUCCESS) { - return result; - } - - for (i = 0; i < state->count; ++i) { - struct index_component *component = state->entries[i]; - result = read_index_component(component); - if (result != UDS_SUCCESS) { - if (!missing_index_component_requires_replay(component)) { - state->load_zones = 0; - state->load_slot = UINT_MAX; - return uds_log_error_strerror(result, - "index component %s", - index_component_name(component)); - } - replay_required = true; - } - } - - state->load_zones = 0; - state->load_slot = UINT_MAX; - if (replay_ptr != NULL) { - *replay_ptr = replay_required; - } - return UDS_SUCCESS; -} - -/**********************************************************************/ -int prepare_to_save_index_state(struct index_state *state, - enum index_save_type save_type) -{ - int result; - - if (state->saving) { - return uds_log_error_strerror( - UDS_BAD_STATE, "already saving the index state"); - } - result = setup_uds_index_save_slot(state->layout, state->zone_count, - save_type, &state->save_slot); - if (result != UDS_SUCCESS) { - return uds_log_error_strerror(result, - "cannot prepare index %s", - index_save_type_name(save_type)); - } - - return UDS_SUCCESS; -} - -/**********************************************************************/ -/** - * Complete the saving of an index state. 
- * - * @param state the index state - * - * @return UDS_SUCCESS or an error code - **/ -static int complete_index_saving(struct index_state *state) -{ - int result; - state->saving = false; - result = commit_uds_index_save(state->layout, state->save_slot); - state->save_slot = UINT_MAX; - if (result != UDS_SUCCESS) { - return uds_log_error_strerror(result, - "cannot commit index state"); - } - return UDS_SUCCESS; -} - -/**********************************************************************/ -static int cleanup_save(struct index_state *state) -{ - int result = cancel_uds_index_save(state->layout, state->save_slot); - state->save_slot = UINT_MAX; - if (result != UDS_SUCCESS) { - return uds_log_error_strerror(result, - "cannot cancel index save"); - } - return UDS_SUCCESS; -} - -/**********************************************************************/ -int save_index_state(struct index_state *state) -{ - unsigned int i; - int result = prepare_to_save_index_state(state, IS_SAVE); - if (result != UDS_SUCCESS) { - return result; - } - for (i = 0; i < state->count; ++i) { - struct index_component *component = state->entries[i]; - result = write_index_component(component); - if (result != UDS_SUCCESS) { - cleanup_save(state); - return result; - } - } - return complete_index_saving(state); -} - -/**********************************************************************/ -int write_index_state_checkpoint(struct index_state *state) -{ - unsigned int i; - int result = prepare_to_save_index_state(state, IS_CHECKPOINT); - if (result != UDS_SUCCESS) { - return result; - } - - for (i = 0; i < state->count; ++i) { - struct index_component *component = state->entries[i]; - if (skip_index_component_on_checkpoint(component)) { - continue; - } - result = write_index_component(component); - if (result != UDS_SUCCESS) { - cleanup_save(state); - return result; - } - } - - return complete_index_saving(state); -} - -/**********************************************************************/ -int start_index_state_checkpoint(struct index_state *state) -{ - unsigned int i; - int result = prepare_to_save_index_state(state, IS_CHECKPOINT); - if (result != UDS_SUCCESS) { - return result; - } - - state->saving = true; - - for (i = 0; i < state->count; ++i) { - struct index_component *component = state->entries[i]; - if (skip_index_component_on_checkpoint(component)) { - continue; - } - result = start_index_component_incremental_save(component); - if (result != UDS_SUCCESS) { - abort_index_state_checkpoint(state); - return result; - } - } - - return result; -} - -/**********************************************************************/ -int perform_index_state_checkpoint_chapter_synchronized_saves(struct index_state *state) -{ - unsigned int i; - int result; - if (!state->saving) { - return UDS_SUCCESS; - } - - for (i = 0; i < state->count; ++i) { - struct index_component *component = state->entries[i]; - if (skip_index_component_on_checkpoint(component) || - !defer_index_component_checkpoint_to_chapter_writer(component)) { - continue; - } - result = - perform_index_component_chapter_writer_save(component); - if (result != UDS_SUCCESS) { - return result; - } - } - - return UDS_SUCCESS; -} - -/** - * Wrapper function to do a zone-based checkpoint operation. 
- * - * @param [in] state the index state - * @param [in] zone the zone number - * @param [in] comp_func the index component function to use - * @param [out] completed if non-NULL, where to save the completion status - * - * @return UDS_SUCCESS or an error code - * - **/ -static int -do_index_state_checkpoint_in_zone(struct index_state *state, - unsigned int zone, - int (*comp_func)(struct index_component *, - unsigned int, - enum completion_status *), - enum completion_status *completed) -{ - enum completion_status status = CS_COMPLETED_PREVIOUSLY; - unsigned int i; - - if (!state->saving) { - if (completed != NULL) { - *completed = CS_COMPLETED_PREVIOUSLY; - } - return UDS_SUCCESS; - } - - for (i = 0; i < state->count; ++i) { - enum completion_status component_status = CS_NOT_COMPLETED; - struct index_component *component = state->entries[i]; - int result; - if (skip_index_component_on_checkpoint(component)) { - continue; - } - if (zone > 0 && !component->info->multi_zone) { - continue; - } - result = (*comp_func)(component, zone, &component_status); - if (result != UDS_SUCCESS) { - return result; - } - // compute rolling least status - if (component_status < status) { - status = component_status; - } - } - - if (completed != NULL) { - *completed = status; - } - return UDS_SUCCESS; -} - -/**********************************************************************/ -int perform_index_state_checkpoint_in_zone(struct index_state *state, - unsigned int zone, - enum completion_status *completed) -{ - return do_index_state_checkpoint_in_zone(state, zone, - &perform_index_component_zone_save, - completed); -} - -/**********************************************************************/ -int finish_index_state_checkpoint_in_zone(struct index_state *state, - unsigned int zone, - enum completion_status *completed) -{ - return do_index_state_checkpoint_in_zone(state, zone, - &finish_index_component_zone_save, - completed); -} - -/**********************************************************************/ -int abort_index_state_checkpoint_in_zone(struct index_state *state, - unsigned int zone, - enum completion_status *completed) -{ - return do_index_state_checkpoint_in_zone(state, zone, - &abort_index_component_zone_save, - completed); -} - -/**********************************************************************/ -int finish_index_state_checkpoint(struct index_state *state) -{ - unsigned int i; - int result; - - if (!state->saving) { - return UDS_SUCCESS; - } - - for (i = 0; i < state->count; ++i) { - struct index_component *component = state->entries[i]; - if (skip_index_component_on_checkpoint(component)) { - continue; - } - result = finish_index_component_incremental_save(component); - if (result != UDS_SUCCESS) { - abort_index_state_checkpoint(state); - return result; - } - } - - result = complete_index_saving(state); - if (result != UDS_SUCCESS) { - return result; - } - - return UDS_SUCCESS; -} - -/**********************************************************************/ -int abort_index_state_checkpoint(struct index_state *state) -{ - int result = UDS_SUCCESS; - unsigned int i; - if (!state->saving) { - return uds_log_error_strerror(UDS_BAD_STATE, - "not saving the index state"); - } - - uds_log_error("aborting index state checkpoint"); - - for (i = 0; i < state->count; ++i) { - int tmp; - struct index_component *component = state->entries[i]; - if (skip_index_component_on_checkpoint(component)) { - continue; - } - tmp = abort_index_component_incremental_save(component); - if (result == UDS_SUCCESS) { - 
result = tmp; - } - } - - cleanup_save(state); - state->saving = false; - - return result; -} - -/**********************************************************************/ -int discard_index_state_data(struct index_state *state) -{ - int result = discard_uds_index_saves(state->layout, true); - state->save_slot = UINT_MAX; - if (result != UDS_SUCCESS) { - return uds_log_error_strerror(result, - "%s: cannot destroy all index saves", - __func__); - } - return UDS_SUCCESS; -} - -/**********************************************************************/ -int discard_last_index_state_save(struct index_state *state) -{ - int result = discard_uds_index_saves(state->layout, false); - state->save_slot = UINT_MAX; - if (result != UDS_SUCCESS) { - return uds_log_error_strerror(result, - "%s: cannot destroy latest index save", - __func__); - } - return UDS_SUCCESS; -} - -/**********************************************************************/ -struct buffer *get_state_index_state_buffer(struct index_state *state, - enum io_access_mode mode) -{ - unsigned int slot = - mode == IO_READ ? state->load_slot : state->save_slot; - return get_uds_index_state_buffer(state->layout, slot); -} - -/**********************************************************************/ -int open_state_buffered_reader(struct index_state *state, - enum region_kind kind, - unsigned int zone, - struct buffered_reader **reader_ptr) -{ - return open_uds_index_buffered_reader(state->layout, state->load_slot, - kind, zone, reader_ptr); -} - -/**********************************************************************/ -int open_state_buffered_writer(struct index_state *state, - enum region_kind kind, - unsigned int zone, - struct buffered_writer **writer_ptr) -{ - return open_uds_index_buffered_writer(state->layout, state->save_slot, - kind, zone, writer_ptr); -} diff --git a/uds/indexState.h b/uds/indexState.h deleted file mode 100644 index bfbd1fd0..00000000 --- a/uds/indexState.h +++ /dev/null @@ -1,308 +0,0 @@ -/* - * Copyright Red Hat - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/uds-releases/krusty/src/uds/indexState.h#19 $ - */ - -#ifndef INDEX_STATE_H -#define INDEX_STATE_H 1 - -#include "buffer.h" -#include "indexComponent.h" - - -/** - * Used here and in SingleFileLayout. - **/ -enum index_save_type { - IS_SAVE, - IS_CHECKPOINT, - NO_SAVE = 9999, -}; - -/* - * Used in get_state_index_state_buffer to identify whether the index state - * buffer is for the index being loaded or the index being saved. - */ -enum io_access_mode { - IO_READ = 0x1, - IO_WRITE = 0x2, -}; - -/** - * The index state structure controls the loading and saving of the index - * state. 
- **/ -struct index_state { - struct index_layout *layout; - unsigned int zone_count; // number of index zones to use - unsigned int load_zones; - unsigned int load_slot; - unsigned int save_slot; - unsigned int count; // count of registered entries - // (<= length) - unsigned int length; // total span of array allocation - bool saving; // incremental save in progress - struct index_component *entries[]; // array of index component entries -}; - -/** - * Make an index state object, - * - * @param [in] layout The index layout. - * @param [in] num_zones The number of zones to use. - * @param [in] max_components The maximum number of components to be handled. - * @param [out] state_ptr Where to store the index state object. - * - * @return UDS_SUCCESS or an error code - **/ -int __must_check make_index_state(struct index_layout *layout, - unsigned int num_zones, - unsigned int max_components, - struct index_state **state_ptr); - -/** - * Free an index state (generically). - * - * @param state The index state to be freed - **/ -void free_index_state(struct index_state *state); - -/** - * Add an index component to an index state. - * - * @param state The index directory in which to add this component. - * @param info The index component file specification. - * @param data The per-component data structure. - * @param context The load/save context of the component. - * - * @return UDS_SUCCESS or an error code. - **/ -int __must_check -add_index_state_component(struct index_state *state, - const struct index_component_info *info, - void *data, - void *context); - -/** - * Load index state - * - * @param state The index state. - * @param replay_ptr If set, the place to hold whether a replay is required. - * - * @return UDS_SUCCESS or error - **/ -int __must_check load_index_state(struct index_state *state, bool *replay_ptr); - -/** - * Save the current index state, including the open chapter. - * - * @param state The index state. - * - * @return UDS_SUCCESS or error - **/ -int __must_check save_index_state(struct index_state *state); - -/** - * Prepare to save the index state. - * - * @param state the index state - * @param save_type whether a checkpoint or save - * - * @return UDS_SUCCESS or an error code - **/ -int __must_check prepare_to_save_index_state(struct index_state *state, - enum index_save_type save_type); - -/** - * Write index checkpoint non-incrementally (for testing). - * - * @param state The index state. - * - * @return UDS_SUCCESS or error - **/ -int __must_check write_index_state_checkpoint(struct index_state *state); - -/** - * Sets up an index state checkpoint which will proceed incrementally. - * May create the directory but does not actually write any data. - * - * @param state The index state. - * - * @return UDS_SUCCESS or an error code. - **/ -int __must_check start_index_state_checkpoint(struct index_state *state); - -/** - * Perform operations on index state checkpoints that are synchronized to - * the chapter writer thread. - * - * @param state The index state. - * - * @return UDS_SUCCESS or an error code. - **/ -int __must_check -perform_index_state_checkpoint_chapter_synchronized_saves(struct index_state *state); - -/** - * Performs zone-specific (and, for zone 0, general) incremental checkpointing. - * - * @param [in] state The index state. - * @param [in] zone The zone number. - * @param [out] completed Set to whether the checkpoint has completed - * for this zone. - * - * @return UDS_SUCCESS or an error code. 
- **/ -int __must_check -perform_index_state_checkpoint_in_zone(struct index_state *state, - unsigned int zone, - enum completion_status *completed); - -/** - * Force the completion of an incremental index state checkpoint - * for a particular zone. - * - * @param [in] state The index state. - * @param [in] zone The zone number. - * @param [out] completed Set to whether the checkpoint has completed - * for this zone. - * - * @return UDS_SUCCESS or an error code. - **/ -int __must_check -finish_index_state_checkpoint_in_zone(struct index_state *state, - unsigned int zone, - enum completion_status *completed); - -/** - * Force the completion of an incremental index state checkpoint once - * all zones are completed. - * - * @param [in] state The index state. - * - * @return UDS_SUCCESS or an error code. - **/ -int __must_check finish_index_state_checkpoint(struct index_state *state); - -/** - * Aborts an index state checkpoint which is proceeding incrementally - * for a particular zone. - * - * @param [in] state The index state. - * @param [in] zone The zone number. - * @param [out] completed Set to whether the checkpoint has completed or - * aborted for this zone. - * - * @return UDS_SUCCESS or an error code. - **/ -int abort_index_state_checkpoint_in_zone(struct index_state *state, - unsigned int zone, - enum completion_status *completed); - -/** - * Aborts an index state checkpoint which is proceeding incrementally, - * once all the zones are aborted. - * - * @param [in] state The index state. - * - * @return UDS_SUCCESS or an error code. - **/ -int abort_index_state_checkpoint(struct index_state *state); - -/** - * Remove or disable the index state data, for testing. - * - * @param state The index state - * - * @return UDS_SUCCESS or an error code - * - * @note the return value of this function is frequently ignored - **/ -int discard_index_state_data(struct index_state *state); - -/** - * Discard the last index state save, for testing. - * - * @param state The index state - * - * @return UDS_SUCCESS or an error code - * - * @note the return value of this function is frequently ignored - **/ -int discard_last_index_state_save(struct index_state *state); - -/** - * Find index component, for testing. - * - * @param state The index state - * @param info The index component file specification - * - * @return The index component, or NULL if not found - **/ -struct index_component *__must_check -find_index_component(const struct index_state *state, - const struct index_component_info *info); - -/** - * Get the index state buffer for a specified mode. - * - * @param state The index state. - * @param mode One of IO_READ or IO_WRITE. - * - * @return the index state buffer - **/ -struct buffer *__must_check -get_state_index_state_buffer(struct index_state *state, - enum io_access_mode mode); - -/** - * Open a buffered reader for a specified state, kind, and zone. - * This helper function is used by index_component. - * - * @param state The index state. - * @param kind The kind of index save region to open. - * @param zone The zone number for the region. - * @param reader_ptr Where to store the buffered reader. - * - * @return UDS_SUCCESS or an error code. - **/ -int __must_check -open_state_buffered_reader(struct index_state *state, - enum region_kind kind, - unsigned int zone, - struct buffered_reader **reader_ptr); - -/** - * Open a buffered writer for a specified state, kind, and zone. - * This helper function is used by index_component. - * - * @param state The index state. 
- * @param kind The kind of index save region to open. - * @param zone The zone number for the region. - * @param writer_ptr Where to store the buffered writer. - * - * @return UDS_SUCCESS or an error code. - **/ -int __must_check -open_state_buffered_writer(struct index_state *state, - enum region_kind kind, - unsigned int zone, - struct buffered_writer **writer_ptr); - -#endif // INDEX_STATE_H diff --git a/uds/indexStateData.c b/uds/indexStateData.c deleted file mode 100644 index 6aec0ddb..00000000 --- a/uds/indexStateData.c +++ /dev/null @@ -1,193 +0,0 @@ -/* - * Copyright Red Hat - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/uds-releases/krusty/src/uds/indexStateData.c#22 $ - */ - -#include "indexStateData.h" - -#include "buffer.h" -#include "errors.h" -#include "index.h" -#include "logger.h" -#include "uds.h" - -/* The index state version header */ -struct index_state_version { - int32_t signature; - int32_t version_id; -}; - -/* The version 301 index state */ -struct index_state_data301 { - uint64_t newest_chapter; - uint64_t oldest_chapter; - uint64_t last_checkpoint; - uint32_t unused; - uint32_t padding; -}; - -static const struct index_state_version INDEX_STATE_VERSION_301 = { - .signature = -1, - .version_id = 301, -}; - -/** - * The index state index component reader. 
- * - * @param portal the read_portal that handles the read of the component - * - * @return UDS_SUCCESS or an error code - **/ -static int read_index_state_data(struct read_portal *portal) -{ - struct index_state_data301 state; - struct index_state_version file_version; - struct uds_index *index; - struct buffer *buffer = - get_state_index_state_buffer(portal->component->state, - IO_READ); - int result = rewind_buffer(buffer, uncompacted_amount(buffer)); - if (result != UDS_SUCCESS) { - return result; - } - - result = get_int32_le_from_buffer(buffer, &file_version.signature); - if (result != UDS_SUCCESS) { - return result; - } - result = get_int32_le_from_buffer(buffer, &file_version.version_id); - if (result != UDS_SUCCESS) { - return result; - } - - if (file_version.signature != -1 || file_version.version_id != 301) { - return uds_log_error_strerror(UDS_UNSUPPORTED_VERSION, - "index state version %d,%d is unsupported", - file_version.signature, - file_version.version_id); - } - - result = get_uint64_le_from_buffer(buffer, &state.newest_chapter); - if (result != UDS_SUCCESS) { - return result; - } - result = get_uint64_le_from_buffer(buffer, &state.oldest_chapter); - if (result != UDS_SUCCESS) { - return result; - } - result = get_uint64_le_from_buffer(buffer, &state.last_checkpoint); - if (result != UDS_SUCCESS) { - return result; - } - result = get_uint32_le_from_buffer(buffer, &state.unused); - if (result != UDS_SUCCESS) { - return result; - } - result = get_uint32_le_from_buffer(buffer, &state.padding); - if (result != UDS_SUCCESS) { - return result; - } - - if ((state.unused != 0) || (state.padding != 0)) { - return UDS_CORRUPT_COMPONENT; - } - - index = index_component_data(portal->component); - index->newest_virtual_chapter = state.newest_chapter; - index->oldest_virtual_chapter = state.oldest_chapter; - index->last_checkpoint = state.last_checkpoint; - return UDS_SUCCESS; -} - -/** - * The index state index component writer. - * - * @param component The component whose state is to be saved (an index) - * @param writer The buffered writer. - * @param zone The zone to write. 
- * - * @return UDS_SUCCESS or an error code - **/ -static int -write_index_state_data(struct index_component *component, - struct buffered_writer *writer __always_unused, - unsigned int zone __always_unused) -{ - struct uds_index *index; - struct index_state_data301 state; - struct buffer *buffer = - get_state_index_state_buffer(component->state, IO_WRITE); - int result = reset_buffer_end(buffer, 0); - if (result != UDS_SUCCESS) { - return result; - } - result = put_uint32_le_into_buffer(buffer, - INDEX_STATE_VERSION_301.signature); - if (result != UDS_SUCCESS) { - return result; - } - result = put_uint32_le_into_buffer(buffer, - INDEX_STATE_VERSION_301.version_id); - if (result != UDS_SUCCESS) { - return result; - } - - index = index_component_data(component); - state = (struct index_state_data301) { - .newest_chapter = index->newest_virtual_chapter, - .oldest_chapter = index->oldest_virtual_chapter, - .last_checkpoint = index->last_checkpoint, - }; - - result = put_uint64_le_into_buffer(buffer, state.newest_chapter); - if (result != UDS_SUCCESS) { - return result; - } - result = put_uint64_le_into_buffer(buffer, state.oldest_chapter); - if (result != UDS_SUCCESS) { - return result; - } - result = put_uint64_le_into_buffer(buffer, state.last_checkpoint); - if (result != UDS_SUCCESS) { - return result; - } - result = put_uint32_le_into_buffer(buffer, state.unused); - if (result != UDS_SUCCESS) { - return result; - } - result = put_uint32_le_into_buffer(buffer, state.padding); - if (result != UDS_SUCCESS) { - return result; - } - return UDS_SUCCESS; -} - -/**********************************************************************/ - -const struct index_component_info INDEX_STATE_INFO = { - .kind = RL_KIND_INDEX_STATE, - .name = "index state", - .save_only = false, - .chapter_sync = true, - .multi_zone = false, - .io_storage = false, - .loader = read_index_state_data, - .saver = write_index_state_data, - .incremental = NULL, -}; diff --git a/uds/indexStateData.h b/uds/indexStateData.h deleted file mode 100644 index 02028817..00000000 --- a/uds/indexStateData.h +++ /dev/null @@ -1,29 +0,0 @@ -/* - * Copyright Red Hat - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. 
- * - * $Id: //eng/uds-releases/krusty/src/uds/indexStateData.h#4 $ - */ - -#ifndef INDEX_STATE_DATA_H -#define INDEX_STATE_DATA_H 1 - -#include "indexComponent.h" - -extern const struct index_component_info INDEX_STATE_INFO; - -#endif /* not INDEX_STATE_DATA_H */ diff --git a/uds/indexZone.c b/uds/indexZone.c deleted file mode 100644 index 80fa4dc2..00000000 --- a/uds/indexZone.c +++ /dev/null @@ -1,418 +0,0 @@ -/* - * Copyright Red Hat - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/uds-releases/krusty/src/uds/indexZone.c#43 $ - */ - -#include "indexZone.h" - -#include "errors.h" -#include "index.h" -#include "indexCheckpoint.h" -#include "logger.h" -#include "memoryAlloc.h" -#include "permassert.h" -#include "request.h" -#include "sparseCache.h" -#include "uds.h" - -/**********************************************************************/ -int make_index_zone(struct uds_index *index, unsigned int zone_number) -{ - struct index_zone *zone; - int result = UDS_ALLOCATE(1, struct index_zone, "index zone", &zone); - if (result != UDS_SUCCESS) { - return result; - } - - result = make_open_chapter(index->volume->geometry, - index->zone_count, - &zone->open_chapter); - if (result != UDS_SUCCESS) { - free_index_zone(zone); - return result; - } - - result = make_open_chapter(index->volume->geometry, - index->zone_count, - &zone->writing_chapter); - if (result != UDS_SUCCESS) { - free_index_zone(zone); - return result; - } - - zone->index = index; - zone->id = zone_number; - index->zones[zone_number] = zone; - - return UDS_SUCCESS; -} - -/**********************************************************************/ -void free_index_zone(struct index_zone *zone) -{ - if (zone == NULL) { - return; - } - - free_open_chapter(zone->open_chapter); - free_open_chapter(zone->writing_chapter); - UDS_FREE(zone); -} - -/**********************************************************************/ -bool is_zone_chapter_sparse(const struct index_zone *zone, - uint64_t virtual_chapter) -{ - return is_chapter_sparse(zone->index->volume->geometry, - zone->oldest_virtual_chapter, - zone->newest_virtual_chapter, - virtual_chapter); -} - -/**********************************************************************/ -void set_active_chapters(struct index_zone *zone) -{ - zone->oldest_virtual_chapter = zone->index->oldest_virtual_chapter; - zone->newest_virtual_chapter = zone->index->newest_virtual_chapter; -} - -/** - * Swap the open and writing chapters after blocking until there are no active - * chapter writers on the index. 
- * - * @param zone The zone swapping chapters - * - * @return UDS_SUCCESS or a return code - **/ -static int swap_open_chapter(struct index_zone *zone) -{ - struct open_chapter_zone *temp_chapter; - // Wait for any currently writing chapter to complete - int result = finish_previous_chapter(zone->index->chapter_writer, - zone->newest_virtual_chapter); - if (result != UDS_SUCCESS) { - return result; - } - - // Swap the writing and open chapters - temp_chapter = zone->open_chapter; - zone->open_chapter = zone->writing_chapter; - zone->writing_chapter = temp_chapter; - return UDS_SUCCESS; -} - -/** - * Advance to a new open chapter, and forget the oldest chapter in the - * index if necessary. - * - * @param zone The zone containing the chapter to reap - * - * @return UDS_SUCCESS or an error code - **/ -static int reap_oldest_chapter(struct index_zone *zone) -{ - struct uds_index *index = zone->index; - unsigned int chapters_per_volume = - index->volume->geometry->chapters_per_volume; - int result = - ASSERT(((zone->newest_virtual_chapter - - zone->oldest_virtual_chapter) <= chapters_per_volume), - "newest (%llu) and oldest (%llu) virtual chapters less than or equal to chapters per volume (%u)", - (unsigned long long) zone->newest_virtual_chapter, - (unsigned long long) zone->oldest_virtual_chapter, - chapters_per_volume); - if (result != UDS_SUCCESS) { - return result; - } - - set_volume_index_zone_open_chapter(index->volume_index, zone->id, - zone->newest_virtual_chapter); - return UDS_SUCCESS; -} - -/** - * Handle notification that some other zone has closed its open chapter. If - * the chapter that was closed is still the open chapter for this zone, - * close it now in order to minimize skew. - * - * @param zone The zone receiving the notification - * @param virtual_chapter The closed virtual chapter - * - * @return UDS_SUCCESS or an error code - **/ -static int handle_chapter_closed(struct index_zone *zone, - uint64_t virtual_chapter) -{ - if (zone->newest_virtual_chapter == virtual_chapter) { - return open_next_chapter(zone, NULL); - } - - return UDS_SUCCESS; -} - -/**********************************************************************/ -int dispatch_index_zone_control_request(struct uds_request *request) -{ - struct uds_zone_message *message = &request->zone_message; - struct index_zone *zone = message->index->zones[request->zone_number]; - - switch (message->type) { - case UDS_MESSAGE_SPARSE_CACHE_BARRIER: - return update_sparse_cache(zone, message->virtual_chapter); - - case UDS_MESSAGE_ANNOUNCE_CHAPTER_CLOSED: - return handle_chapter_closed(zone, message->virtual_chapter); - - default: - uds_log_error("invalid message type: %d", message->type); - return UDS_INVALID_ARGUMENT; - } -} - -/** - * Announce the closure of the current open chapter to the other zones. - * - * @param request The request which caused the chapter to close - * (may be NULL) - * @param zone The zone which first closed the chapter - * @param closed_chapter The chapter which was closed - * - * @return UDS_SUCCESS or an error code - **/ -static int announce_chapter_closed(struct uds_request *request, - struct index_zone *zone, - uint64_t closed_chapter) -{ - struct uds_index *index = - ((request != NULL) ? 
request->index : NULL); - - struct uds_zone_message zone_message = { - .type = UDS_MESSAGE_ANNOUNCE_CHAPTER_CLOSED, - .index = zone->index, - .virtual_chapter = closed_chapter, - }; - - unsigned int i; - for (i = 0; i < zone->index->zone_count; i++) { - int result; - if (zone->id == i) { - continue; - } - if (index != NULL) { - result = launch_zone_message(zone_message, i, index); - } else { - // We're in a test which doesn't have zone queues, so - // we can just call the message function directly. - result = handle_chapter_closed(zone->index->zones[i], - closed_chapter); - } - if (result != UDS_SUCCESS) { - return result; - } - } - - return UDS_SUCCESS; -} - -/**********************************************************************/ -int open_next_chapter(struct index_zone *zone, struct uds_request *request) -{ - uint64_t closed_chapter, victim; - int result; - unsigned int finished_zones; - unsigned int expired_chapters; - uds_log_debug("closing chapter %llu of zone %u after %u entries (%u short)", - (unsigned long long) zone->newest_virtual_chapter, - zone->id, - zone->open_chapter->size, - zone->open_chapter->capacity - zone->open_chapter->size); - - result = swap_open_chapter(zone); - if (result != UDS_SUCCESS) { - return result; - } - - closed_chapter = zone->newest_virtual_chapter++; - result = reap_oldest_chapter(zone); - if (result != UDS_SUCCESS) { - return uds_log_error_strerror(result, - "reap_oldest_chapter failed"); - } - - reset_open_chapter(zone->open_chapter); - - // begin, continue, or finish the checkpoint processing - // moved above start_closing_chapter because some of the - // checkpoint processing now done by the chapter writer thread - result = process_checkpointing(zone->index, zone->id, - zone->newest_virtual_chapter); - if (result != UDS_SUCCESS) { - return result; - } - - finished_zones = - start_closing_chapter(zone->index->chapter_writer, zone->id, - zone->writing_chapter); - if ((finished_zones == 1) && (zone->index->zone_count > 1)) { - // This is the first zone of a multi-zone index to close this - // chapter, so inform the other zones in order to control zone - // skew. - result = - announce_chapter_closed(request, zone, closed_chapter); - if (result != UDS_SUCCESS) { - return result; - } - } - - victim = zone->oldest_virtual_chapter; - expired_chapters = chapters_to_expire(zone->index->volume->geometry, - zone->newest_virtual_chapter); - zone->oldest_virtual_chapter += expired_chapters; - - if (finished_zones < zone->index->zone_count) { - // We are not the last zone to close the chapter, so we're done - return UDS_SUCCESS; - } - - /* - * We are the last zone to close the chapter, so clean up the cache. - * That it is safe to let the last thread out of the previous chapter - * to do this relies on the fact that although the new open chapter - * shadows the oldest chapter in the cache, until we write the new open - * chapter to disk, we'll never look for it in the cache. 
- */ - while ((expired_chapters-- > 0) && (result == UDS_SUCCESS)) { - result = forget_chapter(zone->index->volume, victim++, - INVALIDATION_EXPIRE); - } - - return result; -} - -/**********************************************************************/ -enum uds_index_region compute_index_region(const struct index_zone *zone, - uint64_t virtual_chapter) -{ - if (virtual_chapter == zone->newest_virtual_chapter) { - return UDS_LOCATION_IN_OPEN_CHAPTER; - } - if (is_zone_chapter_sparse(zone, virtual_chapter)) { - return UDS_LOCATION_IN_SPARSE; - } - return UDS_LOCATION_IN_DENSE; -} - -/**********************************************************************/ -int get_record_from_zone(struct index_zone *zone, - struct uds_request *request, - bool *found, - uint64_t virtual_chapter) -{ - struct volume *volume; - if (virtual_chapter == zone->newest_virtual_chapter) { - search_open_chapter(zone->open_chapter, - &request->chunk_name, - &request->old_metadata, - found); - return UDS_SUCCESS; - } - - if ((zone->newest_virtual_chapter > 0) && - (virtual_chapter == (zone->newest_virtual_chapter - 1)) && - (zone->writing_chapter->size > 0)) { - // Only search the writing chapter if it is full, else look on - // disk. - search_open_chapter(zone->writing_chapter, - &request->chunk_name, - &request->old_metadata, - found); - return UDS_SUCCESS; - } - - // We have determined the location previously. - if (request->location != UDS_LOCATION_UNKNOWN) { - *found = (request->location != UDS_LOCATION_UNAVAILABLE); - return UDS_SUCCESS; - } - - volume = zone->index->volume; - if (is_zone_chapter_sparse(zone, virtual_chapter) && - sparse_cache_contains(volume->sparse_cache, - virtual_chapter, - request->zone_number)) { - // The named chunk, if it exists, is in a sparse chapter that - // is cached, so just run the chunk through the sparse chapter - // cache search. - return search_sparse_cache_in_zone(zone, request, - virtual_chapter, found); - } - - return search_volume_page_cache(volume, - request, &request->chunk_name, - virtual_chapter, - &request->old_metadata, found); -} - -/**********************************************************************/ -int put_record_in_zone(struct index_zone *zone, - struct uds_request *request, - const struct uds_chunk_data *metadata) -{ - unsigned int remaining; - int result = put_open_chapter(zone->open_chapter, &request->chunk_name, - metadata, &remaining); - if (result != UDS_SUCCESS) { - return result; - } - - if (remaining == 0) { - return open_next_chapter(zone, request); - } - - return UDS_SUCCESS; -} - -/**********************************************************************/ -int search_sparse_cache_in_zone(struct index_zone *zone, - struct uds_request *request, - uint64_t virtual_chapter, - bool *found) -{ - struct volume *volume; - int record_page_number; - unsigned int chapter; - int result = search_sparse_cache(zone, - &request->chunk_name, - &virtual_chapter, - &record_page_number); - if ((result != UDS_SUCCESS) || (virtual_chapter == UINT64_MAX)) { - return result; - } - - volume = zone->index->volume; - // XXX map to physical chapter and validate. It would be nice to just - // pass the virtual in to the slow lane, since it's tracking - // invalidations. 
- chapter = map_to_physical_chapter(volume->geometry, virtual_chapter); - - return search_cached_record_page(volume, - request, &request->chunk_name, - chapter, record_page_number, - &request->old_metadata, found); -} diff --git a/uds/indexZone.h b/uds/indexZone.h deleted file mode 100644 index 129498ab..00000000 --- a/uds/indexZone.h +++ /dev/null @@ -1,158 +0,0 @@ -/* - * Copyright Red Hat - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/uds-releases/krusty/src/uds/indexZone.h#18 $ - */ - -#ifndef INDEX_ZONE_H -#define INDEX_ZONE_H - -#include "common.h" -#include "openChapterZone.h" -#include "request.h" - -struct index_zone { - struct uds_index *index; - struct open_chapter_zone *open_chapter; - struct open_chapter_zone *writing_chapter; - uint64_t oldest_virtual_chapter; - uint64_t newest_virtual_chapter; - unsigned int id; -}; - -/** - * Allocate an index zone. - * - * @param index The index receiving the zone - * @param zone_number The number of the zone to allocate - * - * @return UDS_SUCCESS or an error code. - **/ -int __must_check make_index_zone(struct uds_index *index, - unsigned int zone_number); - -/** - * Clean up an index zone. - * - * @param zone The index zone to free - **/ -void free_index_zone(struct index_zone *zone); - -/** - * Check whether a chapter is sparse or dense based on the current state of - * the index zone. - * - * @param zone The index zone to check against - * @param virtual_chapter The virtual chapter number of the chapter to check - * - * @return true if the chapter is in the sparse part of the volume - **/ -bool __must_check is_zone_chapter_sparse(const struct index_zone *zone, - uint64_t virtual_chapter); - -/** - * Set the active chapter numbers for a zone based on its index. The active - * chapters consist of the range of chapters from the current oldest to - * the current newest virtual chapter. - * - * @param zone The zone to set - **/ -void set_active_chapters(struct index_zone *zone); - -/** - * Dispatch a control request to an index zone. - * - * @param request The request to dispatch - * - * @return UDS_SUCCESS or an error code - **/ -int __must_check -dispatch_index_zone_control_request(struct uds_request *request); - -/** - * Open the next chapter. - * - * @param zone The zone containing the open chapter - * @param request The request which requires the next chapter to be - * opened - * - * @return UDS_SUCCESS if successful. - **/ -int __must_check open_next_chapter(struct index_zone *zone, - struct uds_request *request); - -/** - * Determine the index region in which a block was found. 
- * - * @param zone The zone that was searched - * @param virtual_chapter The virtual chapter number - * - * @return the index region of the chapter in which the block was found - **/ -enum uds_index_region compute_index_region(const struct index_zone *zone, - uint64_t virtual_chapter); - -/** - * Get a record from either the volume or the open chapter in a zone. - * - * @param zone The index zone to query - * @param request The request originating the query - * @param found A pointer to a bool which will be set to - * true if the record was found. - * @param virtual_chapter The chapter in which to search - * - * @return UDS_SUCCESS or an error code - **/ -int __must_check get_record_from_zone(struct index_zone *zone, - struct uds_request *request, - bool *found, - uint64_t virtual_chapter); - -/** - * Put a record in the open chapter. If this fills the chapter, the chapter - * will be closed and a new one will be opened. - * - * @param zone The index zone containing the chapter - * @param request The request containing the name of the record - * @param metadata The record metadata - * - * @return UDS_SUCCESS or an error - **/ -int __must_check put_record_in_zone(struct index_zone *zone, - struct uds_request *request, - const struct uds_chunk_data *metadata); - -/** - * Search the cached sparse chapter index, either for a cached sparse hook, or - * as the last chance for finding the record named by a request. - * - * @param [in] zone the index zone - * @param [in] request the request originating the search - * @param [in] virtual_chapter if UINT64_MAX, search the entire cache; - * otherwise search this chapter, if cached - * @param [out] found A pointer to a bool which will be set to - * true if the record was found - * - * @return UDS_SUCCESS or an error code - **/ -int __must_check search_sparse_cache_in_zone(struct index_zone *zone, - struct uds_request *request, - uint64_t virtual_chapter, - bool *found); - -#endif /* INDEX_ZONE_H */ diff --git a/uds/layoutRegion.h b/uds/layoutRegion.h deleted file mode 100644 index 4c38804a..00000000 --- a/uds/layoutRegion.h +++ /dev/null @@ -1,57 +0,0 @@ -/* - * Copyright Red Hat - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/uds-releases/krusty/src/uds/layoutRegion.h#8 $ - */ - -#ifndef LAYOUT_REGION_H -#define LAYOUT_REGION_H - -/** - * Single file layouts are defined in terms of data regions. Each data region - * is a sub-section of the available space. Some data regions may contain - * subsidiary data regions, for example, a checkpoint or index save will - * contain volume index regions (according to the number of zones), an - * index page map region, and possibly an open chapter region. 
- **/ - -static const uint64_t REGION_MAGIC = 0x416c6252676e3031; // 'AlbRgn01' - -struct region_header { - uint64_t magic; // REGION_MAGIC - uint64_t region_blocks; // size of whole region - uint16_t type; // RH_TYPE_... - uint16_t version; // 1 - uint16_t num_regions; // number of layouts in the table - uint16_t payload; // extra data beyond region table -}; - -struct layout_region { - uint64_t start_block; - uint64_t num_blocks; - uint32_t checksum; // only used for save regions - uint16_t kind; - uint16_t instance; -}; - -struct region_table { - struct region_header header; - struct layout_region regions[]; -}; - -#endif // LAYOUT_REGION_H diff --git a/uds/loadType.c b/uds/loadType.c deleted file mode 100644 index f63b1399..00000000 --- a/uds/loadType.c +++ /dev/null @@ -1,39 +0,0 @@ -/* - * Copyright Red Hat - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/uds-releases/krusty/src/uds/loadType.c#6 $ - */ - -#include "loadType.h" - -#include "logger.h" - -/**********************************************************************/ -const char *get_load_type(enum load_type load_type) -{ - switch (load_type) { - case LOAD_CREATE: - return "creating index"; - case LOAD_LOAD: - return "loading index"; - case LOAD_REBUILD: - return "loading or rebuilding index"; - default: - return "no load method specified"; - } -} diff --git a/uds/loadType.h b/uds/loadType.h deleted file mode 100644 index f4d78983..00000000 --- a/uds/loadType.h +++ /dev/null @@ -1,62 +0,0 @@ -/* - * Copyright Red Hat - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/uds-releases/krusty/src/uds/loadType.h#7 $ - */ - -#ifndef LOAD_TYPE_H -#define LOAD_TYPE_H - -/** - * Methods of starting the index. (Keep get_load_type() in sync.) - * - * Usage number 1 is to note the interface method that initiates loading the - * index. As in this table: - * - * name type opened by - * =========== ====== ==================== - * LOAD_CREATE local udsCreateLocalIndex - * LOAD_LOAD local udsLoadLocalIndex - * LOAD_REBUILD local udsRebuildLocalIndex - * - * Usage number 2 is to record how an index was really opened. 
As in this - * table: - * - * LOAD_CREATE new empty index - * LOAD_LOAD loaded saved index - * LOAD_REPLAY loaded checkpoint and replayed new chapters - * LOAD_EMPTY empty volume index from empty volume data - * LOAD_REBUILD rebuilt volume index from volume data - **/ -enum load_type { - LOAD_UNDEFINED = 0, - LOAD_CREATE, - LOAD_LOAD, - LOAD_REBUILD, - LOAD_EMPTY, - LOAD_REPLAY, -}; - -/** - * get a string indicating how an index is to be loaded. - * - * @param load_type The load type to log - **/ -const char *get_load_type(enum load_type load_type); - -#endif /* LOAD_TYPE_H */ diff --git a/uds/logger.c b/uds/logger.c deleted file mode 100644 index a2906567..00000000 --- a/uds/logger.c +++ /dev/null @@ -1,145 +0,0 @@ -/* - * Copyright Red Hat - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/uds-releases/krusty/src/uds/logger.c#25 $ - */ - -#include "logger.h" - -#include "common.h" -#include "errors.h" -#include "stringUtils.h" -#include "uds-threads.h" -#include "uds.h" - -typedef struct { - const char *name; - const int priority; -} PriorityName; - -static const PriorityName PRIORITIES[] = { - { "ALERT", UDS_LOG_ALERT }, - { "CRITICAL", UDS_LOG_CRIT }, - { "CRIT", UDS_LOG_CRIT }, - { "DEBUG", UDS_LOG_DEBUG }, - { "EMERGENCY", UDS_LOG_EMERG }, - { "EMERG", UDS_LOG_EMERG }, - { "ERROR", UDS_LOG_ERR }, - { "ERR", UDS_LOG_ERR }, - { "INFO", UDS_LOG_INFO }, - { "NOTICE", UDS_LOG_NOTICE }, - { "PANIC", UDS_LOG_EMERG }, - { "WARN", UDS_LOG_WARNING }, - { "WARNING", UDS_LOG_WARNING }, - { NULL, -1 }, -}; - -static const char *const PRIORITY_STRINGS[] = { - "EMERGENCY", - "ALERT", - "CRITICAL", - "ERROR", - "WARN", - "NOTICE", - "INFO", - "DEBUG", -}; - -static int log_level = UDS_LOG_INFO; - -/**********************************************************************/ -int get_uds_log_level(void) -{ - return log_level; -} - -/**********************************************************************/ -void set_uds_log_level(int new_log_level) -{ - log_level = new_log_level; -} - -/**********************************************************************/ -int uds_log_string_to_priority(const char *string) -{ - int i; - for (i = 0; PRIORITIES[i].name != NULL; i++) { - if (strcasecmp(string, PRIORITIES[i].name) == 0) { - return PRIORITIES[i].priority; - } - } - return UDS_LOG_INFO; -} - -/**********************************************************************/ -const char *uds_log_priority_to_string(int priority) -{ - if ((priority < 0) || (priority >= (int) COUNT_OF(PRIORITY_STRINGS))) { - return "unknown"; - } - return PRIORITY_STRINGS[priority]; -} - -/**********************************************************************/ -void uds_log_embedded_message(int priority, - const char *module, - const char *prefix, - const char *fmt1, - va_list args1, - const char *fmt2, - ...) 
-{ - va_list ap; - va_start(ap, fmt2); - uds_log_message_pack(priority, module, prefix, fmt1, args1, fmt2, ap); - va_end(ap); -} - -/**********************************************************************/ -int uds_vlog_strerror(int priority, - int errnum, - const char *module, - const char *format, - va_list args) -{ - char errbuf[ERRBUF_SIZE]; - uds_log_embedded_message(priority, - module, - NULL, - format, - args, - ": %s (%u)", - string_error(errnum, errbuf, sizeof(errbuf)), - errnum); - return errnum; -} - -/**********************************************************************/ -int __uds_log_strerror(int priority, - int errnum, - const char *module, - const char *format, - ...) -{ - va_list args; - - va_start(args, format); - uds_vlog_strerror(priority, errnum, module, format, args); - va_end(args); - return errnum; -} diff --git a/uds/memoryAlloc.c b/uds/memoryAlloc.c deleted file mode 100644 index 353a3d10..00000000 --- a/uds/memoryAlloc.c +++ /dev/null @@ -1,46 +0,0 @@ -/* - * Copyright Red Hat - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/uds-releases/krusty/src/uds/memoryAlloc.c#5 $ - */ - -#include "memoryAlloc.h" - -#include "stringUtils.h" - -/**********************************************************************/ -int uds_duplicate_string(const char *string, const char *what, - char **new_string) -{ - return uds_memdup(string, strlen(string) + 1, what, new_string); -} - -/**********************************************************************/ -int uds_memdup(const void *buffer, size_t size, const char *what, - void *dup_ptr) -{ - byte *dup; - int result = UDS_ALLOCATE(size, byte, what, &dup); - if (result != UDS_SUCCESS) { - return result; - } - - memcpy(dup, buffer, size); - *((void **) dup_ptr) = dup; - return UDS_SUCCESS; -} diff --git a/uds/murmur/MurmurHash3.c b/uds/murmur/MurmurHash3.c deleted file mode 100644 index 6ab21161..00000000 --- a/uds/murmur/MurmurHash3.c +++ /dev/null @@ -1,381 +0,0 @@ -//----------------------------------------------------------------------------- -// MurmurHash3 was written by Austin Appleby, and is placed in the public -// domain. The author hereby disclaims copyright to this source code. - -// Note - The x86 and x64 versions do _not_ produce the same results, as the -// algorithms are optimized for their respective platforms. You can still -// compile and run any of them on any platform, but your performance with the -// non-native version will be less than optimal. 
- -#include "MurmurHash3.h" - -#include "cpu.h" - -//----------------------------------------------------------------------------- -// Platform-specific functions and macros - -// Microsoft Visual Studio - -#if defined(_MSC_VER) - -#define FORCE_INLINE __forceinline - -#include - -#define ROTL32(x,y) _rotl(x,y) -#define ROTL64(x,y) _rotl64(x,y) - -#define BIG_CONSTANT(x) (x) - -// Other compilers - -#else // defined(_MSC_VER) - -#if __GNUC__ >= 7 -#pragma GCC diagnostic warning "-Wimplicit-fallthrough=0" -#endif - -#define FORCE_INLINE __attribute__((always_inline)) inline - -static inline uint32_t rotl32 ( uint32_t x, int8_t r ) -{ - return (x << r) | (x >> (32 - r)); -} - -static inline uint64_t rotl64 ( uint64_t x, int8_t r ) -{ - return (x << r) | (x >> (64 - r)); -} - -#define ROTL32(x,y) rotl32(x,y) -#define ROTL64(x,y) rotl64(x,y) - -#define BIG_CONSTANT(x) (x##LLU) - -#endif // !defined(_MSC_VER) - -//----------------------------------------------------------------------------- -// Block read - if your platform needs to do endian-swapping or can only -// handle aligned reads, do the conversion here - -static FORCE_INLINE uint32_t getblock ( const uint32_t * p, int i ) -{ -#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ - return p[i]; -#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ - return __builtin_bswap32(p[i]); -#else -#error "can't figure out byte order" -#endif -} - -static FORCE_INLINE uint64_t getblock64 ( const uint64_t * p, int i ) -{ -#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ - return p[i]; -#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ - return __builtin_bswap64(p[i]); -#else -#error "can't figure out byte order" -#endif -} - -// Block write -static FORCE_INLINE void putblock (uint32_t *p, int i, uint32_t value) -{ -#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ - p[i] = value; -#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ - p[i] = __builtin_bswap32(value); -#else -#error "can't figure out byte order" -#endif -} - -static FORCE_INLINE void putblock64 (uint64_t *p, int i, uint64_t value) -{ -#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ - p[i] = value; -#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ - p[i] = __builtin_bswap64(value); -#else -#error "can't figure out byte order" -#endif -} - -//----------------------------------------------------------------------------- -// Finalization mix - force all bits of a hash block to avalanche - -static FORCE_INLINE uint32_t fmix32 ( uint32_t h ) -{ - h ^= h >> 16; - h *= 0x85ebca6b; - h ^= h >> 13; - h *= 0xc2b2ae35; - h ^= h >> 16; - - return h; -} - -//---------- - -static FORCE_INLINE uint64_t fmix64 ( uint64_t k ) -{ - k ^= k >> 33; - k *= BIG_CONSTANT(0xff51afd7ed558ccd); - k ^= k >> 33; - k *= BIG_CONSTANT(0xc4ceb9fe1a85ec53); - k ^= k >> 33; - - return k; -} - -//----------------------------------------------------------------------------- - -void MurmurHash3_x86_32 ( const void * key, int len, - uint32_t seed, void * out ) -{ - const uint8_t * data = (const uint8_t*)key; - const int nblocks = len / 4; - - uint32_t h1 = seed; - - uint32_t c1 = 0xcc9e2d51; - uint32_t c2 = 0x1b873593; - - const uint8_t *tail; - uint32_t k1 = 0; - - //---------- - // body - - const uint32_t * blocks = (const uint32_t *)(data + nblocks*4); - - int i; - for(i = -nblocks; i; i++) - { - uint32_t k1 = getblock(blocks,i); - - k1 *= c1; - k1 = ROTL32(k1,15); - k1 *= c2; - - h1 ^= k1; - h1 = ROTL32(h1,13); - h1 = h1*5+0xe6546b64; - } - - //---------- - // tail - - tail = (const uint8_t*)(data + nblocks*4); - - switch(len & 3) - { - case 3: k1 ^= tail[2] 
<< 16; - case 2: k1 ^= tail[1] << 8; - case 1: k1 ^= tail[0]; - k1 *= c1; k1 = ROTL32(k1,15); k1 *= c2; h1 ^= k1; - default: break; - }; - - //---------- - // finalization - - h1 ^= len; - - h1 = fmix32(h1); - - putblock(out, 0, h1); -} - -//----------------------------------------------------------------------------- - -void MurmurHash3_x86_128 ( const void * key, const int len, - uint32_t seed, void * out ) -{ - const uint8_t * data = (const uint8_t*)key; - const int nblocks = len / 16; - - uint32_t h1 = seed; - uint32_t h2 = seed; - uint32_t h3 = seed; - uint32_t h4 = seed; - - uint32_t c1 = 0x239b961b; - uint32_t c2 = 0xab0e9789; - uint32_t c3 = 0x38b34ae5; - uint32_t c4 = 0xa1e38b93; - - const uint8_t *tail; - uint32_t k1 = 0; - uint32_t k2 = 0; - uint32_t k3 = 0; - uint32_t k4 = 0; - - //---------- - // body - - const uint32_t * blocks = (const uint32_t *)(data + nblocks*16); - - int i; - for(i = -nblocks; i; i++) - { - uint32_t k1 = getblock(blocks,i*4+0); - uint32_t k2 = getblock(blocks,i*4+1); - uint32_t k3 = getblock(blocks,i*4+2); - uint32_t k4 = getblock(blocks,i*4+3); - - k1 *= c1; k1 = ROTL32(k1,15); k1 *= c2; h1 ^= k1; - - h1 = ROTL32(h1,19); h1 += h2; h1 = h1*5+0x561ccd1b; - - k2 *= c2; k2 = ROTL32(k2,16); k2 *= c3; h2 ^= k2; - - h2 = ROTL32(h2,17); h2 += h3; h2 = h2*5+0x0bcaa747; - - k3 *= c3; k3 = ROTL32(k3,17); k3 *= c4; h3 ^= k3; - - h3 = ROTL32(h3,15); h3 += h4; h3 = h3*5+0x96cd1c35; - - k4 *= c4; k4 = ROTL32(k4,18); k4 *= c1; h4 ^= k4; - - h4 = ROTL32(h4,13); h4 += h1; h4 = h4*5+0x32ac3b17; - } - - //---------- - // tail - - tail = (const uint8_t*)(data + nblocks*16); - - switch(len & 15) - { - case 15: k4 ^= tail[14] << 16; - case 14: k4 ^= tail[13] << 8; - case 13: k4 ^= tail[12] << 0; - k4 *= c4; k4 = ROTL32(k4,18); k4 *= c1; h4 ^= k4; - - case 12: k3 ^= tail[11] << 24; - case 11: k3 ^= tail[10] << 16; - case 10: k3 ^= tail[ 9] << 8; - case 9: k3 ^= tail[ 8] << 0; - k3 *= c3; k3 = ROTL32(k3,17); k3 *= c4; h3 ^= k3; - - case 8: k2 ^= tail[ 7] << 24; - case 7: k2 ^= tail[ 6] << 16; - case 6: k2 ^= tail[ 5] << 8; - case 5: k2 ^= tail[ 4] << 0; - k2 *= c2; k2 = ROTL32(k2,16); k2 *= c3; h2 ^= k2; - - case 4: k1 ^= tail[ 3] << 24; - case 3: k1 ^= tail[ 2] << 16; - case 2: k1 ^= tail[ 1] << 8; - case 1: k1 ^= tail[ 0] << 0; - k1 *= c1; k1 = ROTL32(k1,15); k1 *= c2; h1 ^= k1; - default: break; - }; - - //---------- - // finalization - - h1 ^= len; h2 ^= len; h3 ^= len; h4 ^= len; - - h1 += h2; h1 += h3; h1 += h4; - h2 += h1; h3 += h1; h4 += h1; - - h1 = fmix32(h1); - h2 = fmix32(h2); - h3 = fmix32(h3); - h4 = fmix32(h4); - - h1 += h2; h1 += h3; h1 += h4; - h2 += h1; h3 += h1; h4 += h1; - - putblock((uint32_t*)out, 0, h1); - putblock((uint32_t*)out, 1, h2); - putblock((uint32_t*)out, 2, h3); - putblock((uint32_t*)out, 3, h4); -} - -//----------------------------------------------------------------------------- - -void MurmurHash3_x64_128 ( const void * key, const int len, - const uint32_t seed, void * out ) -{ - const uint8_t * data = (const uint8_t*)key; - const int nblocks = len / 16; - - uint64_t h1 = seed; - uint64_t h2 = seed; - - uint64_t c1 = BIG_CONSTANT(0x87c37b91114253d5); - uint64_t c2 = BIG_CONSTANT(0x4cf5ad432745937f); - - const uint8_t *tail; - uint64_t k1 = 0, k2 = 0; - - //---------- - // body - - const uint64_t * blocks = (const uint64_t *)(data); - - int i; - for(i = 0; i < nblocks; i++) - { - uint64_t k1 = getblock64(blocks,i*2+0); - uint64_t k2 = getblock64(blocks,i*2+1); - - k1 *= c1; k1 = ROTL64(k1,31); k1 *= c2; h1 ^= k1; - - h1 = ROTL64(h1,27); h1 += 
h2; h1 = h1*5+0x52dce729; - - k2 *= c2; k2 = ROTL64(k2,33); k2 *= c1; h2 ^= k2; - - h2 = ROTL64(h2,31); h2 += h1; h2 = h2*5+0x38495ab5; - } - - //---------- - // tail - - tail = (const uint8_t*)(data + nblocks*16); - - switch(len & 15) - { - case 15: k2 ^= ((uint64_t)tail[14]) << 48; - case 14: k2 ^= ((uint64_t)tail[13]) << 40; - case 13: k2 ^= ((uint64_t)tail[12]) << 32; - case 12: k2 ^= ((uint64_t)tail[11]) << 24; - case 11: k2 ^= ((uint64_t)tail[10]) << 16; - case 10: k2 ^= ((uint64_t)tail[ 9]) << 8; - case 9: k2 ^= ((uint64_t)tail[ 8]) << 0; - k2 *= c2; k2 = ROTL64(k2,33); k2 *= c1; h2 ^= k2; - - case 8: k1 ^= ((uint64_t)tail[ 7]) << 56; - case 7: k1 ^= ((uint64_t)tail[ 6]) << 48; - case 6: k1 ^= ((uint64_t)tail[ 5]) << 40; - case 5: k1 ^= ((uint64_t)tail[ 4]) << 32; - case 4: k1 ^= ((uint64_t)tail[ 3]) << 24; - case 3: k1 ^= ((uint64_t)tail[ 2]) << 16; - case 2: k1 ^= ((uint64_t)tail[ 1]) << 8; - case 1: k1 ^= ((uint64_t)tail[ 0]) << 0; - k1 *= c1; k1 = ROTL64(k1,31); k1 *= c2; h1 ^= k1; - default: break; - }; - - //---------- - // finalization - - h1 ^= len; h2 ^= len; - - h1 += h2; - h2 += h1; - - h1 = fmix64(h1); - h2 = fmix64(h2); - - h1 += h2; - h2 += h1; - - putblock64((uint64_t*)out, 0, h1); - putblock64((uint64_t*)out, 1, h2); -} diff --git a/uds/murmur/MurmurHash3.h b/uds/murmur/MurmurHash3.h deleted file mode 100644 index ae9ab903..00000000 --- a/uds/murmur/MurmurHash3.h +++ /dev/null @@ -1,28 +0,0 @@ -//----------------------------------------------------------------------------- -// MurmurHash3 was written by Austin Appleby, and is placed in the public -// domain. The author hereby disclaims copyright to this source code. - -#ifndef _MURMURHASH3_H_ -#define _MURMURHASH3_H_ - -//----------------------------------------------------------------------------- -// Platform-specific functions and macros - -// Linux kernel - -# include - -// Microsoft Visual Studio - - -//----------------------------------------------------------------------------- - -void MurmurHash3_x86_32 ( const void * key, int len, uint32_t seed, void * out ); - -void MurmurHash3_x86_128 ( const void * key, int len, uint32_t seed, void * out ); - -void MurmurHash3_x64_128 ( const void * key, int len, uint32_t seed, void * out ); - -//----------------------------------------------------------------------------- - -#endif // _MURMURHASH3_H_ diff --git a/uds/nonce.c b/uds/nonce.c deleted file mode 100644 index d4ab1810..00000000 --- a/uds/nonce.c +++ /dev/null @@ -1,69 +0,0 @@ -/* - * Copyright Red Hat - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. 
- * - * $Id: //eng/uds-releases/krusty/src/uds/nonce.c#16 $ - */ - -#include "nonce.h" - -#include "murmur/MurmurHash3.h" -#include "numeric.h" -#include "random.h" -#include "stringUtils.h" -#include "timeUtils.h" - -/**********************************************************************/ -static uint64_t hash_stuff(uint64_t start, const void *data, size_t len) -{ - uint32_t seed = start ^ (start >> 27); - byte hash_buffer[16]; - MurmurHash3_x64_128(data, len, seed, hash_buffer); - return get_unaligned_le64(hash_buffer + 4); -} - -/**********************************************************************/ -void create_unique_nonce_data(byte *buffer) -{ - ktime_t now = current_time_ns(CLOCK_REALTIME); - uint32_t rand = random_in_range(1, (1 << 30) - 1); - size_t offset = 0; - - // Fill NONCE_INFO_SIZE bytes with copies of the time and a - // pseudorandom number. - memcpy(buffer + offset, &now, sizeof(now)); - offset += sizeof(now); - memcpy(buffer + offset, &rand, sizeof(rand)); - offset += sizeof(rand); - while (offset < NONCE_INFO_SIZE) { - size_t len = min(NONCE_INFO_SIZE - offset, offset); - memcpy(buffer + offset, buffer, len); - offset += len; - } -} - -/**********************************************************************/ -uint64_t generate_primary_nonce(const void *data, size_t len) -{ - return hash_stuff(0xa1b1e0fc, data, len); -} - -/**********************************************************************/ -uint64_t generate_secondary_nonce(uint64_t nonce, const void *data, size_t len) -{ - return hash_stuff(nonce + 1, data, len); -} diff --git a/uds/nonce.h b/uds/nonce.h deleted file mode 100644 index a2e688b6..00000000 --- a/uds/nonce.h +++ /dev/null @@ -1,62 +0,0 @@ -/* - * Copyright Red Hat - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/uds-releases/krusty/src/uds/nonce.h#7 $ - */ - -#ifndef NONCE_H -#define NONCE_H - -#include "typeDefs.h" - -enum { NONCE_INFO_SIZE = 32 }; - -/** - * Create NONCE_INFO_SIZE (32) bytes of unique data for generating a - * nonce, using the current time and a pseudorandom number. - * - * @param buffer Where to put the data - **/ -void create_unique_nonce_data(byte *buffer); - -/** - * Generate a primary nonce, using the specified data. - * - * @param data Some arbitrary information. - * @param len The length of the information. - * - * @return a number which will be fairly unique - **/ -uint64_t generate_primary_nonce(const void *data, size_t len); - -/** - * Deterministically generate a secondary nonce based on an existing - * nonce and some arbitrary data. Effectively hashes the nonce and - * the data to produce a new nonce which is deterministic. - * - * @param nonce An existing nonce which is well known. - * @param data Some data related to the creation of this nonce. - * @param len The length of the data. 
- * - * @return a number which will be fairly unique and depend solely on - * the nonce and the data. - **/ -uint64_t -generate_secondary_nonce(uint64_t nonce, const void *data, size_t len); - -#endif // NONCE_H diff --git a/uds/openChapter.c b/uds/openChapter.c deleted file mode 100644 index 5da67b43..00000000 --- a/uds/openChapter.c +++ /dev/null @@ -1,370 +0,0 @@ -/* - * Copyright Red Hat - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/uds-releases/krusty/src/uds/openChapter.c#37 $ - */ - -#include "openChapter.h" - -#include "compiler.h" -#include "logger.h" -#include "memoryAlloc.h" -#include "numeric.h" -#include "zone.h" - -static int read_open_chapters(struct read_portal *portal); -static int write_open_chapters(struct index_component *component, - struct buffered_writer *writer, - unsigned int zone); - -const struct index_component_info OPEN_CHAPTER_INFO = { - .kind = RL_KIND_OPEN_CHAPTER, - .name = "open chapter", - .save_only = true, - .chapter_sync = false, - .multi_zone = false, - .io_storage = true, - .loader = read_open_chapters, - .saver = write_open_chapters, - .incremental = NULL, -}; - -static const byte OPEN_CHAPTER_MAGIC[] = "ALBOC"; -static const byte OPEN_CHAPTER_VERSION[] = "02.00"; - -enum { - OPEN_CHAPTER_MAGIC_LENGTH = sizeof(OPEN_CHAPTER_MAGIC) - 1, - OPEN_CHAPTER_VERSION_LENGTH = sizeof(OPEN_CHAPTER_VERSION) - 1 -}; - -/**********************************************************************/ -static int fill_delta_chapter_index(struct open_chapter_zone **chapter_zones, - unsigned int zone_count, - struct open_chapter_index *index, - struct uds_chunk_record *collated_records) -{ - // Find a record to replace any deleted records, and fill the chapter - // if it was closed early. The last record in any filled zone is - // guaranteed to not have been deleted in this chapter, so use one of - // those. 
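	/*
	 * Editor's illustration (not from the original file; numbers are
	 * hypothetical): with zone_count == 2 and a per-zone capacity of 4, a
	 * zone that filled completely has size == capacity, so its records[4]
	 * entry is a valid, never-deleted fill record. In the collation loop
	 * below, records_added == 5 maps to zone 5 % 2 == 1 and record_number
	 * 1 + 5 / 2 == 3; if that slot is exhausted or marked record_deleted,
	 * a copy of the fill record is stored instead, so every entry of
	 * collated_records holds valid data.
	 */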
- struct open_chapter_zone *fill_chapter_zone = NULL; - struct uds_chunk_record *fill_record = NULL; - unsigned int z, pages_per_chapter, records_per_page, page; - unsigned int records_added = 0, zone = 0; - int result, overflow_count = 0; - const struct geometry *geometry; - - for (z = 0; z < zone_count; ++z) { - fill_chapter_zone = chapter_zones[z]; - if (fill_chapter_zone->size == fill_chapter_zone->capacity) { - fill_record = - &fill_chapter_zone - ->records[fill_chapter_zone->size]; - break; - } - } - result = - ASSERT((fill_record != NULL), "some open chapter zone filled"); - if (result != UDS_SUCCESS) { - return result; - } - result = ASSERT(!fill_chapter_zone->slots[fill_chapter_zone->size] - .record_deleted, - "chapter fill record not deleted"); - if (result != UDS_SUCCESS) { - return result; - } - - geometry = index->geometry; - pages_per_chapter = geometry->record_pages_per_chapter; - records_per_page = geometry->records_per_page; - - for (page = 0; page < pages_per_chapter; page++) { - unsigned int i; - for (i = 0; i < records_per_page; - i++, records_added++, zone = (zone + 1) % zone_count) { - struct uds_chunk_record *next_record; - // The record arrays are 1-based. - unsigned int record_number = - 1 + (records_added / zone_count); - - // If the zone has been exhausted, or the record was - // deleted, add the fill record to the chapter. - if (record_number > chapter_zones[zone]->size || - chapter_zones[zone] - ->slots[record_number] - .record_deleted) { - collated_records[1 + records_added] = - *fill_record; - continue; - } - - next_record = - &chapter_zones[zone]->records[record_number]; - collated_records[1 + records_added] = *next_record; - - result = put_open_chapter_index_record(index, - &next_record->name, - page); - switch (result) { - case UDS_SUCCESS: - break; - case UDS_OVERFLOW: - overflow_count++; - break; - default: - uds_log_error_strerror(result, - "failed to build open chapter index"); - return result; - } - } - } - if (overflow_count > 0) { - uds_log_warning("Failed to add %d entries to chapter index", - overflow_count); - } - return UDS_SUCCESS; -} - -/**********************************************************************/ -int close_open_chapter(struct open_chapter_zone **chapter_zones, - unsigned int zone_count, - struct volume *volume, - struct open_chapter_index *chapter_index, - struct uds_chunk_record *collated_records, - uint64_t virtual_chapter_number) -{ - int result; - - // Empty the delta chapter index, and prepare it for the new virtual - // chapter. - empty_open_chapter_index(chapter_index, virtual_chapter_number); - - // Map each non-deleted record name to its record page number in the - // delta chapter index. - result = fill_delta_chapter_index(chapter_zones, zone_count, - chapter_index, collated_records); - if (result != UDS_SUCCESS) { - return result; - } - - // Pass the populated chapter index and the records to the volume, - // which will generate and write the index and record pages for the - // chapter. 
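	/*
	 * Editor's note: at this point collated_records[1] onward holds one
	 * valid record for every slot in the chapter (deleted slots were
	 * replaced by the fill record above), and chapter_index maps each
	 * surviving record name to the record page that will hold it.
	 */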
- return write_chapter(volume, chapter_index, collated_records); -} - -/**********************************************************************/ -int save_open_chapters(struct uds_index *index, struct buffered_writer *writer) -{ - uint32_t total_records = 0, records_added = 0; - unsigned int i, record_index; - byte total_record_data[sizeof(total_records)]; - int result = write_to_buffered_writer(writer, OPEN_CHAPTER_MAGIC, - OPEN_CHAPTER_MAGIC_LENGTH); - if (result != UDS_SUCCESS) { - return result; - } - - result = write_to_buffered_writer(writer, OPEN_CHAPTER_VERSION, - OPEN_CHAPTER_VERSION_LENGTH); - if (result != UDS_SUCCESS) { - return result; - } - - for (i = 0; i < index->zone_count; i++) { - total_records += - open_chapter_size(index->zones[i]->open_chapter); - } - - // Store the record count in little-endian order. - put_unaligned_le32(total_records, total_record_data); - - result = write_to_buffered_writer(writer, total_record_data, - sizeof(total_record_data)); - if (result != UDS_SUCCESS) { - return result; - } - - // Only write out the records that have been added and not deleted. - record_index = 1; - while (records_added < total_records) { - unsigned int i; - for (i = 0; i < index->zone_count; i++) { - struct open_chapter_zone *open_chapter = - index->zones[i]->open_chapter; - struct uds_chunk_record *record; - if (record_index > open_chapter->size) { - continue; - } - if (open_chapter->slots[record_index].record_deleted) { - continue; - } - record = &open_chapter->records[record_index]; - result = write_to_buffered_writer(writer, - record, - sizeof(struct uds_chunk_record)); - if (result != UDS_SUCCESS) { - return result; - } - records_added++; - } - record_index++; - } - - return flush_buffered_writer(writer); -} - -/**********************************************************************/ -uint64_t compute_saved_open_chapter_size(struct geometry *geometry) -{ - return OPEN_CHAPTER_MAGIC_LENGTH + OPEN_CHAPTER_VERSION_LENGTH + - sizeof(uint32_t) + - geometry->records_per_chapter * sizeof(struct uds_chunk_record); -} - -/**********************************************************************/ -static int write_open_chapters(struct index_component *component, - struct buffered_writer *writer, - unsigned int zone) -{ - struct uds_index *index; - int result = ASSERT((zone == 0), "open chapter write not zoned"); - if (result != UDS_SUCCESS) { - return result; - } - - index = index_component_data(component); - return save_open_chapters(index, writer); -} - -/** - * Read the version field from a buffered reader, checking whether it is a - * supported version. Returns (via a pointer parameter) the matching - * version constant, which can be used by comparing to the version - * constants using simple pointer equality. - * - * @param [in] reader A buffered reader. - * @param [out] version The version constant that was matched. 
- * - * @return UDS_SUCCESS or an error code if the file could not be read or - * the version is invalid or unsupported - **/ -static int read_version(struct buffered_reader *reader, const byte **version) -{ - byte buffer[OPEN_CHAPTER_VERSION_LENGTH]; - int result = read_from_buffered_reader(reader, buffer, sizeof(buffer)); - if (result != UDS_SUCCESS) { - return result; - } - if (memcmp(OPEN_CHAPTER_VERSION, buffer, sizeof(buffer)) != 0) { - return uds_log_error_strerror(UDS_CORRUPT_COMPONENT, - "Invalid open chapter version: %.*s", - (int) sizeof(buffer), - buffer); - } - *version = OPEN_CHAPTER_VERSION; - return UDS_SUCCESS; -} - -/**********************************************************************/ -static int load_version20(struct uds_index *index, - struct buffered_reader *reader) -{ - uint32_t num_records, records; - byte num_records_data[sizeof(uint32_t)]; - struct uds_chunk_record record; - - // Keep track of which zones cannot accept any more records. - bool full_flags[MAX_ZONES] = { - false, - }; - - int result = read_from_buffered_reader(reader, &num_records_data, - sizeof(num_records_data)); - if (result != UDS_SUCCESS) { - return result; - } - num_records = get_unaligned_le32(num_records_data); - - // Assign records to the correct zones. - for (records = 0; records < num_records; records++) { - unsigned int zone = 0; - result = read_from_buffered_reader(reader, &record, - sizeof(struct uds_chunk_record)); - if (result != UDS_SUCCESS) { - return result; - } - - if (index->zone_count > 1) { - // A read-only index has no volume index, but it also - // has only one zone. - zone = get_volume_index_zone(index->volume_index, - &record.name); - } - // Add records until the open chapter zone almost runs out of - // space. The chapter can't be closed here, so don't add the - // last record. - if (!full_flags[zone]) { - unsigned int remaining; - result = put_open_chapter(index->zones[zone]->open_chapter, - &record.name, - &record.data, - &remaining); - full_flags[zone] = (remaining <= 1); - if (result != UDS_SUCCESS) { - return result; - } - } - } - - return UDS_SUCCESS; -} - -/**********************************************************************/ -int load_open_chapters(struct uds_index *index, struct buffered_reader *reader) -{ - const byte *version = NULL; - // Read and check the magic number. - int result = verify_buffered_data(reader, OPEN_CHAPTER_MAGIC, - OPEN_CHAPTER_MAGIC_LENGTH); - if (result != UDS_SUCCESS) { - return result; - } - - // Read and check the version. - result = read_version(reader, &version); - if (result != UDS_SUCCESS) { - return result; - } - - return load_version20(index, reader); -} - -/**********************************************************************/ -int read_open_chapters(struct read_portal *portal) -{ - struct uds_index *index = index_component_data(portal->component); - - struct buffered_reader *reader; - int result = get_buffered_reader_for_portal(portal, 0, &reader); - if (result != UDS_SUCCESS) { - return result; - } - return load_open_chapters(index, reader); -} diff --git a/uds/openChapter.h b/uds/openChapter.h deleted file mode 100644 index 9d98cd54..00000000 --- a/uds/openChapter.h +++ /dev/null @@ -1,100 +0,0 @@ -/* - * Copyright Red Hat - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. 
- * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/uds-releases/krusty/src/uds/openChapter.h#15 $ - */ - -#ifndef OPENCHAPTER_H -#define OPENCHAPTER_H 1 - -#include "common.h" -#include "geometry.h" -#include "index.h" -#include "indexComponent.h" - -extern const struct index_component_info OPEN_CHAPTER_INFO; - -/** - * OpenChapter handles writing the open chapter records to the volume. It also - * manages the open chapter index component, and all the tools to generate and - * parse the open chapter file. The open chapter file interleaves records from - * each openChapterZone structure. - * - *

Once each open chapter zone is filled, the records are interleaved to - * preserve temporal locality, the index pages are generated through a - * delta chapter index, and the record pages are derived by sorting each - * page-sized batch of records by their names. - * - *

Upon index shutdown, the open chapter zone records are again - * interleaved, and the records are stored as a single array. The hash - * slots are not preserved, since the records may be reassigned to new - * zones at load time. - **/ - -/** - * Close the open chapter and write it to disk. - * - * @param chapter_zones The zones of the chapter to close - * @param zone_count The number of zones - * @param volume The volume to which to write the chapter - * @param chapter_index The open_chapter_index to use while writing - * @param collated_records Collated records array to use while writing - * @param virtual_chapter_number The virtual chapter number of the open chapter - * - * @return UDS_SUCCESS or an error code - **/ -int __must_check close_open_chapter(struct open_chapter_zone **chapter_zones, - unsigned int zone_count, - struct volume *volume, - struct open_chapter_index *chapter_index, - struct uds_chunk_record *collated_records, - uint64_t virtual_chapter_number); - -/** - * Write out a partially filled chapter to a file. - * - * @param index the index to save the data from - * @param writer the writer to write out the chapters - * - * @return UDS_SUCCESS on success - **/ -int __must_check save_open_chapters(struct uds_index *index, - struct buffered_writer *writer); - -/** - * Read a partially filled chapter from a file. - * - * @param index the index to load the data into - * @param reader the buffered reader to read from - * - * @return UDS_SUCCESS on success - **/ -int __must_check load_open_chapters(struct uds_index *index, - struct buffered_reader *reader); - -/** - * Compute the size of the maximum open chapter save image. - * - * @param geometry the index geometry - * - * @return the number of bytes of the largest possible open chapter save - * image - **/ -uint64_t compute_saved_open_chapter_size(struct geometry *geometry); - -#endif /* OPENCHAPTER_H */ diff --git a/uds/openChapterZone.c b/uds/openChapterZone.c deleted file mode 100644 index c7595978..00000000 --- a/uds/openChapterZone.c +++ /dev/null @@ -1,271 +0,0 @@ -/* - * Copyright Red Hat - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. 
- * - * $Id: //eng/uds-releases/krusty/src/uds/openChapterZone.c#21 $ - */ - -#include "openChapterZone.h" - -#include "compiler.h" -#include "hashUtils.h" -#include "logger.h" -#include "memoryAlloc.h" -#include "permassert.h" - -/**********************************************************************/ -static INLINE size_t records_size(const struct open_chapter_zone *open_chapter) -{ - return (sizeof(struct uds_chunk_record) * - (1 + open_chapter->capacity)); -} - -/**********************************************************************/ -static INLINE size_t slots_size(size_t slot_count) -{ - return (sizeof(struct open_chapter_zone_slot) * slot_count); -} - -/** - * Round up to the first power of two greater than or equal - * to the supplied number. - * - * @param val the number to round up - * - * @return the first power of two not smaller than val for any - * val <= 2^63 - **/ -static INLINE size_t next_power_of_two(size_t val) -{ - if (val == 0) { - return 1; - } - return (1 << compute_bits(val - 1)); -} - -/**********************************************************************/ -int make_open_chapter(const struct geometry *geometry, - unsigned int zone_count, - struct open_chapter_zone **open_chapter_ptr) -{ - struct open_chapter_zone *open_chapter; - size_t capacity, slot_count; - int result = ASSERT(zone_count > 0, "zone count must be > 0"); - if (result != UDS_SUCCESS) { - return result; - } - result = - ASSERT_WITH_ERROR_CODE(geometry->open_chapter_load_ratio > 1, - UDS_BAD_STATE, - "Open chapter hash table is too small"); - if (result != UDS_SUCCESS) { - return result; - } - result = ASSERT_WITH_ERROR_CODE((geometry->records_per_chapter <= - OPEN_CHAPTER_MAX_RECORD_NUMBER), - UDS_BAD_STATE, - "Too many records (%u) for a single chapter", - geometry->records_per_chapter); - if (result != UDS_SUCCESS) { - return result; - } - - if (geometry->records_per_chapter < zone_count) { - return uds_log_error_strerror(UDS_INVALID_ARGUMENT, - "zone count: %u is larger than the records per chapter %u", - zone_count, - geometry->records_per_chapter); - } - capacity = geometry->records_per_chapter / zone_count; - - // The slot count must be at least one greater than the capacity. - // Using a power of two slot count guarantees that hash insertion - // will never fail if the hash table is not full. 
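	/*
	 * Editor's illustration (hypothetical numbers, not from the original
	 * file): the assertion above requires open_chapter_load_ratio > 1, so
	 * the table always has more slots than records. With capacity == 768
	 * and open_chapter_load_ratio == 2, the call below evaluates
	 * next_power_of_two(1536) == 2048, giving 2048 hash slots for at most
	 * 768 records.
	 */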
- slot_count = next_power_of_two(capacity * - geometry->open_chapter_load_ratio); - result = UDS_ALLOCATE_EXTENDED(struct open_chapter_zone, - slot_count, - struct open_chapter_zone_slot, - "open chapter", - &open_chapter); - if (result != UDS_SUCCESS) { - return result; - } - open_chapter->slot_count = slot_count; - open_chapter->capacity = capacity; - result = uds_allocate_cache_aligned(records_size(open_chapter), - "record pages", - &open_chapter->records); - if (result != UDS_SUCCESS) { - free_open_chapter(open_chapter); - return result; - } - - *open_chapter_ptr = open_chapter; - return UDS_SUCCESS; -} - -/**********************************************************************/ -size_t open_chapter_size(const struct open_chapter_zone *open_chapter) -{ - return open_chapter->size - open_chapter->deleted; -} - -/**********************************************************************/ -void reset_open_chapter(struct open_chapter_zone *open_chapter) -{ - open_chapter->size = 0; - open_chapter->deleted = 0; - - memset(open_chapter->records, 0, records_size(open_chapter)); - memset(open_chapter->slots, 0, slots_size(open_chapter->slot_count)); -} - -/**********************************************************************/ -static struct uds_chunk_record * -probe_chapter_slots(struct open_chapter_zone *open_chapter, - const struct uds_chunk_name *name, - unsigned int *slot_ptr, - unsigned int *record_number_ptr) -{ - unsigned int slots = open_chapter->slot_count; - unsigned int probe = name_to_hash_slot(name, slots); - unsigned int first_slot = 0; - - struct uds_chunk_record *record; - unsigned int probe_slot; - unsigned int record_number; - unsigned int probe_attempts; - - for (probe_attempts = 1;; ++probe_attempts) { - probe_slot = first_slot + probe; - record_number = open_chapter->slots[probe_slot].record_number; - - // If the hash slot is empty, we've reached the end of a chain - // without finding the record and should terminate the search. - if (record_number == 0) { - record = NULL; - break; - } - - // If the name of the record referenced by the slot matches and - // has not been deleted, then we've found the requested name. - record = &open_chapter->records[record_number]; - if ((memcmp(&record->name, name, UDS_CHUNK_NAME_SIZE) == 0) && - !open_chapter->slots[record_number].record_deleted) { - break; - } - - // Quadratic probing: advance the probe by 1, 2, 3, etc. and - // try again. This performs better than linear probing and - // works best for 2^N slots. - probe += probe_attempts; - if (probe >= slots) { - probe = probe % slots; - } - } - - // These NULL checks will be optimized away in callers who don't care - // about the values when this function is inlined. 
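	/*
	 * Editor's note, summarizing the callers that appear later in this
	 * file: search_open_chapter() passes (NULL, NULL) because it only
	 * needs the returned record, put_open_chapter() passes (&slot, NULL)
	 * so it can claim the probed slot for a new record, and
	 * remove_from_open_chapter() passes (NULL, &record_number) so it can
	 * mark that record as deleted.
	 */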
- if (slot_ptr != NULL) { - *slot_ptr = probe_slot; - } - if (record_number_ptr != NULL) { - *record_number_ptr = record_number; - } - - return record; -} - -/**********************************************************************/ -void search_open_chapter(struct open_chapter_zone *open_chapter, - const struct uds_chunk_name *name, - struct uds_chunk_data *metadata, - bool *found) -{ - struct uds_chunk_record *record = - probe_chapter_slots(open_chapter, name, NULL, NULL); - - if (record == NULL) { - *found = false; - } else { - *found = true; - if (metadata != NULL) { - *metadata = record->data; - } - } -} - -/**********************************************************************/ -int put_open_chapter(struct open_chapter_zone *open_chapter, - const struct uds_chunk_name *name, - const struct uds_chunk_data *metadata, - unsigned int *remaining) -{ - unsigned int slot, record_number; - struct uds_chunk_record *record = - probe_chapter_slots(open_chapter, name, &slot, NULL); - - if (record != NULL) { - record->data = *metadata; - *remaining = open_chapter->capacity - open_chapter->size; - return UDS_SUCCESS; - } - - if (open_chapter->size >= open_chapter->capacity) { - return UDS_VOLUME_OVERFLOW; - } - - record_number = ++open_chapter->size; - open_chapter->slots[slot].record_number = record_number; - record = &open_chapter->records[record_number]; - record->name = *name; - record->data = *metadata; - - *remaining = open_chapter->capacity - open_chapter->size; - return UDS_SUCCESS; -} - -/**********************************************************************/ -void remove_from_open_chapter(struct open_chapter_zone *open_chapter, - const struct uds_chunk_name *name, - bool *removed) -{ - unsigned int record_number; - struct uds_chunk_record *record = - probe_chapter_slots(open_chapter, name, NULL, &record_number); - - if (record == NULL) { - *removed = false; - return; - } - - // Set the deleted flag on the record_number in the slot array so - // search won't find it and close won't index it. - open_chapter->slots[record_number].record_deleted = true; - open_chapter->deleted += 1; - *removed = true; -} - -/**********************************************************************/ -void free_open_chapter(struct open_chapter_zone *open_chapter) -{ - if (open_chapter != NULL) { - UDS_FREE(open_chapter->records); - UDS_FREE(open_chapter); - } -} diff --git a/uds/openChapterZone.h b/uds/openChapterZone.h deleted file mode 100644 index af9574f4..00000000 --- a/uds/openChapterZone.h +++ /dev/null @@ -1,167 +0,0 @@ -/* - * Copyright Red Hat - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. 
- * - * $Id: //eng/uds-releases/krusty/src/uds/openChapterZone.h#12 $ - */ - -#ifndef OPEN_CHAPTER_ZONE_H -#define OPEN_CHAPTER_ZONE_H 1 - -#include "common.h" -#include "geometry.h" -#include "typeDefs.h" - -/** - * open_chapter_zone is the mutable, in-memory representation of one zone's - * section of an Albireo index chapter. - * - *

In addition to providing the same access to records as an on-disk - * chapter, the open chapter zone must allow records to be added or - * modified. It must provide a way to generate the on-disk representation - * without excessive work. It does that by accumulating records in the order - * they are added (maintaining temporal locality), and referencing them (as - * record numbers) from hash slots selected from the name. If the metadata for - * a name changes, the record field is just modified in place. - * - *

Storage for the records (names and metadata) is allocated when the zone - * is created. It keeps no references to the data passed to it, and performs - * no additional allocation when adding records. Opening a new chapter simply - * marks it as being empty. - * - *

Records are stored in a flat array. To allow a value of zero in a - * hash slot to indicate that the slot is empty, records are numbered starting - * at one (1-based). Since C arrays are 0-based, the records array contains - * enough space for N+1 records, and the record that starts at array index - * zero is never used or referenced. - * - *

The array of hash slots is actually two arrays, superimposed: an - * array of record numbers, indexed by hash value, and an array of deleted - * flags, indexed by record number. This overlay is possible because the - * number of hash slots always exceeds the number of records, and is done - * simply to save on memory. - **/ - -enum { - OPEN_CHAPTER_RECORD_NUMBER_BITS = 23, - OPEN_CHAPTER_MAX_RECORD_NUMBER = - (1 << OPEN_CHAPTER_RECORD_NUMBER_BITS) - 1 -}; - -struct open_chapter_zone_slot { - /** If non-zero, the record number addressed by this hash slot */ - unsigned int record_number : OPEN_CHAPTER_RECORD_NUMBER_BITS; - /** If true, the record at the index of this hash slot was deleted */ - bool record_deleted : 1; -} __packed; - -struct open_chapter_zone { - /** Maximum number of records that can be stored */ - unsigned int capacity; - /** Number of records stored */ - unsigned int size; - /** Number of deleted records */ - unsigned int deleted; - /** Record data, stored as (name, metadata), 1-based */ - struct uds_chunk_record *records; - /** The number of slots in the chapter zone hash table. */ - unsigned int slot_count; - /** Hash table, referencing virtual record numbers */ - struct open_chapter_zone_slot slots[]; -}; - -/** - * Allocate an open chapter zone. - * - * @param geometry the geometry of the volume - * @param zone_count the total number of open chapter zones - * @param open_chapter_ptr a pointer to hold the new open chapter - * - * @return UDS_SUCCESS or an error code - **/ -int __must_check -make_open_chapter(const struct geometry *geometry, - unsigned int zone_count, - struct open_chapter_zone **open_chapter_ptr); - -/** - * Return the number of records in the open chapter zone that have not been - * deleted. - * - * @return the number of non-deleted records - **/ -size_t __must_check -open_chapter_size(const struct open_chapter_zone *open_chapter); - -/** - * Open a chapter by marking it empty. - * - * @param open_chapter The chapter to open - **/ -void reset_open_chapter(struct open_chapter_zone *open_chapter); - -/** - * Search the open chapter for a chunk name. - * - * @param open_chapter The chapter to search - * @param name The name of the desired chunk - * @param metadata The holder for the metadata associated with the - * chunk, if found (or NULL) - * @param found A pointer which will be set to true if the chunk - * name was found - **/ -void search_open_chapter(struct open_chapter_zone *open_chapter, - const struct uds_chunk_name *name, - struct uds_chunk_data *metadata, - bool *found); - -/** - * Put a record into the open chapter. - * - * @param open_chapter The chapter into which to put the record - * @param name The name of the record - * @param metadata The record data - * @param remaining Pointer to an integer set to the number of additional - * records that can be added to this chapter - * - * @return UDS_SUCCESS or an error code - **/ -int __must_check put_open_chapter(struct open_chapter_zone *open_chapter, - const struct uds_chunk_name *name, - const struct uds_chunk_data *metadata, - unsigned int *remaining); - -/** - * Remove a record from the open chapter. - * - * @param open_chapter The chapter from which to remove the record - * @param name The name of the record - * @param removed Pointer to bool set to true if the - * record was found - **/ -void remove_from_open_chapter(struct open_chapter_zone *open_chapter, - const struct uds_chunk_name *name, - bool *removed); - -/** - * Clean up an open chapter and its memory. 
- * - * @param open_chapter the chapter to destroy - **/ -void free_open_chapter(struct open_chapter_zone *open_chapter); - -#endif /* OPEN_CHAPTER_ZONE_H */ diff --git a/uds/permassert.c b/uds/permassert.c deleted file mode 100644 index c1904b63..00000000 --- a/uds/permassert.c +++ /dev/null @@ -1,55 +0,0 @@ -/* - * Copyright Red Hat - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/uds-releases/krusty/src/uds/permassert.c#19 $ - */ - -#include "permassert.h" - -#include "errors.h" -#include "logger.h" - - -/**********************************************************************/ -int uds_assertion_failed(const char *expression_string, - int code, - const char *module_name, - const char *file_name, - int line_number, - const char *format, - ...) -{ - va_list args; - va_start(args, format); - - uds_log_embedded_message(UDS_LOG_ERR, - module_name, - "assertion \"", - format, - args, - "\" (%s) failed at %s:%d", - expression_string, - file_name, - line_number); - uds_log_backtrace(UDS_LOG_ERR); - - - va_end(args); - - return code; -} diff --git a/uds/random.c b/uds/random.c deleted file mode 100644 index 931d7ece..00000000 --- a/uds/random.c +++ /dev/null @@ -1,37 +0,0 @@ -/* - * Copyright Red Hat - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. 
- * - * $Id: //eng/uds-releases/krusty/src/uds/random.c#4 $ - */ - -#include "random.h" - -#include "permassert.h" - -/**********************************************************************/ -unsigned int random_in_range(unsigned int lo, unsigned int hi) -{ - return lo + random() % (hi - lo + 1); -} - -/**********************************************************************/ -void random_compile_time_assertions(void) -{ - STATIC_ASSERT((((uint64_t) RAND_MAX + 1) & RAND_MAX) == 0); -} - diff --git a/uds/regionIdentifiers.h b/uds/regionIdentifiers.h deleted file mode 100644 index 08264cf2..00000000 --- a/uds/regionIdentifiers.h +++ /dev/null @@ -1,51 +0,0 @@ -/* - * Copyright Red Hat - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/uds-releases/krusty/src/uds/regionIdentifiers.h#7 $ - */ - -#ifndef REGION_IDENTIFIERS_H -#define REGION_IDENTIFIERS_H - -enum region_type { - RH_TYPE_FREE = 0, // unused - RH_TYPE_SUPER = 1, - RH_TYPE_SAVE = 2, - RH_TYPE_CHECKPOINT = 3, - RH_TYPE_UNSAVED = 4, -}; - -enum region_kind { - RL_KIND_SCRATCH = 0, // uninitialized or scrapped - RL_KIND_HEADER = 1, // for self-referential items - RL_KIND_CONFIG = 100, - RL_KIND_INDEX = 101, - RL_KIND_SEAL = 102, - RL_KIND_VOLUME = 201, - RL_KIND_SAVE = 202, - RL_KIND_INDEX_PAGE_MAP = 301, - RL_KIND_VOLUME_INDEX = 302, - RL_KIND_OPEN_CHAPTER = 303, - RL_KIND_INDEX_STATE = 401, // not saved as region -}; - -enum { - RL_SOLE_INSTANCE = 65535, -}; - -#endif // REGION_IDENTIFIERS_H diff --git a/uds/request.c b/uds/request.c deleted file mode 100644 index 99c92707..00000000 --- a/uds/request.c +++ /dev/null @@ -1,233 +0,0 @@ -/* - * Copyright Red Hat - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. 
- * - * $Id: //eng/uds-releases/krusty/src/uds/request.c#28 $ - */ - -#include "request.h" - -#include "index.h" -#include "indexSession.h" -#include "logger.h" -#include "memoryAlloc.h" -#include "permassert.h" -#include "requestQueue.h" - -/**********************************************************************/ -int uds_start_chunk_operation(struct uds_request *request) -{ - size_t internal_size; - int result; - - if (request->callback == NULL) { - uds_log_error("missing required callback"); - return -EINVAL; - } - switch (request->type) { - case UDS_DELETE: - case UDS_POST: - case UDS_QUERY: - case UDS_UPDATE: - break; - default: - uds_log_error("received invalid callback type"); - return -EINVAL; - } - - // Reset all internal fields before processing. - internal_size = sizeof(struct uds_request) - - offsetof(struct uds_request, zone_number); - memset(&request->zone_number, 0, internal_size); - - result = get_index_session(request->session); - if (result != UDS_SUCCESS) { - return result; - } - - request->found = false; - request->unbatched = false; - request->index = request->session->index; - - enqueue_request(request, STAGE_TRIAGE); - return UDS_SUCCESS; -} - -/**********************************************************************/ -int launch_zone_message(struct uds_zone_message message, - unsigned int zone, - struct uds_index *index) -{ - struct uds_request *request; - int result = UDS_ALLOCATE(1, struct uds_request, __func__, &request); - if (result != UDS_SUCCESS) { - return result; - } - - request->index = index; - request->unbatched = true; - request->zone_number = zone; - request->zone_message = message; - - enqueue_request(request, STAGE_MESSAGE); - return UDS_SUCCESS; -} - -/**********************************************************************/ -static struct uds_request_queue * -get_next_stage_queue(struct uds_request *request, - enum request_stage next_stage) -{ - if (next_stage == STAGE_CALLBACK) { - return request->session->callback_queue; - } - - return select_index_queue(request->index, request, next_stage); -} - -/**********************************************************************/ -void enqueue_request(struct uds_request *request, - enum request_stage next_stage) -{ - struct uds_request_queue *next_queue = - get_next_stage_queue(request, next_stage); - if (next_queue == NULL) { - return; - } - - uds_request_queue_enqueue(next_queue, request); -} - -/* - * This function pointer allows unit test code to intercept the slow-lane - * requeuing of a request. - */ -static request_restarter_t request_restarter = NULL; - -/**********************************************************************/ -void restart_request(struct uds_request *request) -{ - request->requeued = true; - if (request_restarter == NULL) { - enqueue_request(request, STAGE_INDEX); - } else { - request_restarter(request); - } -} - -/**********************************************************************/ -void set_request_restarter(request_restarter_t restarter) -{ - request_restarter = restarter; -} - -/**********************************************************************/ -static INLINE void increment_once(uint64_t *count_ptr) -{ - WRITE_ONCE(*count_ptr, READ_ONCE(*count_ptr) + 1); -} - -/**********************************************************************/ -void update_request_context_stats(struct uds_request *request) -{ - /* - * We don't need any synchronization since the context stats are only - * modified from the single callback thread. - * - * We increment either 2 or 3 counters in this method. 
- * - * XXX We always increment the "requests" counter. But there is no - * code that uses the value stored in this counter. - * - * We always increment exactly one of these counters (unless there is - * an error in the code, which never happens): postsFound postsNotFound - * updatesFound updatesNotFound - * deletionsFound deletionsNotFound - * queriesFound queriesNotFound - * - * XXX In the case of post request that were found in the index, we - * increment exactly one of these counters. But there is no code that - * uses the value stored in these counters. inMemoryPostsFound - * densePostsFound - * sparsePostsFound - */ - - struct session_stats *session_stats = &request->session->stats; - - bool found = (request->location != UDS_LOCATION_UNAVAILABLE); - increment_once(&session_stats->requests); - - switch (request->type) { - case UDS_POST: - if (found) { - increment_once(&session_stats->posts_found); - - if (request->location == UDS_LOCATION_IN_OPEN_CHAPTER) { - increment_once(&session_stats->posts_found_open_chapter); - } else if (request->location == UDS_LOCATION_IN_DENSE) { - increment_once(&session_stats->posts_found_dense); - } else if (request->location == UDS_LOCATION_IN_SPARSE) { - increment_once(&session_stats->posts_found_sparse); - } - } else { - increment_once(&session_stats->posts_not_found); - } - break; - - case UDS_UPDATE: - if (found) { - increment_once(&session_stats->updates_found); - } else { - increment_once(&session_stats->updates_not_found); - } - break; - - case UDS_DELETE: - if (found) { - increment_once(&session_stats->deletions_found); - } else { - increment_once(&session_stats->deletions_not_found); - } - break; - - case UDS_QUERY: - if (found) { - increment_once(&session_stats->queries_found); - } else { - increment_once(&session_stats->queries_not_found); - } - break; - - default: - request->status = ASSERT(false, - "unknown request type: %d", - request->type); - } -} - -/**********************************************************************/ -void enter_callback_stage(struct uds_request *request) -{ - if (request->status != UDS_SUCCESS) { - // All request errors are considered unrecoverable - disable_index_session(request->session); - } - - request->status = uds_map_to_system_error(request->status); - // Handle asynchronous client callbacks in the designated thread. - enqueue_request(request, STAGE_CALLBACK); -} diff --git a/uds/request.h b/uds/request.h deleted file mode 100644 index d71fb1e4..00000000 --- a/uds/request.h +++ /dev/null @@ -1,123 +0,0 @@ -/* - * Copyright Red Hat - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. 
- * - * $Id: //eng/uds-releases/krusty/src/uds/request.h#21 $ - */ - -#ifndef REQUEST_H -#define REQUEST_H - -#include "cacheCounters.h" -#include "common.h" -#include "compiler.h" -#include "uds-threads.h" -#include "timeUtils.h" -#include "uds.h" - -/** - * Abstract request pipeline stages, which can also be viewed as stages in the - * life-cycle of a request. - **/ -enum request_stage { - STAGE_TRIAGE, - STAGE_INDEX, - STAGE_CALLBACK, - STAGE_MESSAGE, -}; - -typedef void (*request_restarter_t)(struct uds_request *); - -/** - * Make an asynchronous control message for an index zone and enqueue it for - * processing. - * - * @param message The message to send - * @param zone The zone number of the zone to receive the message - * @param index The index responsible for handling the message - * - * @return UDS_SUCCESS or an error code - **/ -int __must_check launch_zone_message(struct uds_zone_message message, - unsigned int zone, - struct uds_index *index); - -/** - * Enqueue a request for the next stage of the pipeline. If there is more than - * one possible queue for a stage, this function uses the request to decide - * which queue should handle it. - * - * @param request The request to enqueue - * @param next_stage The next stage of the pipeline to process the request - **/ -void enqueue_request(struct uds_request *request, - enum request_stage next_stage); - -/** - * A method to restart delayed requests. - * - * @param request The request to restart - **/ -void restart_request(struct uds_request *request); - -/** - * Set the function pointer which is used to restart requests. - * This is needed by albserver code and is used as a test hook by the unit - * tests. - * - * @param restarter The function to call to restart requests. - **/ -void set_request_restarter(request_restarter_t restarter); - -/** - * Enter the callback stage of processing for a request, notifying the waiting - * thread if the request is synchronous, freeing the request if it is an - * asynchronous control message, or placing it on the callback queue if it is - * an asynchronous client request. - * - * @param request the request which has completed execution - **/ -void enter_callback_stage(struct uds_request *request); - -/** - * Update the context statistics to reflect the successful completion of a - * client request. - * - * @param request a client request that has successfully completed execution - **/ -void update_request_context_stats(struct uds_request *request); - -/** - * Compute the cache_probe_type value reflecting the request and page type. - * - * @param request The request being processed, or NULL - * @param is_index_page Whether the cache probe will be for an index page - * - * @return the cache probe type enumeration - **/ -static INLINE enum cache_probe_type -cache_probe_type(struct uds_request *request, bool is_index_page) -{ - if ((request != NULL) && request->requeued) { - return is_index_page ? CACHE_PROBE_INDEX_RETRY : - CACHE_PROBE_RECORD_RETRY; - } else { - return is_index_page ? CACHE_PROBE_INDEX_FIRST : - CACHE_PROBE_RECORD_FIRST; - } -} -#endif /* REQUEST_H */ diff --git a/uds/searchList.c b/uds/searchList.c deleted file mode 100644 index b0f18dba..00000000 --- a/uds/searchList.c +++ /dev/null @@ -1,115 +0,0 @@ -/* - * Copyright Red Hat - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. 
- * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/uds-releases/krusty/src/uds/searchList.c#15 $ - */ - -#include "searchList.h" - -#include "errors.h" -#include "logger.h" -#include "memoryAlloc.h" - -/**********************************************************************/ -int make_search_list(unsigned int capacity, struct search_list **list_ptr) -{ - struct search_list *list; - unsigned int bytes; - uint8_t i; - int result; - if (capacity == 0) { - return uds_log_error_strerror(UDS_INVALID_ARGUMENT, - "search list must have entries"); - } - if (capacity > UINT8_MAX) { - return uds_log_error_strerror(UDS_INVALID_ARGUMENT, - "search list capacity must fit in 8 bits"); - } - - // We need three temporary entry arrays for purge_search_list(). - // Allocate them contiguously with the main array. - bytes = sizeof(struct search_list) + (4 * capacity * sizeof(uint8_t)); - result = uds_allocate_cache_aligned(bytes, "search list", &list); - if (result != UDS_SUCCESS) { - return result; - } - - list->capacity = capacity; - list->first_dead_entry = 0; - - // Fill in the indexes of the chapter index cache entries. These will - // be only ever be permuted as the search list is used. - for (i = 0; i < capacity; i++) { - list->entries[i] = i; - } - - *list_ptr = list; - return UDS_SUCCESS; -} - -/**********************************************************************/ -void purge_search_list(struct search_list *search_list, - const struct cached_chapter_index chapters[], - uint64_t oldest_virtual_chapter) -{ - uint8_t *entries, *alive, *skipped, *dead; - unsigned int next_alive, next_skipped, next_dead; - int i; - - if (search_list->first_dead_entry == 0) { - // There are no live entries in the list to purge. - return; - } - - /* - * Partition the previously-alive entries in the list into three - * temporary lists, keeping the current LRU search order within each - * list. The element array was allocated with enough space for all four - * lists. - */ - entries = &search_list->entries[0]; - alive = &entries[search_list->capacity]; - skipped = &alive[search_list->capacity]; - dead = &skipped[search_list->capacity]; - next_alive = next_skipped = next_dead = 0; - - for (i = 0; i < search_list->first_dead_entry; i++) { - uint8_t entry = entries[i]; - const struct cached_chapter_index *chapter = &chapters[entry]; - if ((chapter->virtual_chapter < oldest_virtual_chapter) || - (chapter->virtual_chapter == UINT64_MAX)) { - dead[next_dead++] = entry; - } else if (chapter->skip_search) { - skipped[next_skipped++] = entry; - } else { - alive[next_alive++] = entry; - } - } - - // Copy the temporary lists back to the search list so we wind up with - // [ alive, alive, skippable, new-dead, new-dead, old-dead, old-dead ] - memcpy(entries, alive, next_alive); - entries += next_alive; - - memcpy(entries, skipped, next_skipped); - entries += next_skipped; - - memcpy(entries, dead, next_dead); - // The first dead entry is now the start of the copied dead list. 
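	/*
	 * Editor's illustration (hypothetical trace, not from the original
	 * file): if the live prefix was [ 3, 1, 4, 0 ], where entries 1 and 0
	 * now reference chapters older than oldest_virtual_chapter and entry 4
	 * has skip_search set, then alive == [ 3 ], skipped == [ 4 ], and
	 * dead == [ 1, 0 ]. After the three copies the list reads
	 * [ 3, 4, 1, 0, ... ] and first_dead_entry is set to 1 + 1 == 2 below.
	 */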
- search_list->first_dead_entry = (next_alive + next_skipped); -} diff --git a/uds/searchList.h b/uds/searchList.h deleted file mode 100644 index 8aa48b72..00000000 --- a/uds/searchList.h +++ /dev/null @@ -1,213 +0,0 @@ -/* - * Copyright Red Hat - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/uds-releases/krusty/src/uds/searchList.h#9 $ - */ - -#ifndef SEARCH_LIST_H -#define SEARCH_LIST_H - -#include "cachedChapterIndex.h" -#include "compiler.h" -#include "stringUtils.h" -#include "typeDefs.h" - -/** - * A search_list represents the permutations of the sparse chapter index cache - * entry array. Those permutations express an ordering on the chapter indexes, - * from most recently accessed to least recently accessed, which is the order - * in which the indexes should be searched and the reverse order in which they - * should be evicted from the cache (LRU cache replacement policy). - * - * Cache entries that are dead (virtual_chapter == UINT64_MAX) are kept as a - * suffix of the list, avoiding the need to even iterate over them to search, - * and ensuring that dead entries are replaced before any live entries are - * evicted. - * - * The search list is intended to be instantated for each zone thread, - * avoiding any need for synchronization. The structure is allocated on a - * cache boundary to avoid false sharing of memory cache lines between zone - * threads. - **/ -struct search_list { - /** The number of cached chapter indexes and search list entries */ - uint8_t capacity; - - /** The index in the entries array of the first dead cache entry */ - uint8_t first_dead_entry; - - /** The chapter array indexes representing the chapter search order */ - uint8_t entries[]; -}; - -/** - * search_list_iterator captures the fields needed to iterate over the live - * entries in a search list and return the struct cached_chapter_index pointers - * that the search code actually wants to deal with. - **/ -struct search_list_iterator { - /** The search list defining the chapter search iteration order */ - struct search_list *list; - - /** The index of the next entry to return from the search list */ - unsigned int next_entry; - - /** The cached chapters that are referenced by the search list */ - struct cached_chapter_index *chapters; -}; - -/** - * Allocate and initialize a new chapter cache search list with the same - * capacity as the cache. The index of each entry in the cache will appear - * exactly once in the array. All the chapters in the cache are assumed to be - * initially dead, so first_dead_entry will be zero and no chapters will be - * returned when the search list is iterated. 
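- *
- * For example (a small illustration, assuming a capacity of 3): the freshly
- * made list is entries = [ 0, 1, 2 ] with first_dead_entry = 0, so iterating
- * it yields no chapters; the live prefix only grows as chapters are cached
- * and rotated to the front of the list.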
- * - * @param [in] capacity the number of entries in the search list - * @param [out] list_ptr a pointer in which to return the new search list - **/ -int __must_check make_search_list(unsigned int capacity, - struct search_list **list_ptr); - -/** - * Copy the contents of one search list to another. - * - * @param source the list to copy - * @param target the list to replace - **/ -static INLINE void copy_search_list(const struct search_list *source, - struct search_list *target) -{ - *target = *source; - memcpy(target->entries, source->entries, source->capacity); -} - -/** - * Prepare to iterate over the live cache entries a search list. - * - * @param list the list defining the live chapters and the search order - * @param chapters the chapter index entries to return from get_next_chapter() - * - * @return an iterator positioned at the start of the search list - **/ -static INLINE struct search_list_iterator -iterate_search_list(struct search_list *list, - struct cached_chapter_index chapters[]) -{ - struct search_list_iterator iterator = { - .list = list, - .next_entry = 0, - .chapters = chapters, - }; - return iterator; -} - -/** - * Check if the search list iterator has another entry to return. - * - * @param iterator the search list iterator - * - * @return true if get_next_chapter() may be called - **/ -static INLINE bool -has_next_chapter(const struct search_list_iterator *iterator) -{ - return (iterator->next_entry < iterator->list->first_dead_entry); -} - -/** - * Return a pointer to the next live chapter in the search list iteration and - * advance the iterator. This must only be called when has_next_chapter() - * returns true. - * - * @param iterator the search list iterator - * - * @return a pointer to the next live chapter index in the search list order - **/ -static INLINE struct cached_chapter_index * -get_next_chapter(struct search_list_iterator *iterator) -{ - return &iterator->chapters[iterator->list - ->entries[iterator->next_entry++]]; -} - -/** - * Rotate the pointers in a prefix of a search list downwards by one item, - * pushing elements deeper into the list and moving a new chapter to the start - * of the search list. This is the "make most recent" operation on the search - * list. - * - * If the search list provided is [ 0 1 2 3 4 ] and the prefix - * length is 4, then 3 is being moved to the front. - * The search list after the call will be [ 3 0 1 2 4 ] and the - * function will return 3. - * - * @param search_list the chapter index search list to rotate - * @param prefix_length the length of the prefix of the list to rotate - * - * @return the array index of the chapter cache entry that is now at the front - * of the search list - **/ -static INLINE uint8_t rotate_search_list(struct search_list *search_list, - uint8_t prefix_length) -{ - // Grab the value of the last entry in the list prefix. - uint8_t most_recent = search_list->entries[prefix_length - 1]; - - if (prefix_length > 1) { - // Push the first N-1 entries down by one entry, overwriting - // the entry we just grabbed. - memmove(&search_list->entries[1], - &search_list->entries[0], - prefix_length - 1); - - // We now have a hole at the front of the list in which we can - // place the rotated entry. - search_list->entries[0] = most_recent; - } - - // This function is also used to move a dead chapter to the front of - // the list, in which case the suffix of dead chapters was pushed down - // too. 
- if (search_list->first_dead_entry < prefix_length) { - search_list->first_dead_entry += 1; - } - - return most_recent; -} - -/** - * Purge invalid cache entries, marking them as dead and moving them to the - * end of the search list, then push any chapters that have skip_search set - * down so they follow all the remaining live, valid chapters in the search - * list. This effectively sorts the search list into three regions--active, - * skippable, and dead--while maintaining the LRU ordering that already - * existed (a stable sort). - * - * This operation must only be called during the critical section in - * update_sparse_cache() since it effectively changes cache membership. - * - * @param search_list the chapter index search list to purge - * @param chapters the chapter index cache entries - * @param oldest_virtual_chapter the oldest virtual chapter - **/ -void purge_search_list(struct search_list *search_list, - const struct cached_chapter_index chapters[], - uint64_t oldest_virtual_chapter); - -#endif /* SEARCH_LIST_H */ diff --git a/uds/sparseCache.c b/uds/sparseCache.c deleted file mode 100644 index 1744bf27..00000000 --- a/uds/sparseCache.c +++ /dev/null @@ -1,584 +0,0 @@ -/* - * Copyright Red Hat - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/uds-releases/krusty/src/uds/sparseCache.c#36 $ - */ - -/** - * The sparse chapter index cache is implemented as a simple array of cache - * entries. Since the cache is small (seven chapters by default), searching - * for a specific virtual chapter is implemented as a linear search. The cache - * replacement policy is least-recently-used (LRU). Again, size of the cache - * allows the LRU order to be maintained by shifting entries in an array list. - * - * The most important property of this cache is the absence of synchronization - * for read operations. Safe concurrent access to the cache by the zone - * threads is controlled by the triage queue and the barrier requests it - * issues to the zone queues. The set of cached chapters does not and must not - * change between the carefully coordinated calls to update_sparse_cache() from - * the zone threads. - * - * The critical invariant for that coordination is the cache membership must - * not change between those updates; the calls to sparse_cache_contains() from - * the zone threads must all receive the same results for any virtual chapter - * number. To ensure that critical invariant, state changes such as "that - * virtual chapter is no longer in the volume" and "skip searching that - * chapter because it has had too many cache misses" are represented - * separately from the cache membership information (the virtual chapter - * number). 
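- *
- * (A hypothetical illustration of that invariant: if zone 0 observes
- * sparse_cache_contains(cache, 41, 0) == true at any point between two
- * calls to update_sparse_cache(), then every other zone must observe the
- * same answer for chapter 41 over that whole interval, even if chapter 41
- * has fallen off the end of the volume or has had searching disabled in
- * the meantime.)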
- * - * As a result of this invariant, we have the guarantee that every zone thread - * will call update_sparse_cache() once and exactly once to request a chapter - * that is not in the cache, and the serialization of the barrier requests - * from the triage queue ensures they will all request the same chapter - * number. This means the only synchronization we need can be provided by a - * pair of thread barriers used only in the update_sparse_cache() call, - * providing a critical section where a single zone thread can drive the cache - * update while all the other zone threads are known to be blocked, waiting in - * the second barrier. Outside that critical section, all the zone threads - * implicitly hold a shared lock. Inside it, the "captain" (the thread that - * was uniquely flagged when passing through the first barrier) holds an - * exclusive lock. No other threads may access or modify the cache, except for - * accessing cache statistics and similar queries. - * - * Cache statistics must only be modified by a single thread, conventionally - * the zone zero thread. All fields that might be frequently updated by that - * thread are kept in separate cache-aligned structures so they will not cause - * cache contention via "false sharing" with the fields that are frequently - * accessed by all of the zone threads. - * - * LRU order is kept independently by each zone thread, and each zone uses its - * own list for searching and cache membership queries. The zone zero list is - * used to decide which chapter to evict when the cache is updated, and its - * search list is copied to the other threads at that time. - * - * The virtual chapter number field of the cache entry is the single field - * indicating whether a chapter is a member of the cache or not. The value - * UINT64_MAX is used to represent a null, undefined, or wildcard - * chapter number. When present in the virtual chapter number field - * cached_chapter_index, it indicates that the cache entry is dead, and all - * the other fields of that entry (other than immutable pointers to cache - * memory) are undefined and irrelevant. Any cache entry that is not marked as - * dead is fully defined and a member of the cache--sparse_cache_contains() - * must always return true for any virtual chapter number that appears in any - * of the cache entries. - * - * A chapter index that is a member of the cache may be marked for different - * treatment (disabling search) between calls to update_sparse_cache() in two - * different ways. When a chapter falls off the end of the volume, its virtual - * chapter number will be less that the oldest virtual chapter number. Since - * that chapter is no longer part of the volume, there's no point in continuing - * to search that chapter index. Once invalidated, that virtual chapter will - * still be considered a member of the cache, but it will no longer be searched - * for matching chunk names. - * - * The second mechanism for disabling search is the heuristic based on keeping - * track of the number of consecutive search misses in a given chapter index. - * Once that count exceeds a threshold, the skip_search flag will be set to - * true, causing the chapter to be skipped in the fallback search of the - * entire cache, but still allowing it to be found when searching for a hook - * in that specific chapter. Finding a hook will clear the skip_search flag, - * once again allowing the non-hook searches to use the cache entry. 
Again, - * regardless of the state of the skip_search flag, the virtual chapter must - * still considered to be a member of the cache for sparse_cache_contains(). - * - * Barrier requests and the sparse chapter index cache are also described in - * - * https://intranet.permabit.com/wiki/Chapter_Index_Cache_supports_concurrent_access - * - * and in a message to the albireo mailing list on 5/28/2011 titled "true - * barriers with a hook resolution queue". - **/ - -#include "sparseCache.h" - -#include "cachedChapterIndex.h" -#include "chapterIndex.h" -#include "common.h" -#include "index.h" -#include "logger.h" -#include "memoryAlloc.h" -#include "permassert.h" -#include "searchList.h" -#include "uds-threads.h" -#include "zone.h" - -enum { - /** The number of consecutive search misses that will disable searching - */ - SKIP_SEARCH_THRESHOLD = 20000, - - /** a named constant to use when identifying zone zero */ - ZONE_ZERO = 0 -}; - -/** - * These counter values are essentially fields of the sparse_cache, but are - * segregated into this structure because they are frequently modified. We - * group them and align them to keep them on different cache lines from the - * cache fields that are accessed far more often than they are updated. - **/ -struct sparse_cache_counters { - /** the total number of virtual chapter probes that succeeded */ - uint64_t chapter_hits; - - /** the total number of virtual chapter probes that failed */ - uint64_t chapter_misses; - - /** the total number of cache searches that found a possible match */ - uint64_t search_hits; - - /** the total number of cache searches that found no matches */ - uint64_t search_misses; - - /** the number of cache entries that fell off the end of the volume */ - uint64_t invalidations; - - /** the number of cache entries that were evicted while still valid */ - uint64_t evictions; -} __attribute__((aligned(CACHE_LINE_BYTES))); - -/** - * This is the private structure definition of a sparse_cache. - **/ -struct sparse_cache { - /** the number of cache entries, which is the size of the chapters - * array */ - unsigned int capacity; - - /** the number of zone threads using the cache */ - unsigned int zone_count; - - /** the geometry governing the volume */ - const struct geometry *geometry; - - /** the number of search misses in zone zero that will disable - * searching */ - unsigned int skip_search_threshold; - - /** pointers to the cache-aligned chapter search order for each zone */ - struct search_list *search_lists[MAX_ZONES]; - - /** the thread barriers used to synchronize the zone threads for update - */ - struct barrier begin_cache_update; - struct barrier end_cache_update; - - /** frequently-updated counter fields (cache-aligned) */ - struct sparse_cache_counters counters; - - /** the counted array of chapter index cache entries (cache-aligned) */ - struct cached_chapter_index chapters[]; -}; - -/** - * Initialize a sparse chapter index cache. 
- * - * @param cache the sparse cache to initialize - * @param geometry the geometry governing the volume - * @param capacity the number of chapters the cache will hold - * @param zone_count the number of zone threads using the cache - * - * @return UDS_SUCCESS or an error code - **/ -static int __must_check initialize_sparse_cache(struct sparse_cache *cache, - const struct geometry *geometry, - unsigned int capacity, - unsigned int zone_count) -{ - unsigned int i; - int result; - - cache->geometry = geometry; - cache->capacity = capacity; - cache->zone_count = zone_count; - - // Scale down the skip threshold by the number of zones since we count - // the chapter search misses only in zone zero. - cache->skip_search_threshold = (SKIP_SEARCH_THRESHOLD / zone_count); - - result = uds_initialize_barrier(&cache->begin_cache_update, zone_count); - if (result != UDS_SUCCESS) { - return result; - } - result = uds_initialize_barrier(&cache->end_cache_update, zone_count); - if (result != UDS_SUCCESS) { - return result; - } - for (i = 0; i < capacity; i++) { - result = initialize_cached_chapter_index(&cache->chapters[i], - geometry); - if (result != UDS_SUCCESS) { - return result; - } - } - - // Allocate each zone's independent LRU order. - for (i = 0; i < zone_count; i++) { - result = make_search_list(capacity, &cache->search_lists[i]); - if (result != UDS_SUCCESS) { - return result; - } - } - return UDS_SUCCESS; -} - -/**********************************************************************/ -int make_sparse_cache(const struct geometry *geometry, - unsigned int capacity, - unsigned int zone_count, - struct sparse_cache **cache_ptr) -{ - unsigned int bytes = - (sizeof(struct sparse_cache) + - (capacity * sizeof(struct cached_chapter_index))); - - struct sparse_cache *cache; - int result = uds_allocate_cache_aligned(bytes, "sparse cache", &cache); - if (result != UDS_SUCCESS) { - return result; - } - - result = - initialize_sparse_cache(cache, geometry, capacity, zone_count); - if (result != UDS_SUCCESS) { - free_sparse_cache(cache); - return result; - } - - *cache_ptr = cache; - return UDS_SUCCESS; -} - -/**********************************************************************/ -size_t get_sparse_cache_memory_size(const struct sparse_cache *cache) -{ - // Count the delta_index_page as cache memory, but ignore all other - // overhead. - size_t page_size = (sizeof(struct delta_index_page) + - cache->geometry->bytes_per_page); - size_t chapter_size = - (page_size * cache->geometry->index_pages_per_chapter); - return (cache->capacity * chapter_size); -} - -/** - * Update counters to reflect a chapter access hit and clear the skip_search - * flag on the chapter, if set. - * - * @param cache the cache to update - * @param chapter the cache entry to update - **/ -static void score_chapter_hit(struct sparse_cache *cache, - struct cached_chapter_index *chapter) -{ - cache->counters.chapter_hits += 1; - set_skip_search(chapter, false); -} - -/** - * Update counters to reflect a chapter access miss. - * - * @param cache the cache to update - **/ -static void score_chapter_miss(struct sparse_cache *cache) -{ - cache->counters.chapter_misses += 1; -} - -/** - * Check if the cache entry that is about to be replaced is already dead, and - * if it's not, add to tally of evicted or invalidated cache entries. 
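- *
- * In other words: a dead entry (virtual_chapter == UINT64_MAX) is counted
- * as neither; a live entry older than the zone's oldest virtual chapter is
- * counted as an invalidation; any other live entry is counted as an
- * eviction.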
- * - * @param zone the zone used to find the oldest chapter - * @param cache the cache to update - * @param chapter the cache entry about to be replaced - **/ -static void score_eviction(struct index_zone *zone, - struct sparse_cache *cache, - struct cached_chapter_index *chapter) -{ - if (chapter->virtual_chapter == UINT64_MAX) { - return; - } - if (chapter->virtual_chapter < zone->oldest_virtual_chapter) { - cache->counters.invalidations += 1; - } else { - cache->counters.evictions += 1; - } -} - -/** - * Update counters to reflect a cache search hit. This bumps the hit - * count, clears the miss count, and clears the skip_search flag. - * - * @param cache the cache to update - * @param chapter the cache entry to update - **/ -static void score_search_hit(struct sparse_cache *cache, - struct cached_chapter_index *chapter) -{ - cache->counters.search_hits += 1; - chapter->counters.search_hits += 1; - chapter->counters.consecutive_misses = 0; - set_skip_search(chapter, false); -} - -/** - * Update counters to reflect a cache search miss. This bumps the consecutive - * miss count, and if it goes over skip_search_threshold, sets the skip_search - * flag on the chapter. - * - * @param cache the cache to update - * @param chapter the cache entry to update - **/ -static void score_search_miss(struct sparse_cache *cache, - struct cached_chapter_index *chapter) -{ - cache->counters.search_misses += 1; - chapter->counters.search_misses += 1; - chapter->counters.consecutive_misses += 1; - if (chapter->counters.consecutive_misses > - cache->skip_search_threshold) { - set_skip_search(chapter, true); - } -} - -/**********************************************************************/ -void free_sparse_cache(struct sparse_cache *cache) -{ - unsigned int i; - if (cache == NULL) { - return; - } - - for (i = 0; i < cache->zone_count; i++) { - UDS_FREE(UDS_FORGET(cache->search_lists[i])); - } - - for (i = 0; i < cache->capacity; i++) { - struct cached_chapter_index *chapter = &cache->chapters[i]; - destroy_cached_chapter_index(chapter); - } - - uds_destroy_barrier(&cache->begin_cache_update); - uds_destroy_barrier(&cache->end_cache_update); - UDS_FREE(cache); -} - - -/**********************************************************************/ -bool sparse_cache_contains(struct sparse_cache *cache, - uint64_t virtual_chapter, - unsigned int zone_number) -{ - /* - * The correctness of the barriers depends on the invariant that - * between calls to update_sparse_cache(), the answers this function - * returns must never vary--the result for a given chapter must be - * identical across zones. That invariant must be maintained even if - * the chapter falls off the end of the volume, or if searching it is - * disabled because of too many search misses. - */ - - // Get the chapter search order for this zone thread. - struct search_list_iterator iterator = - iterate_search_list(cache->search_lists[zone_number], - cache->chapters); - while (has_next_chapter(&iterator)) { - struct cached_chapter_index *chapter = - get_next_chapter(&iterator); - if (virtual_chapter == chapter->virtual_chapter) { - if (zone_number == ZONE_ZERO) { - score_chapter_hit(cache, chapter); - } - - // Move the chapter to the front of the search list. - rotate_search_list(iterator.list, iterator.next_entry); - return true; - } - } - - // The specified virtual chapter isn't cached. 
- if (zone_number == ZONE_ZERO) { - score_chapter_miss(cache); - } - return false; -} - -/**********************************************************************/ -int update_sparse_cache(struct index_zone *zone, uint64_t virtual_chapter) -{ - int result = UDS_SUCCESS; - const struct uds_index *index = zone->index; - struct sparse_cache *cache = index->volume->sparse_cache; - - // If the chapter is already in the cache, we don't need to do a thing - // except update the search list order, which this check does. - if (sparse_cache_contains(cache, virtual_chapter, zone->id)) { - return UDS_SUCCESS; - } - - // Wait for every zone thread to have reached its corresponding barrier - // request and invoked this function before starting to modify the - // cache. - uds_enter_barrier(&cache->begin_cache_update, NULL); - - /* - * This is the start of the critical section: the zone zero thread is - * captain, effectively holding an exclusive lock on the sparse cache. - * All the other zone threads must do nothing between the two barriers. - * They will wait at the end_cache_update barrier for the captain to - * finish the update. - */ - - if (zone->id == ZONE_ZERO) { - unsigned int z; - // Purge invalid chapters from the LRU search list. - struct search_list *zone_zero_list = - cache->search_lists[ZONE_ZERO]; - purge_search_list(zone_zero_list, - cache->chapters, - zone->oldest_virtual_chapter); - - // First check that the desired chapter is still in the volume. - // If it's not, the hook fell out of the index and there's - // nothing to do for it. - if (virtual_chapter >= index->oldest_virtual_chapter) { - // Evict the least recently used live chapter, or - // replace a dead cache entry, all by rotating the the - // last list entry to the front. - struct cached_chapter_index *victim = - &cache->chapters[rotate_search_list(zone_zero_list, - cache->capacity)]; - - // Check if the victim is already dead, and if it's - // not, add to the tally of evicted or invalidated - // cache entries. - score_eviction(zone, cache, victim); - - // Read the index page bytes and initialize the page - // array. - result = cache_chapter_index(victim, virtual_chapter, - index->volume); - } - - // Copy the new search list state to all the other zone threads - // so they'll get the result of pruning and see the new - // chapter. - for (z = 1; z < cache->zone_count; z++) { - copy_search_list(zone_zero_list, - cache->search_lists[z]); - } - } - - // This is the end of the critical section. All cache invariants must - // have been restored--it will be shared/read-only again beyond the - // barrier. - - uds_enter_barrier(&cache->end_cache_update, NULL); - return result; -} - -/**********************************************************************/ -void invalidate_sparse_cache(struct sparse_cache *cache) -{ - unsigned int i; - if (cache == NULL) { - return; - } - for (i = 0; i < cache->capacity; i++) { - struct cached_chapter_index *chapter = &cache->chapters[i]; - chapter->virtual_chapter = UINT64_MAX; - release_cached_chapter_index(chapter); - } -} - -/**********************************************************************/ -int search_sparse_cache(struct index_zone *zone, - const struct uds_chunk_name *name, - uint64_t *virtual_chapter_ptr, - int *record_page_ptr) -{ - struct volume *volume = zone->index->volume; - struct sparse_cache *cache = volume->sparse_cache; - unsigned int zone_number = zone->id; - // If the caller did not specify a virtual chapter, search the entire - // cache. 
- bool search_all = (*virtual_chapter_ptr == UINT64_MAX); - unsigned int chapters_searched = 0; - - // Get the chapter search order for this zone thread, searching the - // chapters from most recently hit to least recently hit. - struct search_list_iterator iterator = - iterate_search_list(cache->search_lists[zone_number], - cache->chapters); - while (has_next_chapter(&iterator)) { - int result; - struct cached_chapter_index *chapter = - get_next_chapter(&iterator); - - // Skip chapters no longer cached, or that have too many search - // misses. - if (should_skip_chapter_index(zone, chapter, - *virtual_chapter_ptr)) { - continue; - } - - result = search_cached_chapter_index(chapter, - cache->geometry, - volume->index_page_map, - name, - record_page_ptr); - if (result != UDS_SUCCESS) { - return result; - } - chapters_searched += 1; - - // Did we find an index entry for the name? - if (*record_page_ptr != NO_CHAPTER_INDEX_ENTRY) { - if (zone_number == ZONE_ZERO) { - score_search_hit(cache, chapter); - } - - // Move the chapter to the front of the search list. - rotate_search_list(iterator.list, iterator.next_entry); - - // Return a matching entry as soon as it is found. It - // might be a false collision that has a true match in - // another chapter, but that's a very rare case and not - // worth the extra search cost or complexity. - *virtual_chapter_ptr = chapter->virtual_chapter; - return UDS_SUCCESS; - } - - if (zone_number == ZONE_ZERO) { - score_search_miss(cache, chapter); - } - - if (!search_all) { - // We just searched the virtual chapter the caller - // specified and there was no match, so we're done. - break; - } - } - - // The name was not found in the cache. - *record_page_ptr = NO_CHAPTER_INDEX_ENTRY; - return UDS_SUCCESS; -} diff --git a/uds/stringLinuxKernel.c b/uds/stringLinuxKernel.c deleted file mode 100644 index 85a10e37..00000000 --- a/uds/stringLinuxKernel.c +++ /dev/null @@ -1,77 +0,0 @@ -/* - * Copyright Red Hat - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/uds-releases/krusty/kernelLinux/uds/stringLinuxKernel.c#8 $ - */ - -#include - -#include "errors.h" -#include "logger.h" -#include "stringUtils.h" - -/**********************************************************************/ -int uds_string_to_signed_long(const char *nptr, long *num) -{ - while (*nptr == ' ') { - nptr++; - } - return kstrtol(nptr, 10, num) ? UDS_INVALID_ARGUMENT : UDS_SUCCESS; -} - -/**********************************************************************/ -int uds_string_to_unsigned_long(const char *nptr, unsigned long *num) -{ - while (*nptr == ' ') { - nptr++; - } - if (*nptr == '+') { - nptr++; - } - return kstrtoul(nptr, 10, num) ? 
UDS_INVALID_ARGUMENT : UDS_SUCCESS; -} - -/**********************************************************************/ -char *uds_next_token(char *str, const char *delims, char **state) -{ - char *ep, *sp = str ? str : *state; - while (*sp && strchr(delims, *sp)) { - ++sp; - } - if (!*sp) { - return NULL; - } - ep = sp; - while (*ep && !strchr(delims, *ep)) { - ++ep; - } - if (*ep) { - *ep++ = '\0'; - } - *state = ep; - return sp; -} - -/**********************************************************************/ -int uds_parse_uint64(const char *str, uint64_t *num) -{ - unsigned long value = *num; - int result = uds_string_to_unsigned_long(str, &value); - *num = value; - return result; -} diff --git a/uds/stringUtils.c b/uds/stringUtils.c deleted file mode 100644 index 7b7140b7..00000000 --- a/uds/stringUtils.c +++ /dev/null @@ -1,160 +0,0 @@ -/* - * Copyright Red Hat - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/uds-releases/krusty/src/uds/stringUtils.c#16 $ - */ - -#include "stringUtils.h" - -#include "errors.h" -#include "logger.h" -#include "memoryAlloc.h" -#include "permassert.h" -#include "uds.h" - -/**********************************************************************/ -int uds_alloc_sprintf(const char *what, char **strp, const char *fmt, ...) -{ - va_list args; - int result; - int count; - if (strp == NULL) { - return UDS_INVALID_ARGUMENT; - } - // We want the memory allocation to use our own UDS_ALLOCATE/UDS_FREE - // wrappers. - va_start(args, fmt); - count = vsnprintf(NULL, 0, fmt, args) + 1; - va_end(args); - result = UDS_ALLOCATE(count, char, what, strp); - if (result == UDS_SUCCESS) { - va_start(args, fmt); - vsnprintf(*strp, count, fmt, args); - va_end(args); - } - if ((result != UDS_SUCCESS) && (what != NULL)) { - uds_log_error("cannot allocate %s", what); - } - return result; -} - -/**********************************************************************/ -int uds_wrap_vsnprintf(const char *what, - char *buf, - size_t buf_size, - int error, - const char *fmt, - va_list ap, - size_t *needed) -{ - int n; - if (buf == NULL) { - static char nobuf[1]; - buf = nobuf; - buf_size = 0; - } - n = vsnprintf(buf, buf_size, fmt, ap); - if (n < 0) { - return uds_log_error_strerror(UDS_UNEXPECTED_RESULT, - "%s: vsnprintf failed", what); - } - if (needed) { - *needed = n; - } - if (((size_t) n >= buf_size) && (buf != NULL) && - (error != UDS_SUCCESS)) { - return uds_log_error_strerror(error, - "%s: string too long", what); - } - return UDS_SUCCESS; -} - -/**********************************************************************/ -int uds_fixed_sprintf(const char *what, - char *buf, - size_t buf_size, - int error, - const char *fmt, - ...) 
-{ - va_list args; - int result; - if (buf == NULL) { - return UDS_INVALID_ARGUMENT; - } - va_start(args, fmt); - result = uds_wrap_vsnprintf(what, buf, buf_size, error, fmt, args, - NULL); - va_end(args); - return result; -} - -/**********************************************************************/ -char *uds_v_append_to_buffer(char *buffer, char *buf_end, const char *fmt, - va_list args) -{ - size_t n = vsnprintf(buffer, buf_end - buffer, fmt, args); - if (n >= (size_t)(buf_end - buffer)) { - buffer = buf_end; - } else { - buffer += n; - } - return buffer; -} - -/**********************************************************************/ -char *uds_append_to_buffer(char *buffer, char *buf_end, const char *fmt, ...) -{ - va_list ap; - char *pos; - - va_start(ap, fmt); - pos = uds_v_append_to_buffer(buffer, buf_end, fmt, ap); - va_end(ap); - return pos; -} - -/**********************************************************************/ -int uds_string_to_signed_int(const char *nptr, int *num) -{ - long value; - int result = uds_string_to_signed_long(nptr, &value); - if (result != UDS_SUCCESS) { - return result; - } - if ((value < INT_MIN) || (value > INT_MAX)) { - return ERANGE; - } - *num = (int) value; - return UDS_SUCCESS; -} - -/**********************************************************************/ -int uds_string_to_unsigned_int(const char *nptr, unsigned int *num) -{ - unsigned long value; - int result = uds_string_to_unsigned_long(nptr, &value); - if (result != UDS_SUCCESS) { - return result; - } - if (value > UINT_MAX) { - return ERANGE; - } - *num = (unsigned int) value; - return UDS_SUCCESS; -} diff --git a/uds/sysfs.h b/uds/sysfs.h deleted file mode 100644 index 304d0328..00000000 --- a/uds/sysfs.h +++ /dev/null @@ -1,39 +0,0 @@ -/* - * Copyright Red Hat - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/uds-releases/krusty/kernelLinux/uds/sysfs.h#4 $ - */ - -#ifndef SYSFS_H -#define SYSFS_H - -/** - * Called when the module is loaded to initialize the /sys/\ - * tree. - * - * @return 0 on success, or non-zero on error - **/ -int init_uds_sysfs(void); - -/** - * Called when the module is being unloaded to terminate the - * /sys/\ tree. - **/ -void put_uds_sysfs(void); - -#endif /* SYSFS_H */ diff --git a/uds/threadCondVarLinuxKernel.c b/uds/threadCondVarLinuxKernel.c deleted file mode 100644 index 49c484b2..00000000 --- a/uds/threadCondVarLinuxKernel.c +++ /dev/null @@ -1,76 +0,0 @@ -/* - * Copyright Red Hat - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. 
- * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/uds-releases/krusty/kernelLinux/uds/threadCondVarLinuxKernel.c#15 $ - */ - -#include "errors.h" -#include "timeUtils.h" -#include "uds-threads.h" - -/**********************************************************************/ -int uds_init_cond(struct cond_var *cv) -{ - cv->event_count = NULL; - return make_event_count(&cv->event_count); -} - -/**********************************************************************/ -int uds_signal_cond(struct cond_var *cv) -{ - event_count_broadcast(cv->event_count); - return UDS_SUCCESS; -} - -/**********************************************************************/ -int uds_broadcast_cond(struct cond_var *cv) -{ - event_count_broadcast(cv->event_count); - return UDS_SUCCESS; -} - -/**********************************************************************/ -int uds_wait_cond(struct cond_var *cv, struct mutex *mutex) -{ - event_token_t token = event_count_prepare(cv->event_count); - uds_unlock_mutex(mutex); - event_count_wait(cv->event_count, token, NULL); - uds_lock_mutex(mutex); - return UDS_SUCCESS; -} - -/**********************************************************************/ -int uds_timed_wait_cond(struct cond_var *cv, - struct mutex *mutex, - ktime_t timeout) -{ - bool happened; - event_token_t token = event_count_prepare(cv->event_count); - uds_unlock_mutex(mutex); - happened = event_count_wait(cv->event_count, token, &timeout); - uds_lock_mutex(mutex); - return happened ? UDS_SUCCESS : ETIMEDOUT; -} - -/**********************************************************************/ -int uds_destroy_cond(struct cond_var *cv) -{ - free_event_count(cv->event_count); - cv->event_count = NULL; - return UDS_SUCCESS; -} diff --git a/uds/threadDevice.c b/uds/threadDevice.c deleted file mode 100644 index f3201b83..00000000 --- a/uds/threadDevice.c +++ /dev/null @@ -1,58 +0,0 @@ -/* - * Copyright Red Hat - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/uds-releases/krusty/kernelLinux/uds/threadDevice.c#5 $ - */ - -#include "threadDevice.h" - -#include "threadRegistry.h" - -/* - * A registry of all threads temporarily associated with particular - * VDO devices. 
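- *
- * Registering a thread here makes a device ID number available to logging
- * code via uds_get_thread_device_id(); see uds_register_thread_device_id()
- * below.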
- */ -static struct thread_registry device_id_thread_registry; - -/**********************************************************************/ -void uds_register_thread_device_id(struct registered_thread *new_thread, - unsigned int *id_ptr) -{ - uds_register_thread(&device_id_thread_registry, new_thread, id_ptr); -} - -/**********************************************************************/ -void uds_unregister_thread_device_id(void) -{ - uds_unregister_thread(&device_id_thread_registry); -} - -/**********************************************************************/ -int uds_get_thread_device_id(void) -{ - const unsigned int *pointer = - uds_lookup_thread(&device_id_thread_registry); - - return pointer ? *pointer : -1; -} - -/**********************************************************************/ -void uds_initialize_thread_device_registry(void) -{ - uds_initialize_thread_registry(&device_id_thread_registry); -} diff --git a/uds/threadDevice.h b/uds/threadDevice.h deleted file mode 100644 index e8b0dc1f..00000000 --- a/uds/threadDevice.h +++ /dev/null @@ -1,61 +0,0 @@ -/* - * Copyright Red Hat - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/uds-releases/krusty/kernelLinux/uds/threadDevice.h#3 $ - */ - -#ifndef UDS_THREAD_DEVICE_H -#define UDS_THREAD_DEVICE_H - -#include "threadRegistry.h" - -/** - * Temporarily register the current thread as being associated with a - * VDO device id number, for logging purposes. - * - * Any such registered thread must later be unregistered via - * unregister_thread_device_id. - * - * The pointed-to ID number should be nonzero. - * - * @param new_thread registered_thread structure to use for the current thread - * @param id_ptr Location where the ID number is stored - **/ -void uds_register_thread_device_id(struct registered_thread *new_thread, - unsigned int *id_ptr); - -/** - * Cancel registration of the current thread as being associated with - * a VDO device or device ID number. - **/ -void uds_unregister_thread_device_id(void); - -/** - * Get the VDO device ID number temporarily associated with the - * current thread, if any. - * - * @return the device ID number, if any, or -1 - **/ -int uds_get_thread_device_id(void); - -/** - * Initialize the thread device-ID registry. - **/ -void uds_initialize_thread_device_registry(void); - -#endif /* UDS_THREAD_DEVICE_H */ diff --git a/uds/threadOnce.c b/uds/threadOnce.c deleted file mode 100644 index 4adbbc19..00000000 --- a/uds/threadOnce.c +++ /dev/null @@ -1,49 +0,0 @@ -/* - * Copyright Red Hat - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. 
- * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/uds-releases/krusty/src/uds/threadOnce.c#10 $ - */ - -#include "errors.h" -#include "uds-threads.h" - -enum { - ONCE_NOT_DONE = 0, - ONCE_IN_PROGRESS = 1, - ONCE_COMPLETE = 2, -}; - -/**********************************************************************/ -void perform_once(once_state_t *once, void (*function)(void)) -{ - for (;;) { - switch (atomic_cmpxchg(once, ONCE_NOT_DONE, ONCE_IN_PROGRESS)) { - case ONCE_NOT_DONE: - function(); - atomic_set_release(once, ONCE_COMPLETE); - return; - case ONCE_IN_PROGRESS: - uds_yield_scheduler(); - break; - case ONCE_COMPLETE: - return; - default: - return; - } - } -} diff --git a/uds/threadOnce.h b/uds/threadOnce.h deleted file mode 100644 index 7ed0d691..00000000 --- a/uds/threadOnce.h +++ /dev/null @@ -1,46 +0,0 @@ -/* - * Copyright Red Hat - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/uds-releases/krusty/src/uds/threadOnce.h#7 $ - */ - -#ifndef THREAD_ONCE_H -#define THREAD_ONCE_H - -#include - -#define ONCE_STATE_INITIALIZER ATOMIC_INIT(0) - -typedef atomic_t once_state_t; - -/** - * Thread safe once only initialization. - * - * @param once_state pointer to object to record that initialization - * has been performed - * @param init_function called if once_state does not indicate - * initialization has been performed - * - * @note Generally the following declaration of once_state is performed in - * at file scope: - * - * static once_state_t once_state = ONCE_STATE_INITIALIZER; - **/ -void perform_once(once_state_t *once_state, void (*init_function) (void)); - -#endif /* THREAD_ONCE_H */ diff --git a/uds/threadRegistry.h b/uds/threadRegistry.h deleted file mode 100644 index 05420c24..00000000 --- a/uds/threadRegistry.h +++ /dev/null @@ -1,83 +0,0 @@ -/* - * Copyright Red Hat - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. 
- * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/uds-releases/krusty/kernelLinux/uds/threadRegistry.h#7 $ - */ - -#ifndef THREAD_REGISTRY_H -#define THREAD_REGISTRY_H - -#include -#include - -/* - * We don't expect this set to ever get really large, so a linked list - * is adequate. - */ - -struct thread_registry { - struct list_head links; - spinlock_t lock; -}; - -struct registered_thread { - struct list_head links; - const void *pointer; - struct task_struct *task; -}; - -/** - * Initialize a registry of threads and associated data pointers. - * - * @param registry The registry to initialize - **/ -void uds_initialize_thread_registry(struct thread_registry *registry); - -/** - * Register the current thread and associate it with a data pointer. - * - * This call will log messages if the thread is already registered. - * - * @param registry The thread registry - * @param new_thread registered_thread structure to use for the current thread - * @param pointer The value to associate with the current thread - **/ -void uds_register_thread(struct thread_registry *registry, - struct registered_thread *new_thread, - const void *pointer); - -/** - * Remove the registration for the current thread. - * - * A message may be logged if the thread was not registered. - * - * @param registry The thread registry - **/ -void uds_unregister_thread(struct thread_registry *registry); - -/** - * Fetch a pointer that may have been registered for the current - * thread. If the thread is not registered, a null pointer is returned. - * - * @param registry The thread registry - * - * @return the registered pointer, if any, or NULL - **/ -const void *uds_lookup_thread(struct thread_registry *registry); - -#endif /* THREAD_REGISTRY_H */ diff --git a/uds/timeUtils.c b/uds/timeUtils.c deleted file mode 100644 index ba178584..00000000 --- a/uds/timeUtils.c +++ /dev/null @@ -1,37 +0,0 @@ -/* - * Copyright Red Hat - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. 
- * - * $Id: //eng/uds-releases/krusty/src/uds/timeUtils.c#19 $ - */ - -#include "permassert.h" -#include "stringUtils.h" -#include "timeUtils.h" - -#include -#include // for getnstimeofday on Vivid - - -/**********************************************************************/ -int64_t current_time_us(void) -{ - return current_time_ns(CLOCK_REALTIME) / NSEC_PER_USEC; -} - - - diff --git a/uds/typeDefs.h b/uds/typeDefs.h deleted file mode 100644 index c65a5053..00000000 --- a/uds/typeDefs.h +++ /dev/null @@ -1,49 +0,0 @@ -/* - * Copyright Red Hat - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/uds-releases/krusty/src/uds/typeDefs.h#7 $ - */ - -#ifndef TYPE_DEFS_H -#define TYPE_DEFS_H - -/* - * General system type definitions. - */ - -#include -#include -#include - -typedef unsigned char byte; - -#define CHAR_BIT 8 - -#define INT64_MAX (9223372036854775807L) -#define UCHAR_MAX ((unsigned char)~0ul) -#define UINT8_MAX ((uint8_t)~0ul) -#define UINT16_MAX ((uint16_t)~0ul) -#define UINT64_MAX ((uint64_t)~0ul) - -// Some recent versions of define this for us -#ifndef SIZE_MAX -#define SIZE_MAX ((size_t)~0ul) -#endif /* SIZE_MAX */ - - -#endif /* TYPE_DEFS_H */ diff --git a/uds/uds-platform.h b/uds/uds-platform.h deleted file mode 100644 index 4ea7c39f..00000000 --- a/uds/uds-platform.h +++ /dev/null @@ -1,32 +0,0 @@ -/* - * Copyright Red Hat - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/uds-releases/krusty/src/uds/uds-platform.h#2 $ - */ - -/** - * @file - * @brief Platform definitions for albireo - **/ -#ifndef UDS_PLATFORM_H -#define UDS_PLATFORM_H - - -#include - -#endif /* UDS_PLATFORM_H */ diff --git a/uds/udsMain.c b/uds/udsMain.c deleted file mode 100644 index ad353026..00000000 --- a/uds/udsMain.c +++ /dev/null @@ -1,385 +0,0 @@ -/* - * Copyright Red Hat - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. 
- * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/uds-releases/krusty/src/uds/udsMain.c#38 $ - */ - -#include "uds.h" - -#include "config.h" -#include "geometry.h" -#include "index.h" -#include "indexLayout.h" -#include "indexSession.h" -#include "loadType.h" -#include "logger.h" -#include "memoryAlloc.h" - -/* Memory size constants */ -const uds_memory_config_size_t UDS_MEMORY_CONFIG_MAX = 1024; -const uds_memory_config_size_t UDS_MEMORY_CONFIG_256MB = 0xffffff00; // -256 -const uds_memory_config_size_t UDS_MEMORY_CONFIG_512MB = 0xfffffe00; // -512 -const uds_memory_config_size_t UDS_MEMORY_CONFIG_768MB = 0xfffffd00; // -768 - -/* Memory size constants for volumes that have one less chapter */ -const uds_memory_config_size_t UDS_MEMORY_CONFIG_REDUCED = 0x1000; -const uds_memory_config_size_t UDS_MEMORY_CONFIG_REDUCED_MAX = 1024 | 0x1000; -const uds_memory_config_size_t UDS_MEMORY_CONFIG_REDUCED_256MB = - 0xfffffb00; // -1280 -const uds_memory_config_size_t UDS_MEMORY_CONFIG_REDUCED_512MB = - 0xfffffa00; // -1536 -const uds_memory_config_size_t UDS_MEMORY_CONFIG_REDUCED_768MB = - 0xfffff900; // -1792 - -/* - * =========================================================================== - * UDS system management - * =========================================================================== - */ - -/**********************************************************************/ -int uds_initialize_configuration(struct uds_configuration **user_config, - uds_memory_config_size_t mem_gb) -{ - unsigned int chapters_per_volume, record_pages_per_chapter; - int result; - if (user_config == NULL) { - uds_log_error("missing configuration pointer"); - return -EINVAL; - } - - /* Set the configuration parameters that change with memory size. If - * you change these values, you should also: - * - * Change Configuration_x1, which tests these values and expects to see - * them - * - * Bump the index configuration version number. This bump ensures that - * the test infrastructure will be forced to test the new - * configuration. 
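- *
- * To summarize the mapping below: the sub-1GB sizes (256MB, 512MB, 768MB)
- * keep the default chapter count and use 1x, 2x, or 3x the small record
- * page count per chapter; whole-gigabyte sizes from 1 to
- * UDS_MEMORY_CONFIG_MAX keep the default record page count and scale the
- * chapter count by the number of gigabytes; the "reduced" variants of each
- * size use one fewer chapter.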
- */ - - if (mem_gb == UDS_MEMORY_CONFIG_256MB) { - chapters_per_volume = DEFAULT_CHAPTERS_PER_VOLUME; - record_pages_per_chapter = SMALL_RECORD_PAGES_PER_CHAPTER; - } else if (mem_gb == UDS_MEMORY_CONFIG_512MB) { - chapters_per_volume = DEFAULT_CHAPTERS_PER_VOLUME; - record_pages_per_chapter = 2 * SMALL_RECORD_PAGES_PER_CHAPTER; - } else if (mem_gb == UDS_MEMORY_CONFIG_768MB) { - chapters_per_volume = DEFAULT_CHAPTERS_PER_VOLUME; - record_pages_per_chapter = 3 * SMALL_RECORD_PAGES_PER_CHAPTER; - } else if ((mem_gb >= 1) && (mem_gb <= UDS_MEMORY_CONFIG_MAX)) { - chapters_per_volume = mem_gb * DEFAULT_CHAPTERS_PER_VOLUME; - record_pages_per_chapter = DEFAULT_RECORD_PAGES_PER_CHAPTER; - } else if (mem_gb == UDS_MEMORY_CONFIG_REDUCED_256MB) { - chapters_per_volume = DEFAULT_CHAPTERS_PER_VOLUME - 1; - record_pages_per_chapter = SMALL_RECORD_PAGES_PER_CHAPTER; - } else if (mem_gb == UDS_MEMORY_CONFIG_REDUCED_512MB) { - chapters_per_volume = DEFAULT_CHAPTERS_PER_VOLUME - 1; - record_pages_per_chapter = 2 * SMALL_RECORD_PAGES_PER_CHAPTER; - } else if (mem_gb == UDS_MEMORY_CONFIG_REDUCED_768MB) { - chapters_per_volume = DEFAULT_CHAPTERS_PER_VOLUME - 1; - record_pages_per_chapter = 3 * SMALL_RECORD_PAGES_PER_CHAPTER; - } else if ((mem_gb >= 1 + UDS_MEMORY_CONFIG_REDUCED) && - (mem_gb <= UDS_MEMORY_CONFIG_REDUCED_MAX)) { - chapters_per_volume = (mem_gb - UDS_MEMORY_CONFIG_REDUCED) * - DEFAULT_CHAPTERS_PER_VOLUME - 1; - record_pages_per_chapter = DEFAULT_RECORD_PAGES_PER_CHAPTER; - } else { - uds_log_error("received invalid memory size"); - return -EINVAL; - } - - result = UDS_ALLOCATE(1, struct uds_configuration, "uds_configuration", - user_config); - if (result != UDS_SUCCESS) { - return uds_map_to_system_error(result); - } - - (*user_config)->record_pages_per_chapter = record_pages_per_chapter; - (*user_config)->chapters_per_volume = chapters_per_volume; - (*user_config)->sparse_chapters_per_volume = - DEFAULT_SPARSE_CHAPTERS_PER_VOLUME; - (*user_config)->cache_chapters = DEFAULT_CACHE_CHAPTERS; - (*user_config)->checkpoint_frequency = DEFAULT_CHECKPOINT_FREQUENCY; - (*user_config)->volume_index_mean_delta = - DEFAULT_VOLUME_INDEX_MEAN_DELTA; - (*user_config)->bytes_per_page = DEFAULT_BYTES_PER_PAGE; - (*user_config)->sparse_sample_rate = DEFAULT_SPARSE_SAMPLE_RATE; - (*user_config)->nonce = 0; - return UDS_SUCCESS; -} - -/**********************************************************************/ -void uds_configuration_set_sparse(struct uds_configuration *user_config, - bool sparse) -{ - unsigned int prev_chapters_per_volume; - unsigned int reduced_chapters; - bool prev_sparse = (user_config->sparse_chapters_per_volume != 0); - if (sparse == prev_sparse) { - // nothing to do - return; - } - - // Compute pre-conversion chapter count for sizing. 
- reduced_chapters = user_config->chapters_per_volume % 2; - prev_chapters_per_volume = - user_config->chapters_per_volume + reduced_chapters; - if (sparse) { - // Index 10TB with 4K blocks, 95% sparse, fit in dense (1TB) - // footprint - user_config->chapters_per_volume = - (10 * prev_chapters_per_volume) - reduced_chapters; - user_config->sparse_chapters_per_volume = - 9 * prev_chapters_per_volume + - prev_chapters_per_volume / 2; - user_config->sparse_sample_rate = 32; - } else { - user_config->chapters_per_volume = - (prev_chapters_per_volume / 10) - reduced_chapters; - user_config->sparse_chapters_per_volume = 0; - user_config->sparse_sample_rate = 0; - } -} - -/**********************************************************************/ -bool uds_configuration_get_sparse(struct uds_configuration *user_config) -{ - return user_config->sparse_chapters_per_volume > 0; -} - -/**********************************************************************/ -void uds_configuration_set_nonce(struct uds_configuration *user_config, - uds_nonce_t nonce) -{ - user_config->nonce = nonce; -} - -/**********************************************************************/ -uds_nonce_t uds_configuration_get_nonce(struct uds_configuration *user_config) -{ - return user_config->nonce; -} - -/**********************************************************************/ -unsigned int -uds_configuration_get_memory(struct uds_configuration *user_config) -{ - unsigned int memory = 0; - unsigned int chapters = uds_configuration_get_sparse(user_config) ? - user_config->chapters_per_volume / 10 : - user_config->chapters_per_volume; - - if ((chapters % DEFAULT_CHAPTERS_PER_VOLUME) == 0) { - switch (user_config->record_pages_per_chapter) { - case SMALL_RECORD_PAGES_PER_CHAPTER: - memory = UDS_MEMORY_CONFIG_256MB; - break; - case 2 * SMALL_RECORD_PAGES_PER_CHAPTER: - memory = UDS_MEMORY_CONFIG_512MB; - break; - case 3 * SMALL_RECORD_PAGES_PER_CHAPTER: - memory = UDS_MEMORY_CONFIG_768MB; - break; - default: - memory = chapters / DEFAULT_CHAPTERS_PER_VOLUME; - } - } else { - switch (user_config->record_pages_per_chapter) { - case SMALL_RECORD_PAGES_PER_CHAPTER: - memory = UDS_MEMORY_CONFIG_REDUCED_256MB; - break; - case 2 * SMALL_RECORD_PAGES_PER_CHAPTER: - memory = UDS_MEMORY_CONFIG_REDUCED_512MB; - break; - case 3 * SMALL_RECORD_PAGES_PER_CHAPTER: - memory = UDS_MEMORY_CONFIG_REDUCED_768MB; - break; - default: - memory = (chapters + 1) / DEFAULT_CHAPTERS_PER_VOLUME + - UDS_MEMORY_CONFIG_REDUCED; - } - } - return memory; -} - -/**********************************************************************/ -unsigned int -uds_configuration_get_chapters_per_volume(struct uds_configuration *user_config) -{ - return user_config->chapters_per_volume; -} - -/**********************************************************************/ -void uds_free_configuration(struct uds_configuration *user_config) -{ - UDS_FREE(user_config); -} - -/**********************************************************************/ -int uds_create_index_session(struct uds_index_session **session) -{ - struct uds_index_session *index_session = NULL; - int result; - if (session == NULL) { - uds_log_error("missing session pointer"); - return -EINVAL; - } - - result = make_empty_index_session(&index_session); - if (result != UDS_SUCCESS) { - return uds_map_to_system_error(result); - } - - *session = index_session; - return UDS_SUCCESS; -} - -/**********************************************************************/ -static int -initialize_index_session_with_layout(struct 
uds_index_session *index_session, - struct index_layout *layout, - const struct uds_parameters *user_params, - enum load_type load_type) -{ - struct configuration *index_config; - int result = ((load_type == LOAD_CREATE) ? - write_uds_index_config(layout, - &index_session->user_config, 0) : - verify_uds_index_config(layout, - &index_session->user_config)); - if (result != UDS_SUCCESS) { - return result; - } - - result = make_configuration(&index_session->user_config, - &index_config); - if (result != UDS_SUCCESS) { - uds_log_error_strerror(result, "Failed to allocate config"); - return result; - } - - // Zero the stats for the new index. - memset(&index_session->stats, 0, sizeof(index_session->stats)); - - result = make_index(layout, - index_config, - user_params, - load_type, - &index_session->load_context, - enter_callback_stage, - &index_session->index); - free_configuration(index_config); - if (result != UDS_SUCCESS) { - uds_log_error_strerror(result, "Failed to make index"); - return result; - } - - log_uds_configuration(&index_session->user_config); - return UDS_SUCCESS; -} - -/**********************************************************************/ -static int initialize_index_session(struct uds_index_session *index_session, - const char *name, - const struct uds_parameters *user_params, - enum load_type load_type) -{ - struct index_layout *layout; - int result = make_uds_index_layout(name, - load_type == LOAD_CREATE, - &index_session->user_config, - &layout); - if (result != UDS_SUCCESS) { - return result; - } - - result = initialize_index_session_with_layout(index_session, layout, - user_params, load_type); - put_uds_index_layout(layout); - return result; -} - -/**********************************************************************/ -int uds_open_index(enum uds_open_index_type open_type, - const char *name, - const struct uds_parameters *user_params, - struct uds_configuration *user_config, - struct uds_index_session *session) -{ - int result; - enum load_type load_type; - - if (name == NULL) { - uds_log_error("missing required index name"); - return -EINVAL; - } - if (user_config == NULL) { - uds_log_error("missing required configuration"); - return -EINVAL; - } - if (session == NULL) { - uds_log_error("missing required session pointer"); - return -EINVAL; - } - - result = start_loading_index_session(session); - if (result != UDS_SUCCESS) { - return uds_map_to_system_error(result); - } - - session->user_config = *user_config; - - // Map the external open_type to the internal load_type - load_type = open_type == UDS_CREATE ? - LOAD_CREATE : - open_type == UDS_NO_REBUILD ? 
LOAD_LOAD : LOAD_REBUILD; - uds_log_notice("%s: %s", get_load_type(load_type), name); - - result = initialize_index_session(session, name, user_params, - load_type); - if (result != UDS_SUCCESS) { - uds_log_error_strerror(result, "Failed %s", - get_load_type(load_type)); - save_and_free_index(session); - } - - finish_loading_index_session(session, result); - return uds_map_to_system_error(result); -} - -/**********************************************************************/ -const char *uds_get_version(void) -{ -#ifdef UDS_VERSION - return UDS_VERSION; -#else - return "internal version"; -#endif -} - -/**********************************************************************/ -const char *uds_string_error(int errnum, char *buf, size_t buflen) -{ - if (buf == NULL) { - return NULL; - } - - return string_error(errnum, buf, buflen); -} diff --git a/uds/udsModule.c b/uds/udsModule.c deleted file mode 100644 index 47aab872..00000000 --- a/uds/udsModule.c +++ /dev/null @@ -1,177 +0,0 @@ -/* - * Copyright Red Hat - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/uds-releases/krusty/kernelLinux/uds/udsModule.c#125 $ - */ - -#include - -#include "buffer.h" -#include "errors.h" -#include "logger.h" -#include "memoryAlloc.h" -#include "murmur/MurmurHash3.h" -#include "sysfs.h" -#include "threadDevice.h" -#include "threadOnce.h" -#include "timeUtils.h" -#include "uds.h" -#include "util/funnelQueue.h" - -/**********************************************************************/ -static int __init dedupe_init(void) -{ - uds_initialize_thread_device_registry(); - uds_memory_init(); - uds_log_info("loaded version %s", UDS_VERSION); - init_uds_sysfs(); - return 0; -} - -/**********************************************************************/ -static void __exit dedupe_exit(void) -{ - put_uds_sysfs(); - uds_memory_exit(); - uds_log_info("unloaded version %s", UDS_VERSION); -} - -/**********************************************************************/ -module_init(dedupe_init); -module_exit(dedupe_exit); - -EXPORT_SYMBOL_GPL(UDS_MEMORY_CONFIG_256MB); -EXPORT_SYMBOL_GPL(UDS_MEMORY_CONFIG_512MB); -EXPORT_SYMBOL_GPL(UDS_MEMORY_CONFIG_768MB); -EXPORT_SYMBOL_GPL(UDS_MEMORY_CONFIG_MAX); -EXPORT_SYMBOL_GPL(UDS_MEMORY_CONFIG_REDUCED); -EXPORT_SYMBOL_GPL(UDS_MEMORY_CONFIG_REDUCED_256MB); -EXPORT_SYMBOL_GPL(UDS_MEMORY_CONFIG_REDUCED_512MB); -EXPORT_SYMBOL_GPL(UDS_MEMORY_CONFIG_REDUCED_768MB); -EXPORT_SYMBOL_GPL(UDS_MEMORY_CONFIG_REDUCED_MAX); -EXPORT_SYMBOL_GPL(uds_close_index); -EXPORT_SYMBOL_GPL(uds_compute_index_size); -EXPORT_SYMBOL_GPL(uds_configuration_get_chapters_per_volume); -EXPORT_SYMBOL_GPL(uds_configuration_get_memory); -EXPORT_SYMBOL_GPL(uds_configuration_get_nonce); -EXPORT_SYMBOL_GPL(uds_configuration_get_sparse); -EXPORT_SYMBOL_GPL(uds_configuration_set_nonce); -EXPORT_SYMBOL_GPL(uds_configuration_set_sparse); 
-EXPORT_SYMBOL_GPL(uds_create_index_session); -EXPORT_SYMBOL_GPL(uds_destroy_index_session); -EXPORT_SYMBOL_GPL(uds_flush_index_session); -EXPORT_SYMBOL_GPL(uds_free_configuration); -EXPORT_SYMBOL_GPL(uds_get_index_configuration); -EXPORT_SYMBOL_GPL(uds_get_index_stats); -EXPORT_SYMBOL_GPL(uds_get_version); -EXPORT_SYMBOL_GPL(uds_initialize_configuration); -EXPORT_SYMBOL_GPL(uds_open_index); -EXPORT_SYMBOL_GPL(uds_resume_index_session); -EXPORT_SYMBOL_GPL(uds_start_chunk_operation); -EXPORT_SYMBOL_GPL(uds_string_error); -EXPORT_SYMBOL_GPL(uds_suspend_index_session); - -EXPORT_SYMBOL_GPL(__uds_log_message); -EXPORT_SYMBOL_GPL(__uds_log_strerror); -EXPORT_SYMBOL_GPL(available_space); -EXPORT_SYMBOL_GPL(buffer_length); -EXPORT_SYMBOL_GPL(buffer_used); -EXPORT_SYMBOL_GPL(clear_buffer); -EXPORT_SYMBOL_GPL(compact_buffer); -EXPORT_SYMBOL_GPL(content_length); -EXPORT_SYMBOL_GPL(copy_bytes); -EXPORT_SYMBOL_GPL(current_time_us); -EXPORT_SYMBOL_GPL(ensure_available_space); -EXPORT_SYMBOL_GPL(equal_buffers); -EXPORT_SYMBOL_GPL(free_buffer); -EXPORT_SYMBOL_GPL(free_funnel_queue); -EXPORT_SYMBOL_GPL(funnel_queue_poll); -EXPORT_SYMBOL_GPL(get_boolean); -EXPORT_SYMBOL_GPL(get_buffer_contents); -EXPORT_SYMBOL_GPL(get_byte); -EXPORT_SYMBOL_GPL(get_bytes_from_buffer); -EXPORT_SYMBOL_GPL(get_uds_log_level); -EXPORT_SYMBOL_GPL(get_uds_memory_stats); -EXPORT_SYMBOL_GPL(get_uint16_le_from_buffer); -EXPORT_SYMBOL_GPL(get_uint16_les_from_buffer); -EXPORT_SYMBOL_GPL(get_uint32_le_from_buffer); -EXPORT_SYMBOL_GPL(get_uint64_le_from_buffer); -EXPORT_SYMBOL_GPL(get_uint64_les_from_buffer); -EXPORT_SYMBOL_GPL(has_same_bytes); -EXPORT_SYMBOL_GPL(is_funnel_queue_empty); -EXPORT_SYMBOL_GPL(make_buffer); -EXPORT_SYMBOL_GPL(make_funnel_queue); -EXPORT_SYMBOL_GPL(MurmurHash3_x64_128); -EXPORT_SYMBOL_GPL(perform_once); -EXPORT_SYMBOL_GPL(put_boolean); -EXPORT_SYMBOL_GPL(put_buffer); -EXPORT_SYMBOL_GPL(put_byte); -EXPORT_SYMBOL_GPL(put_bytes); -EXPORT_SYMBOL_GPL(put_int64_le_into_buffer); -EXPORT_SYMBOL_GPL(put_uint16_le_into_buffer); -EXPORT_SYMBOL_GPL(put_uint16_les_into_buffer); -EXPORT_SYMBOL_GPL(put_uint32_le_into_buffer); -EXPORT_SYMBOL_GPL(put_uint64_le_into_buffer); -EXPORT_SYMBOL_GPL(put_uint64_les_into_buffer); -EXPORT_SYMBOL_GPL(register_error_block); -EXPORT_SYMBOL_GPL(report_uds_memory_usage); -EXPORT_SYMBOL_GPL(reset_buffer_end); -EXPORT_SYMBOL_GPL(rewind_buffer); -EXPORT_SYMBOL_GPL(set_uds_log_level); -EXPORT_SYMBOL_GPL(skip_forward); -EXPORT_SYMBOL_GPL(string_error); -EXPORT_SYMBOL_GPL(string_error_name); -EXPORT_SYMBOL_GPL(uds_alloc_sprintf); -EXPORT_SYMBOL_GPL(uds_allocate_memory); -EXPORT_SYMBOL_GPL(uds_allocate_memory_nowait); -EXPORT_SYMBOL_GPL(uds_append_to_buffer); -EXPORT_SYMBOL_GPL(uds_assertion_failed); -EXPORT_SYMBOL_GPL(uds_duplicate_string); -EXPORT_SYMBOL_GPL(uds_fixed_sprintf); -EXPORT_SYMBOL_GPL(uds_free_memory); -EXPORT_SYMBOL_GPL(uds_get_thread_device_id); -EXPORT_SYMBOL_GPL(uds_initialize_thread_registry); -EXPORT_SYMBOL_GPL(uds_log_backtrace); -EXPORT_SYMBOL_GPL(uds_log_priority_to_string); -EXPORT_SYMBOL_GPL(uds_log_string_to_priority); -EXPORT_SYMBOL_GPL(uds_lookup_thread); -EXPORT_SYMBOL_GPL(uds_parse_uint64); -EXPORT_SYMBOL_GPL(uds_pause_for_logger); -EXPORT_SYMBOL_GPL(uds_reallocate_memory); -EXPORT_SYMBOL_GPL(uds_register_allocating_thread); -EXPORT_SYMBOL_GPL(uds_register_thread); -EXPORT_SYMBOL_GPL(uds_register_thread_device_id); -EXPORT_SYMBOL_GPL(uds_string_to_unsigned_long); -EXPORT_SYMBOL_GPL(uds_unregister_allocating_thread); -EXPORT_SYMBOL_GPL(uds_unregister_thread); 
-EXPORT_SYMBOL_GPL(uds_unregister_thread_device_id); -EXPORT_SYMBOL_GPL(uds_v_append_to_buffer); -EXPORT_SYMBOL_GPL(uds_vlog_strerror); -EXPORT_SYMBOL_GPL(uncompacted_amount); -EXPORT_SYMBOL_GPL(wrap_buffer); -EXPORT_SYMBOL_GPL(zero_bytes); - -/**********************************************************************/ - - -/**********************************************************************/ - -MODULE_DESCRIPTION("deduplication engine"); -MODULE_AUTHOR("Red Hat, Inc."); -MODULE_LICENSE("GPL"); -MODULE_VERSION(UDS_VERSION); diff --git a/uds/util/eventCount.c b/uds/util/eventCount.c deleted file mode 100644 index af7cc087..00000000 --- a/uds/util/eventCount.c +++ /dev/null @@ -1,330 +0,0 @@ -/* - * Copyright Red Hat - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/uds-releases/krusty/src/uds/util/eventCount.c#17 $ - */ - -/** - * This event count implementation uses a posix semaphore for portability, - * although a futex would be slightly superior to use and easy to substitute. - * It is designed to make signalling as cheap as possible, since that is the - * code path likely triggered on most updates to a lock-free data structure. - * Waiters are likely going to sleep, so optimizing for that case isn't - * necessary. - * - * The critical field is the state, which is really two fields that can be - * atomically updated in unison: an event counter and a waiter count. Every - * call to event_count_prepare() issues a wait token by atomically incrementing - * the waiter count. The key invariant is a strict accounting of the number of - * tokens issued. Every token returned by event_count_prepare() is a contract - * that the caller will call uds_acquire_semaphore() and a signaller will call - * uds_release_semaphore(), each exactly once. Atomic updates to the state - * field ensure that each token is counted once and that tokens are not lost. - * Cancelling a token attempts to take a fast-path by simply decrementing the - * waiters field, but if the token has already been claimed by a signaller, - * the canceller must still wait on the semaphore to consume the transferred - * token. - * - * The state field is 64 bits, partitioned into a 16-bit waiter field and a - * 48-bit counter. We are unlikely to have 2^16 threads, much less 2^16 - * threads waiting on any single event transition. 2^48 microseconds is - * several years, so a token holder would have to wait that long for the - * counter to wrap around, and then call event_count_wait() at the exact right - * time to see the re-used counter, in order to lose a wakeup due to counter - * wrap-around. Using a 32-bit state field would greatly increase that chance, - * but if forced to do so, the implementation could likely tolerate it since - * callers are supposed to hold tokens for miniscule periods of time. 
- * Fortunately, x64 has 64-bit compare-and-swap, and the performance of - * interlocked 64-bit operations appears to be about the same as for 32-bit - * ones, so being paranoid and using 64 bits costs us nothing. - * - * Here are some sequences of calls and state transitions: - * - * action postcondition - * counter waiters semaphore - * initialized 0 0 0 - * prepare 0 1 0 - * wait (blocks) 0 1 0 - * signal 1 0 1 - * wait (unblocks) 1 0 0 - * - * signal (fast-path) 1 0 0 - * signal (fast-path) 1 0 0 - * - * prepare A 1 1 0 - * prepare B 1 2 0 - * signal 2 0 2 - * wait B (fast-path) 2 0 1 - * wait A (fast-path) 2 0 0 - * - * prepare 2 1 0 - * cancel (fast-path) 2 0 0 - * - * prepare 2 1 0 - * signal 3 0 1 - * cancel (must wait) 3 0 0 - * - * The event count structure is aligned, sized, and allocated to cache line - * boundaries to avoid any false sharing between the event count and other - * shared state. The state field and semaphore should fit on a single cache - * line. The instrumentation counters increase the size of the structure so it - * rounds up to use two (64-byte x86) cache lines. - * - * XXX Need interface to access or display instrumentation counters. - **/ - -#include "eventCount.h" - -#include - -#include "common.h" -#include "compiler.h" -#include "cpu.h" -#include "logger.h" -#include "memoryAlloc.h" -#include "uds-threads.h" - -enum { - ONE_WAITER = 1, // value used to increment the waiters field - ONE_EVENT = (1 << 16), // value used to increment the event counter - WAITERS_MASK = (ONE_EVENT - 1), // bit mask to access the waiters field - EVENTS_MASK = ~WAITERS_MASK, // bit mask to access the event counter -}; - -struct event_count { - // Atomically mutable state: - // low 16 bits: the number of wait tokens not posted to the semaphore - // high 48 bits: current event counter - atomic64_t state; - - // Semaphore used to block threads when waiting is required. - struct semaphore semaphore; - - // Instrumentation counters. - - // Declare alignment so we don't share a cache line. -} __attribute__((aligned(CACHE_LINE_BYTES))); - -/** - * Test the event field in two tokens for equality. - * - * @return true iff the tokens contain the same event field value - **/ -static INLINE bool same_event(event_token_t token1, event_token_t token2) -{ - return ((token1 & EVENTS_MASK) == (token2 & EVENTS_MASK)); -} - -/**********************************************************************/ -void event_count_broadcast(struct event_count *ec) -{ - uint64_t waiters, state, old_state; - - // Even if there are no waiters (yet), we will need a memory barrier. - smp_mb(); - - state = old_state = atomic64_read(&ec->state); - do { - event_token_t new_state; - // Check if there are any tokens that have not yet been been - // transferred to the semaphore. This is the fast no-waiters - // path. - waiters = (state & WAITERS_MASK); - if (waiters == 0) { - // Fast path first time through--no need to signal or - // post if there are no observers. - return; - } - - /* - * Attempt to atomically claim all the wait tokens and bump the - * event count using an atomic compare-and-swap. This - * operation contains a memory barrier. - */ - new_state = ((state & ~WAITERS_MASK) + ONE_EVENT); - old_state = state; - state = atomic64_cmpxchg(&ec->state, old_state, new_state); - // The cmpxchg fails when we lose a race with a new waiter or - // another signaller, so try again. - } while (unlikely(state != old_state)); - - - /* - * Wake the waiters by posting to the semaphore. 
This effectively - * transfers the wait tokens to the semaphore. There's sadly no bulk - * post for posix semaphores, so we've got to loop to do them all. - */ - while (waiters-- > 0) { - uds_release_semaphore(&ec->semaphore); - } -} - -/** - * Attempt to cancel a prepared wait token by decrementing the - * number of waiters in the current state. This can only be done - * safely if the event count hasn't been bumped. - * - * @param ec the event count on which the wait token was issued - * @param token the wait to cancel - * - * @return true if the wait was cancelled, false if the caller must - * still wait on the semaphore - **/ -static INLINE bool fast_cancel(struct event_count *ec, event_token_t token) -{ - event_token_t current_token = atomic64_read(&ec->state); - while (same_event(current_token, token)) { - // Try to decrement the waiter count via compare-and-swap as if - // we had never prepared to wait. - event_token_t et = atomic64_cmpxchg(&ec->state, - current_token, - current_token - 1); - if (et == current_token) { - return true; - } - current_token = et; - } - return false; -} - -/** - * Consume a token from the semaphore, waiting (with an optional timeout) if - * one is not currently available. Also attempts to count the number of times - * we'll actually have to wait because there are no tokens (permits) available - * in the semaphore, and the number of times the wait times out. - * - * @param ec the event count instance - * @param timeout an optional timeout value to pass to uds_attempt_semaphore() - * - * @return true if a token was consumed, otherwise false only if a timeout - * was specified and we timed out - **/ -static bool consume_wait_token(struct event_count *ec, - const ktime_t *timeout) -{ - // Try to grab a token without waiting. - if (uds_attempt_semaphore(&ec->semaphore, 0)) { - return true; - } - - - if (timeout == NULL) { - uds_acquire_semaphore(&ec->semaphore); - } else if (!uds_attempt_semaphore(&ec->semaphore, *timeout)) { - return false; - } - return true; -} - -/**********************************************************************/ -int make_event_count(struct event_count **ec_ptr) -{ - // The event count will be allocated on a cache line boundary so there - // will not be false sharing of the line with any other data structure. - struct event_count *ec = NULL; - int result = UDS_ALLOCATE(1, struct event_count, "event count", &ec); - if (result != UDS_SUCCESS) { - return result; - } - - atomic64_set(&ec->state, 0); - result = uds_initialize_semaphore(&ec->semaphore, 0); - if (result != UDS_SUCCESS) { - UDS_FREE(ec); - return result; - } - - *ec_ptr = ec; - return UDS_SUCCESS; -} - -/**********************************************************************/ -void free_event_count(struct event_count *ec) -{ - if (ec == NULL) { - return; - } - uds_destroy_semaphore(&ec->semaphore); - UDS_FREE(ec); -} - -/**********************************************************************/ -event_token_t event_count_prepare(struct event_count *ec) -{ - return atomic64_add_return(ONE_WAITER, &ec->state); -} - -/**********************************************************************/ -void event_count_cancel(struct event_count *ec, event_token_t token) -{ - // Decrement the waiter count if the event hasn't been signalled. - if (fast_cancel(ec, token)) { - return; - } - // A signaller has already transferred (or promised to transfer) our - // token to the semaphore, so we must consume it from the semaphore by - // waiting. 
- event_count_wait(ec, token, NULL); -} - -/**********************************************************************/ -bool event_count_wait(struct event_count *ec, - event_token_t token, - const ktime_t *timeout) -{ - - for (;;) { - // Wait for a signaller to transfer our wait token to the - // semaphore. - if (!consume_wait_token(ec, timeout)) { - // The wait timed out, so we must cancel the token - // instead. Try to decrement the waiter count if the - // event hasn't been signalled. - if (fast_cancel(ec, token)) { - return false; - } - /* - * We timed out, but a signaller came in before we - * could cancel the wait. We have no choice but to wait - * for the semaphore to be posted. Since signaller has - * promised to do it, the wait will be short. The - * timeout and the signal happened at about the same - * time, so either outcome could be returned. It's - * simpler to ignore the timeout. - */ - timeout = NULL; - continue; - } - - // A wait token has now been consumed from the semaphore. - - // Stop waiting if the count has changed since the token was - // acquired. - if (!same_event(token, atomic64_read(&ec->state))) { - return true; - } - - // We consumed someone else's wait token. Put it back in the - // semaphore, which will wake another waiter, hopefully one who - // can stop waiting. - uds_release_semaphore(&ec->semaphore); - - // Attempt to give an earlier waiter a shot at the semaphore. - uds_yield_scheduler(); - } -} diff --git a/uds/util/eventCount.h b/uds/util/eventCount.h deleted file mode 100644 index 6ba49b23..00000000 --- a/uds/util/eventCount.h +++ /dev/null @@ -1,128 +0,0 @@ -/* - * Copyright Red Hat - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/uds-releases/krusty/src/uds/util/eventCount.h#9 $ - */ - -#ifndef EVENT_COUNT_H -#define EVENT_COUNT_H - -#include "timeUtils.h" -#include "typeDefs.h" - -/** - * An event count is a lock-free equivalent of a condition variable. - * - * Using an event count, a lock-free producer/consumer can wait for a state - * change (adding an item to an empty queue, for example) without spinning or - * falling back on the use of mutex-based locks. Signalling is cheap when - * there are no waiters (a memory fence), and preparing to wait is - * also inexpensive (an atomic add instruction). - * - * A lock-free producer should call event_count_broadcast() after any mutation - * to the lock-free data structure that a consumer might be waiting on. The - * consumers should poll for work like this: - * - * for (;;) { - * // Fast path--no additional cost to consumer. - * if (lockFreeDequeue(&item)) { - * return item; - * } - * // Two-step wait: get current token and poll state, either cancelling - * // the wait or waiting for the token to be signalled. 
- * event_token_t token = event_count_prepare(ec); - * if (lockFreeDequeue(&item)) { - * event_count_cancel(ec, token); - * return item; - * } - * event_count_wait(ec, token, NULL); - * // State has changed, but must check condition again, so loop. - * } - * - * Once event_count_prepare() is called, the caller should neither dally, - * sleep, nor perform long-running or blocking actions before passing the token - * to event_count_cancel() or event_count_wait(). The implementation is - * xoptimized for a short polling window, and will not perform well if there - * are outstanding tokens that have been signalled but not waited upon. - **/ - -struct event_count; - -typedef unsigned int event_token_t; - -/** - * Allocate and initialize a struct event_count. - * - * @param ec_ptr a pointer to hold the new struct event_count - **/ -int __must_check make_event_count(struct event_count **ec_ptr); - -/** - * Free a struct event_count. It must no longer be in use. - * - * @param ec the struct event_count to free - **/ -void free_event_count(struct event_count *ec); - -/** - * Wake all threads that are waiting for the next event. - * - * @param ec the struct event_count to signal - **/ -void event_count_broadcast(struct event_count *ec); - -/** - * Prepare to wait for the event count to change by capturing a token of its - * current state. The caller MUST eventually either call event_count_wait() or - * event_count_cancel() exactly once for each token obtained. - * - * @param ec the struct event_count on which to prepare to wait - * - * @return an event_token_t to be passed to the next event_count_wait() call - **/ -event_token_t __must_check event_count_prepare(struct event_count *ec); - -/** - * Cancel a wait token that has been prepared but not waited upon. This must - * be called after event_count_prepare() when event_count_wait() is not going to - * be invoked on the token. - * - * @param ec the struct event_count from which a wait token was obtained - * @param token the wait token that will never be passed to event_count_wait() - **/ -void event_count_cancel(struct event_count *ec, event_token_t token); - -/** - * Check if the current event count state corresponds to the provided token, - * and if it is, wait for a signal that the state has changed. If an optional - * timeout is provided, the wait will terminate after the timeout has elapsed. - * Timing out automatically cancels the wait token, so callers must not - * attempt to cancel the token on timeout. - * - * @param ec the struct event_count on which to wait - * @param token the event_token_t returned by event_count_prepare() - * @param timeout either NULL or a nanosecond timeout for the wait operation - * - * @return true if the state has already changed or if signalled, otherwise - * false if a timeout was provided and the wait timed out - **/ -bool event_count_wait(struct event_count *ec, - event_token_t token, - const ktime_t *timeout); - -#endif /* EVENT_COUNT_H */ diff --git a/uds/util/funnelQueue.c b/uds/util/funnelQueue.c deleted file mode 100644 index 6dde6c8a..00000000 --- a/uds/util/funnelQueue.c +++ /dev/null @@ -1,178 +0,0 @@ -/* - * Copyright Red Hat - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. 
- * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/uds-releases/krusty/src/uds/util/funnelQueue.c#12 $ - */ - -#include "funnelQueue.h" - -#include "memoryAlloc.h" -#include "permassert.h" -#include "uds.h" - -/**********************************************************************/ -int make_funnel_queue(struct funnel_queue **queue_ptr) -{ - // Allocate the queue on a cache line boundary so the producer and - // consumer fields in the structure will land on separate cache lines. - struct funnel_queue *queue; - int result = UDS_ALLOCATE(1, struct funnel_queue, "funnel queue", - &queue); - if (result != UDS_SUCCESS) { - return result; - } - - // Initialize the stub entry and put it in the queue, establishing the - // invariant that queue->newest and queue->oldest are never null. - queue->stub.next = NULL; - queue->newest = &queue->stub; - queue->oldest = &queue->stub; - - *queue_ptr = queue; - return UDS_SUCCESS; -} - -/**********************************************************************/ -void free_funnel_queue(struct funnel_queue *queue) -{ - UDS_FREE(queue); -} - -/**********************************************************************/ -static struct funnel_queue_entry *get_oldest(struct funnel_queue *queue) -{ - /* - * Barrier requirements: We need a read barrier between reading a - * "next" field pointer value and reading anything it points to. - * There's an accompanying barrier in funnel_queue_put between its - * caller setting up the entry and making it visible. - */ - struct funnel_queue_entry *oldest = queue->oldest; - struct funnel_queue_entry *next = oldest->next; - - if (oldest == &queue->stub) { - // When the oldest entry is the stub and it has no successor, - // the queue is logically empty. - if (next == NULL) { - return NULL; - } - // The stub entry has a successor, so the stub can be dequeued - // and ignored without breaking the queue invariants. - oldest = next; - queue->oldest = oldest; - // XXX Some platforms such as Alpha may require an - // additional barrier here. See - // https://lkml.org/lkml/2019/11/8/1021 - next = oldest->next; - } - - // We have a non-stub candidate to dequeue. If it lacks a successor, - // we'll need to put the stub entry back on the queue first. - if (next == NULL) { - struct funnel_queue_entry *newest = queue->newest; - if (oldest != newest) { - // Another thread has already swung queue->newest - // atomically, but not yet assigned previous->next. The - // queue is really still empty. - return NULL; - } - - // Put the stub entry back on the queue, ensuring a successor - // will eventually be seen. - funnel_queue_put(queue, &queue->stub); - - // Check again for a successor. - next = oldest->next; - if (next == NULL) { - // We lost a race with a producer who swapped - // queue->newest before we did, but who hasn't yet - // updated previous->next. Try again later. 
- return NULL; - } - } - return oldest; -} - -/**********************************************************************/ -struct funnel_queue_entry *funnel_queue_poll(struct funnel_queue *queue) -{ - struct funnel_queue_entry *oldest = get_oldest(queue); - if (oldest == NULL) { - return oldest; - } - - /* - * Dequeue the oldest entry and return it. Only one consumer thread may - * call this function, so no locking, atomic operations, or fences are - * needed; queue->oldest is owned by the consumer and oldest->next is - * never used by a producer thread after it is swung from NULL to - * non-NULL. - */ - queue->oldest = oldest->next; - /* - * Make sure the caller sees the proper stored data for this entry. - * - * Since we've already fetched the entry pointer we stored in - * "queue->oldest", this also ensures that on entry to the next call - * we'll properly see the dependent data. - */ - smp_rmb(); - /* - * If "oldest" is a very light-weight work item, we'll be looking - * for the next one very soon, so prefetch it now. - */ - prefetch_address(queue->oldest, true); - oldest->next = NULL; - return oldest; -} - -/**********************************************************************/ -bool is_funnel_queue_empty(struct funnel_queue *queue) -{ - return get_oldest(queue) == NULL; -} - -/**********************************************************************/ -bool is_funnel_queue_idle(struct funnel_queue *queue) -{ - /* - * Oldest is not the stub, so there's another entry, though if next is - * NULL we can't retrieve it yet. - */ - if (queue->oldest != &queue->stub) { - return false; - } - - /* - * Oldest is the stub, but newest has been updated by _put(); either - * there's another, retrievable entry in the list, or the list is - * officially empty but in the intermediate state of having an entry - * added. - * - * Whether anything is retrievable depends on whether stub.next has - * been updated and become visible to us, but for idleness we don't - * care. And due to memory ordering in _put(), the update to newest - * would be visible to us at the same time or sooner. - */ - if (queue->newest != &queue->stub) { - return false; - } - - // Otherwise, we're idle. - return true; -} diff --git a/uds/util/funnelQueue.h b/uds/util/funnelQueue.h deleted file mode 100644 index a8370cd6..00000000 --- a/uds/util/funnelQueue.h +++ /dev/null @@ -1,203 +0,0 @@ -/* - * Copyright Red Hat - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/uds-releases/krusty/src/uds/util/funnelQueue.h#11 $ - */ - -#ifndef FUNNEL_QUEUE_H -#define FUNNEL_QUEUE_H - -#include - -#include "compiler.h" -#include "cpu.h" -#include "typeDefs.h" - -/** - * A funnel queue is a simple lock-free (almost) queue that accepts entries - * from multiple threads (multi-producer) and delivers them to a single thread - * (single-consumer). 
"Funnel" is an attempt to evoke the image of requests - * from more than one producer being "funneled down" to a single consumer. - * - * This is an unsynchronized but thread-safe data structure when used as - * intended. There is no mechanism to ensure that only one thread is consuming - * from the queue, so if that is done mistakenly, it will not be trapped, and - * the resulting behavior is undefined. Clients must not directly access or - * manipulate the internals, which are only exposed for the purpose of - * allowing the very simple enqueue operation to be in-lined. - * - * The implementation requires that a funnel_queue_entry structure (a link - * pointer) be embedded in the queue entries, and pointers to those structures - * are used exclusively by the queue. No macros are defined to template the - * queue, so the offset of the funnel_queue_entry in the records placed in the - * queue must all have a fixed offset so the client can derive their structure - * pointer from the entry pointer returned by funnel_queue_poll(). - * - * Callers are wholly responsible for allocating and freeing the entries. - * Entries may be freed as soon as they are returned since this queue is not - * susceptible to the "ABA problem" present in many lock-free data structures. - * The queue is dynamically allocated to ensure cache-line alignment, but no - * other dynamic allocation is used. - * - * The algorithm is not actually 100% lock-free. There is a single point in - * funnel_queue_put() at which a pre-empted producer will prevent the consumers - * from seeing items added to the queue by later producers, and only if the - * queue is short enough or the consumer fast enough for it to reach what was - * the end of the queue at the time of the pre-empt. - * - * The consumer function, funnel_queue_poll(), will return NULL when the queue - * is empty. To wait for data to consume, spin (if safe) or combine the queue - * with an event_count to signal the presence of new entries. - **/ - -/** - * The queue link structure that must be embedded in client entries. - **/ -struct funnel_queue_entry { - // The next (newer) entry in the queue. - struct funnel_queue_entry *volatile next; -}; - -/** - * The dynamically allocated queue structure, which is aligned to a cache line - * boundary when allocated. This should be consider opaque; it is exposed here - * so funnel_queue_put() can be in-lined. - **/ -struct __attribute__((aligned(CACHE_LINE_BYTES))) funnel_queue { - // The producers' end of the queue--an atomically exchanged pointer - // that will never be NULL. - struct funnel_queue_entry *volatile newest; - - // The consumer's end of the queue. Owned by the consumer and never - // NULL. - struct funnel_queue_entry *oldest - __attribute__((aligned(CACHE_LINE_BYTES))); - - // A re-usable dummy entry used to provide the non-NULL invariants - // above. - struct funnel_queue_entry stub; -}; - -/** - * Construct and initialize a new, empty queue. - * - * @param queue_ptr a pointer in which to store the queue - * - * @return UDS_SUCCESS or an error code - **/ -int __must_check make_funnel_queue(struct funnel_queue **queue_ptr); - -/** - * Free a queue. - * - * This will not free any entries in the queue. The caller must ensure that - * either the queue will be empty or that any entries in the queue will not be - * leaked by dropping the references from queue. - * - * @param queue the queue to free - **/ -void free_funnel_queue(struct funnel_queue *queue); - -/** - * Put an entry on the end of the queue. 
- * - * The entry pointer must be to the struct funnel_queue_entry embedded in the - * caller's data structure. The caller must be able to derive the address of - * the start of their data structure from the pointer that passed in here, so - * every entry in the queue must have the struct funnel_queue_entry at the same - * offset within the client's structure. - * - * @param queue the queue on which to place the entry - * @param entry the entry to be added to the queue - **/ -static INLINE void funnel_queue_put(struct funnel_queue *queue, - struct funnel_queue_entry *entry) -{ - struct funnel_queue_entry *previous; - /* - * Barrier requirements: All stores relating to the entry ("next" - * pointer, containing data structure fields) must happen before the - * previous->next store making it visible to the consumer. Also, the - * entry's "next" field initialization to NULL must happen before any - * other producer threads can see the entry (the xchg) and try to - * update the "next" field. - * - * xchg implements a full barrier. - */ - entry->next = NULL; - /* - * The xchg macro in the PPC kernel calls a function that takes a void* - * argument, triggering a warning about dropping the volatile - * qualifier. - */ -#pragma GCC diagnostic push -#if __GNUC__ >= 5 -#pragma GCC diagnostic ignored "-Wdiscarded-qualifiers" -#endif - previous = xchg(&queue->newest, entry); -#pragma GCC diagnostic pop - // Pre-empts between these two statements hide the rest of the queue - // from the consumer, preventing consumption until the following - // assignment runs. - previous->next = entry; -} - -/** - * Poll a queue, removing the oldest entry if the queue is not empty. This - * function must only be called from a single consumer thread. - * - * @param queue the queue from which to remove an entry - * - * @return the oldest entry in the queue, or NULL if the queue is empty. - **/ -struct funnel_queue_entry *__must_check -funnel_queue_poll(struct funnel_queue *queue); - -/** - * Check whether the funnel queue is empty or not. This function must only be - * called from a single consumer thread, as with funnel_queue_poll. - * - * If the queue is in a transition state with one or more entries being added - * such that the list view is incomplete, it may not be possible to retrieve an - * entry with the funnel_queue_poll() function. In such states this function - * will report an empty indication. - * - * @param queue the queue which to check for entries. - * - * @return true iff queue contains no entry which can be retrieved - **/ -bool __must_check is_funnel_queue_empty(struct funnel_queue *queue); - -/** - * Check whether the funnel queue is idle or not. This function must only be - * called from a single consumer thread, as with funnel_queue_poll. - * - * If the queue has entries available to be retrieved, it is not idle. If the - * queue is in a transition state with one or more entries being added such - * that the list view is incomplete, it may not be possible to retrieve an - * entry with the funnel_queue_poll() function, but the queue will not be - * considered idle. - * - * @param queue the queue which to check for entries. 
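A minimal sketch of the embedding pattern described in the header comment above, assuming the kernel's container_of() macro and using hypothetical example_* names: the producer publishes an entry and signals an associated event_count, and the single consumer recovers its containing structure from the entry returned by funnel_queue_poll().

#include "eventCount.h"
#include "funnelQueue.h"

struct example_work_item {
	// Embedded link; must be at the same offset in every queued item.
	struct funnel_queue_entry entry;
	int payload;
};

// Producer side: publish the entry, then wake any sleeping consumer.
static void example_produce(struct funnel_queue *queue,
			    struct event_count *ec,
			    struct example_work_item *item)
{
	funnel_queue_put(queue, &item->entry);
	event_count_broadcast(ec);
}

// Consumer side: map the returned entry back to its containing structure.
static struct example_work_item *example_poll(struct funnel_queue *queue)
{
	struct funnel_queue_entry *entry = funnel_queue_poll(queue);

	return (entry == NULL) ?
		NULL :
		container_of(entry, struct example_work_item, entry);
}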
- * - * @return true iff queue contains no entry which can be retrieved nor is - * known to be having an entry added - **/ -bool __must_check is_funnel_queue_idle(struct funnel_queue *queue); - -#endif /* FUNNEL_QUEUE_H */ diff --git a/uds/volumeIndex005.h b/uds/volumeIndex005.h deleted file mode 100644 index 970a894f..00000000 --- a/uds/volumeIndex005.h +++ /dev/null @@ -1,55 +0,0 @@ -/* - * Copyright Red Hat - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/uds-releases/krusty/src/uds/volumeIndex005.h#3 $ - */ - -#ifndef VOLUMEINDEX005_H -#define VOLUMEINDEX005_H 1 - -#include "volumeIndexOps.h" - -/** - * Make a new volume index. - * - * @param config The configuration of the volume index - * @param num_zones The number of zones - * @param volume_nonce The nonce used to authenticate the index - * @param volume_index Location to hold new volume index ptr - * - * @return error code or UDS_SUCCESS - **/ -int __must_check make_volume_index005(const struct configuration *config, - unsigned int num_zones, - uint64_t volume_nonce, - struct volume_index **volume_index); - -/** - * Compute the number of bytes required to save a volume index of a given - * configuration. - * - * @param config The configuration of the volume index - * @param num_bytes The number of bytes required to save the volume index - * - * @return UDS_SUCCESS or an error code. - **/ -int __must_check -compute_volume_index_save_bytes005(const struct configuration *config, - size_t *num_bytes); - -#endif /* VOLUMEINDEX005_H */ diff --git a/uds/volumeIndex006.h b/uds/volumeIndex006.h deleted file mode 100644 index 1b3416f5..00000000 --- a/uds/volumeIndex006.h +++ /dev/null @@ -1,55 +0,0 @@ -/* - * Copyright Red Hat - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/uds-releases/krusty/src/uds/volumeIndex006.h#3 $ - */ - -#ifndef VOLUMEINDEX006_H -#define VOLUMEINDEX006_H 1 - -#include "volumeIndexOps.h" - -/** - * Make a new volume index. 
- * - * @param config The configuration of the volume index - * @param num_zones The number of zones - * @param volume_nonce The nonce used to authenticate the index - * @param volume_index Location to hold new volume index ptr - * - * @return error code or UDS_SUCCESS - **/ -int __must_check make_volume_index006(const struct configuration *config, - unsigned int num_zones, - uint64_t volume_nonce, - struct volume_index **volume_index); - -/** - * Compute the number of bytes required to save a volume index of a given - * configuration. - * - * @param config The configuration of the volume index - * @param num_bytes The number of bytes required to save the volume index - * - * @return UDS_SUCCESS or an error code. - **/ -int __must_check -compute_volume_index_save_bytes006(const struct configuration *config, - size_t *num_bytes); - -#endif /* VOLUMEINDEX006_H */ diff --git a/uds/volumeIndexOps.c b/uds/volumeIndexOps.c deleted file mode 100644 index e6cdbb88..00000000 --- a/uds/volumeIndexOps.c +++ /dev/null @@ -1,238 +0,0 @@ -/* - * Copyright Red Hat - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. 
- * - * $Id: //eng/uds-releases/krusty/src/uds/volumeIndexOps.c#5 $ - */ -#include "volumeIndexOps.h" - -#include "compiler.h" -#include "errors.h" -#include "geometry.h" -#include "indexComponent.h" -#include "logger.h" -#include "volumeIndex005.h" -#include "volumeIndex006.h" -#include "memoryAlloc.h" -#include "permassert.h" -#include "uds.h" -#include "zone.h" - -/**********************************************************************/ -static INLINE bool uses_sparse(const struct configuration *config) -{ - return is_sparse(config->geometry); -} - -/**********************************************************************/ -void get_volume_index_combined_stats(const struct volume_index *volume_index, - struct volume_index_stats *stats) -{ - struct volume_index_stats dense, sparse; - get_volume_index_stats(volume_index, &dense, &sparse); - stats->memory_allocated = - dense.memory_allocated + sparse.memory_allocated; - stats->rebalance_time = dense.rebalance_time + sparse.rebalance_time; - stats->rebalance_count = - dense.rebalance_count + sparse.rebalance_count; - stats->record_count = dense.record_count + sparse.record_count; - stats->collision_count = - dense.collision_count + sparse.collision_count; - stats->discard_count = dense.discard_count + sparse.discard_count; - stats->overflow_count = dense.overflow_count + sparse.overflow_count; - stats->num_lists = dense.num_lists + sparse.num_lists; - stats->early_flushes = dense.early_flushes + sparse.early_flushes; -} - -/**********************************************************************/ -int make_volume_index(const struct configuration *config, - unsigned int num_zones, - uint64_t volume_nonce, - struct volume_index **volume_index) -{ - if (uses_sparse(config)) { - return make_volume_index006(config, num_zones, volume_nonce, - volume_index); - } else { - return make_volume_index005(config, num_zones, volume_nonce, - volume_index); - } -} - -/**********************************************************************/ -int compute_volume_index_save_blocks(const struct configuration *config, - size_t block_size, - uint64_t *block_count) -{ - size_t num_bytes; - int result = (uses_sparse(config) ? 
- compute_volume_index_save_bytes006(config, - &num_bytes) : - compute_volume_index_save_bytes005(config, - &num_bytes)); - if (result != UDS_SUCCESS) { - return result; - } - num_bytes += sizeof(struct delta_list_save_info); - *block_count = (num_bytes + block_size - 1) / block_size + MAX_ZONES; - return UDS_SUCCESS; -} - -/**********************************************************************/ -static int read_volume_index(struct read_portal *portal) -{ - struct volume_index *volume_index = - index_component_context(portal->component); - unsigned int num_zones = portal->zones; - struct buffered_reader *readers[MAX_ZONES]; - unsigned int z; - if (num_zones > MAX_ZONES) { - return uds_log_error_strerror(UDS_BAD_STATE, - "zone count %u must not exceed MAX_ZONES", - num_zones); - } - - for (z = 0; z < num_zones; ++z) { - int result = - get_buffered_reader_for_portal(portal, z, &readers[z]); - if (result != UDS_SUCCESS) { - return uds_log_error_strerror(result, - "cannot read component for zone %u", - z); - } - } - return restore_volume_index(readers, num_zones, volume_index); -} - -/**********************************************************************/ -static int write_volume_index(struct index_component *component, - struct buffered_writer *writer, - unsigned int zone, - enum incremental_writer_command command, - bool *completed) -{ - struct volume_index *volume_index = index_component_context(component); - bool is_complete = false; - - int result = UDS_SUCCESS; - - switch (command) { - case IWC_START: - result = start_saving_volume_index(volume_index, zone, writer); - is_complete = result != UDS_SUCCESS; - break; - case IWC_CONTINUE: - is_complete = is_saving_volume_index_done(volume_index, zone); - break; - case IWC_FINISH: - result = finish_saving_volume_index(volume_index, zone); - if (result == UDS_SUCCESS) { - result = write_guard_delta_list(writer); - } - is_complete = true; - break; - case IWC_ABORT: - result = abort_saving_volume_index(volume_index, zone); - is_complete = true; - break; - default: - result = uds_log_warning_strerror(UDS_INVALID_ARGUMENT, - "Invalid writer command"); - break; - } - if (completed != NULL) { - *completed = is_complete; - } - return result; -} - -/**********************************************************************/ - -static const struct index_component_info VOLUME_INDEX_INFO_DATA = { - .kind = RL_KIND_VOLUME_INDEX, - .name = "volume index", - .save_only = false, - .chapter_sync = false, - .multi_zone = true, - .io_storage = true, - .loader = read_volume_index, - .saver = NULL, - .incremental = write_volume_index, -}; -const struct index_component_info *const VOLUME_INDEX_INFO = - &VOLUME_INDEX_INFO_DATA; - -/**********************************************************************/ -static int restore_volume_index_body(struct buffered_reader **buffered_readers, - unsigned int num_readers, - struct volume_index *volume_index, - byte dl_data[DELTA_LIST_MAX_BYTE_COUNT]) -{ - unsigned int z; - // Start by reading the "header" section of the stream - int result = start_restoring_volume_index(volume_index, - buffered_readers, - num_readers); - if (result != UDS_SUCCESS) { - return result; - } - // Loop to read the delta lists, stopping when they have all been - // processed. 
- for (z = 0; z < num_readers; z++) { - for (;;) { - struct delta_list_save_info dlsi; - result = read_saved_delta_list(&dlsi, dl_data, - buffered_readers[z]); - if (result == UDS_END_OF_FILE) { - break; - } else if (result != UDS_SUCCESS) { - abort_restoring_volume_index(volume_index); - return result; - } - result = restore_delta_list_to_volume_index(volume_index, - &dlsi, - dl_data); - if (result != UDS_SUCCESS) { - abort_restoring_volume_index(volume_index); - return result; - } - } - } - if (!is_restoring_volume_index_done(volume_index)) { - abort_restoring_volume_index(volume_index); - return uds_log_warning_strerror(UDS_CORRUPT_COMPONENT, - "incomplete delta list data"); - } - return UDS_SUCCESS; -} - -/**********************************************************************/ -int restore_volume_index(struct buffered_reader **buffered_readers, - unsigned int num_readers, - struct volume_index *volume_index) -{ - byte *dl_data; - int result = - UDS_ALLOCATE(DELTA_LIST_MAX_BYTE_COUNT, byte, __func__, &dl_data); - if (result != UDS_SUCCESS) { - return result; - } - result = restore_volume_index_body(buffered_readers, num_readers, - volume_index, dl_data); - UDS_FREE(dl_data); - return result; -} diff --git a/uds/zone.c b/uds/zone.c deleted file mode 100644 index dcf76fbd..00000000 --- a/uds/zone.c +++ /dev/null @@ -1,45 +0,0 @@ -/* - * Copyright Red Hat - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/uds-releases/krusty/src/uds/zone.c#8 $ - */ - -#include "zone.h" - -#include "logger.h" -#include "uds-threads.h" - -/**********************************************************************/ -unsigned int get_zone_count(const struct uds_parameters *user_params) -{ - unsigned int zone_count = - (user_params == NULL) ? 0 : user_params->zone_count; - if (zone_count == 0) { - zone_count = uds_get_num_cores() / 2; - } - if (zone_count < 1) { - zone_count = 1; - } - if (zone_count > MAX_ZONES) { - zone_count = MAX_ZONES; - } - uds_log_info("Using %u indexing zone%s for concurrency.", - zone_count, - zone_count == 1 ? "" : "s"); - return zone_count; -} diff --git a/uds/zone.h b/uds/zone.h deleted file mode 100644 index 028801c1..00000000 --- a/uds/zone.h +++ /dev/null @@ -1,43 +0,0 @@ -/* - * Copyright Red Hat - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. 
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
- * 02110-1301, USA.
- *
- * $Id: //eng/uds-releases/krusty/src/uds/zone.h#5 $
- */
-
-#ifndef ZONE_H
-#define ZONE_H
-
-#include "compiler.h"
-#include "uds.h"
-
-enum {
-	MAX_ZONES = 16,
-};
-
-/**
- * Return the number of zones.
- *
- * @param user_params the index session parameters. If NULL, the default
- *                    session parameters will be used.
- *
- * @return the number of zones
- **/
-unsigned int __must_check
-get_zone_count(const struct uds_parameters *user_params);
-
-#endif /* ZONE_H */
diff --git a/vdo/Makefile b/vdo/Makefile
index 5ea68cd2..3b4f7098 100644
--- a/vdo/Makefile
+++ b/vdo/Makefile
@@ -1,30 +1,16 @@
-VDO_VERSION = 8.1.1.371
-
-VDO_VERSION_MAJOR = $(word 1,$(subst ., ,$(VDO_VERSION)))
-VDO_VERSION_MINOR = $(word 2,$(subst ., ,$(VDO_VERSION)))
-VDO_VERSION_MICRO = $(word 3,$(subst ., ,$(VDO_VERSION)))
+VDO_VERSION = 8.2.0.2
 
 SOURCES = $(notdir $(wildcard $(src)/*.c))
 OBJECTS = $(SOURCES:%.c=%.o)
-INCLUDES = -I$(src)/base -I$(src)/kernel -I$(src)/../uds
+INCLUDES = -I$(src)
 
 EXTRA_CFLAGS =	-std=gnu99		\
		-fno-builtin-memset	\
		-Werror			\
		$(if $(CONFIG_KASAN),,-Wframe-larger-than=400)	\
-		-Wno-declaration-after-statement	\
-		-DVDO_VERSION_MAJOR=$(VDO_VERSION_MAJOR)	\
-		-DVDO_VERSION_MINOR=$(VDO_VERSION_MINOR)	\
-		-DVDO_VERSION_MICRO=$(VDO_VERSION_MICRO)	\
		-DCURRENT_VERSION=\"$(VDO_VERSION)\"	\
		$(INCLUDES)
 
-CFLAGS_REMOVE_vdoPageCache.o= -std=gnu99
-CFLAGS_REMOVE_vio.o= -std=gnu99
-
-CFLAGS_vdoPageCache.o= -std=gnu89
-CFLAGS_vio.o= -std=gnu89
-
 obj-m += kvdo.o
 kvdo-objs = $(OBJECTS)
diff --git a/vdo/action-manager.c b/vdo/action-manager.c
new file mode 100644
index 00000000..6773881e
--- /dev/null
+++ b/vdo/action-manager.c
@@ -0,0 +1,437 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright Red Hat
+ */
+
+#include "action-manager.h"
+
+#include "memory-alloc.h"
+#include "permassert.h"
+
+#include "admin-state.h"
+#include "completion.h"
+#include "kernel-types.h"
+#include "status-codes.h"
+#include "types.h"
+#include "vdo.h"
+
+/**
+ * struct action - An action to be performed in each of a set of zones.
+ * @in_use: Whether this structure is in use.
+ * @operation: The admin operation associated with this action.
+ * @preamble: The method to run on the initiator thread before the action is
+ *            applied to each zone.
+ * @zone_action: The action to be performed in each zone.
+ * @conclusion: The method to run on the initiator thread after the action
+ *              has been applied to each zone.
+ * @parent: The object to notify when the action is complete.
+ * @context: The action specific context.
+ * @next: The action to perform after this one.
+ */
+struct action {
+	bool in_use;
+	const struct admin_state_code *operation;
+	vdo_action_preamble *preamble;
+	vdo_zone_action *zone_action;
+	vdo_action_conclusion *conclusion;
+	struct vdo_completion *parent;
+	void *context;
+	struct action *next;
+};
+
+/**
+ * struct action_manager - Definition of an action manager.
+ * @completion: The completion for performing actions.
+ * @state: The state of this action manager.
+ * @actions: The two action slots.
+ * @current_action: The current action slot.
+ * @zones: The number of zones in which an action is to be applied.
+ * @scheduler: A function to schedule a default next action.
+ * @get_zone_thread_id: A function to get the id of the thread on which
+ *            to apply an action to a zone.
+ * @initiator_thread_id: The ID of the thread on which actions may be initiated. + * @context: Opaque data associated with this action manager. + * @acting_zone: The zone currently being acted upon. + */ +struct action_manager { + struct vdo_completion completion; + struct admin_state state; + struct action actions[2]; + struct action *current_action; + zone_count_t zones; + vdo_action_scheduler *scheduler; + vdo_zone_thread_getter *get_zone_thread_id; + thread_id_t initiator_thread_id; + void *context; + zone_count_t acting_zone; +}; + +static inline struct action_manager * +as_action_manager(struct vdo_completion *completion) +{ + vdo_assert_completion_type(completion->type, VDO_ACTION_COMPLETION); + return container_of(completion, struct action_manager, completion); +} + +/* + * Implements vdo_action_scheduler. + */ +static bool no_default_action(void *context __always_unused) +{ + return false; +} + +/* + * Implements vdo_action_preamble. + */ +static void no_preamble(void *context __always_unused, + struct vdo_completion *completion) +{ + vdo_complete_completion(completion); +} + +/* + * Implements vdo_action_conclusion. + */ +static int no_conclusion(void *context __always_unused) +{ + return VDO_SUCCESS; +} + +/** + * vdo_make_action_manager() - Make an action manager. + * @zones: The number of zones to which actions will be applied. + * @get_zone_thread_id: A function to get the thread id associated with a zone. + * @initiator_thread_id: The thread on which actions may initiated. + * @context: The object which holds the per-zone context for the action. + * @scheduler: A function to schedule a next action after an action concludes + * if there is no pending action (may be NULL). + * @vdo: The vdo used to initialize completions. + * @manager_ptr: A pointer to hold the new action manager. + * + * Return: VDO_SUCCESS or an error code. + */ +int vdo_make_action_manager(zone_count_t zones, + vdo_zone_thread_getter *get_zone_thread_id, + thread_id_t initiator_thread_id, + void *context, + vdo_action_scheduler *scheduler, + struct vdo *vdo, + struct action_manager **manager_ptr) +{ + struct action_manager *manager; + int result = UDS_ALLOCATE(1, struct action_manager, __func__, &manager); + + if (result != VDO_SUCCESS) { + return result; + } + + *manager = (struct action_manager) { + .zones = zones, + .scheduler = + ((scheduler == NULL) ? no_default_action : scheduler), + .get_zone_thread_id = get_zone_thread_id, + .initiator_thread_id = initiator_thread_id, + .context = context, + }; + + manager->actions[0].next = &manager->actions[1]; + manager->current_action = manager->actions[1].next = + &manager->actions[0]; + vdo_set_admin_state_code(&manager->state, + VDO_ADMIN_STATE_NORMAL_OPERATION); + vdo_initialize_completion(&manager->completion, vdo, + VDO_ACTION_COMPLETION); + *manager_ptr = manager; + return VDO_SUCCESS; +} + +const struct admin_state_code * +vdo_get_current_manager_operation(struct action_manager *manager) +{ + return vdo_get_admin_state_code(&manager->state); +} + +void *vdo_get_current_action_context(struct action_manager *manager) +{ + return (manager->current_action->in_use ? 
+ manager->current_action->context : + NULL); +} + +static void finish_action_callback(struct vdo_completion *completion); +static void apply_to_zone(struct vdo_completion *completion); + +static thread_id_t get_acting_zone_thread_id(struct action_manager *manager) +{ + return manager->get_zone_thread_id(manager->context, + manager->acting_zone); +} + +static void prepare_for_next_zone(struct action_manager *manager) +{ + vdo_prepare_completion_for_requeue(&manager->completion, + apply_to_zone, + vdo_preserve_completion_error_and_continue, + get_acting_zone_thread_id(manager), + manager->current_action->parent); +} + +static void prepare_for_conclusion(struct action_manager *manager) +{ + vdo_prepare_completion_for_requeue(&manager->completion, + finish_action_callback, + vdo_preserve_completion_error_and_continue, + manager->initiator_thread_id, + manager->current_action->parent); +} + +static void apply_to_zone(struct vdo_completion *completion) +{ + zone_count_t zone; + struct action_manager *manager = as_action_manager(completion); + + ASSERT_LOG_ONLY((vdo_get_callback_thread_id() == + get_acting_zone_thread_id(manager)), + "apply_to_zone() called on acting zones's thread"); + + zone = manager->acting_zone++; + if (manager->acting_zone == manager->zones) { + /* + * We are about to apply to the last zone. Once that is + * finished, we're done, so go back to the initiator thread and + * finish up. + */ + prepare_for_conclusion(manager); + } else { + /* Prepare to come back on the next zone */ + prepare_for_next_zone(manager); + } + + manager->current_action->zone_action(manager->context, zone, completion); +} + +static void handle_preamble_error(struct vdo_completion *completion) +{ + /* Skip the zone actions since the preamble failed. */ + completion->callback = finish_action_callback; + vdo_preserve_completion_error_and_continue(completion); +} + +static void launch_current_action(struct action_manager *manager) +{ + struct action *action = manager->current_action; + int result = vdo_start_operation(&manager->state, action->operation); + + if (result != VDO_SUCCESS) { + if (action->parent != NULL) { + vdo_set_completion_result(action->parent, result); + } + + /* + * We aren't going to run the preamble, so don't run the + * conclusion + */ + action->conclusion = no_conclusion; + finish_action_callback(&manager->completion); + return; + } + + if (action->zone_action == NULL) { + prepare_for_conclusion(manager); + } else { + manager->acting_zone = 0; + vdo_prepare_completion_for_requeue(&manager->completion, + apply_to_zone, + handle_preamble_error, + get_acting_zone_thread_id(manager), + manager->current_action->parent); + } + + action->preamble(manager->context, &manager->completion); +} + +/** + * vdo_schedule_default_action() - Attempt to schedule the default action. + * @manager: The action manager. + * + * If the manager is not operating normally, the action will not be scheduled. + * + * Return: true if an action was scheduled. + */ +bool vdo_schedule_default_action(struct action_manager *manager) +{ + /* + * Don't schedule a default action if we are operating or not in normal + * operation. 
+ */ + const struct admin_state_code *code + = vdo_get_current_manager_operation(manager); + return ((code == VDO_ADMIN_STATE_NORMAL_OPERATION) + && manager->scheduler(manager->context)); +} + +static void finish_action_callback(struct vdo_completion *completion) +{ + bool has_next_action; + int result; + struct action_manager *manager = as_action_manager(completion); + struct action action = *(manager->current_action); + + manager->current_action->in_use = false; + manager->current_action = manager->current_action->next; + + /* + * We need to check this now to avoid use-after-free issues if running + * the conclusion or notifying the parent results in the manager being + * freed. + */ + has_next_action = (manager->current_action->in_use + || vdo_schedule_default_action(manager)); + result = action.conclusion(manager->context); + vdo_finish_operation(&manager->state, VDO_SUCCESS); + if (action.parent != NULL) { + vdo_finish_completion(action.parent, result); + } + + if (has_next_action) { + launch_current_action(manager); + } +} + +/** + * vdo_schedule_action() - Schedule an action to be applied to all zones. + * @manager: The action manager to schedule the action on. + * @preamble: A method to be invoked on the initiator thread once this + * action is started but before applying to each zone; may be NULL. + * @action: The action to apply to each zone; may be NULL. + * @conclusion: A method to be invoked back on the initiator thread once + * the action has been applied to all zones; may be NULL. + * @parent: The object to notify once the action is complete or if + * the action can not be scheduled; may be NULL. + * + * The action will be launched immediately if there is no current + * action, or as soon as the current action completes. If there is + * already a pending action, this action will not be scheduled, and, + * if it has a parent, that parent will be notified. At least one of + * the preamble, action, or conclusion must not be NULL. + * + * Return: true if the action was scheduled. + */ +bool vdo_schedule_action(struct action_manager *manager, + vdo_action_preamble *preamble, + vdo_zone_action *action, + vdo_action_conclusion *conclusion, + struct vdo_completion *parent) +{ + return vdo_schedule_operation(manager, + VDO_ADMIN_STATE_OPERATING, + preamble, + action, + conclusion, + parent); +} + +/** + * vdo_schedule_operation() - Schedule an operation to be applied to all zones. + * @manager: The action manager to schedule the action on. + * @operation: The operation this action will perform + * @preamble: A method to be invoked on the initiator thread once this action + * is started but before applying to each zone; may be NULL. + * @action: The action to apply to each zone; may be NULL. + * @conclusion: A method to be invoked back on the initiator thread once the + * action has been applied to all zones; may be NULL. + * @parent: The object to notify once the action is complete or if the action + * can not be scheduled; may be NULL. + * + * The operation's action will be launched immediately if there is no + * current action, or as soon as the current action completes. If + * there is already a pending action, this operation will not be + * scheduled, and, if it has a parent, that parent will be notified. + * At least one of the preamble, action, or conclusion must not be + * NULL. + * + * Return: true if the action was scheduled. 
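+ *
+ * A minimal usage sketch (illustrative only, not part of this interface;
+ * suspend_zone() and check_suspend() are hypothetical caller-supplied
+ * callbacks):
+ *
+ *	static void suspend_zone(void *context, zone_count_t zone_number,
+ *				 struct vdo_completion *parent)
+ *	{
+ *		// Do the per-zone work, then notify the parent so the
+ *		// manager advances to the next zone (or the conclusion).
+ *		vdo_finish_completion(parent, VDO_SUCCESS);
+ *	}
+ *
+ *	static int check_suspend(void *context)
+ *	{
+ *		return VDO_SUCCESS;
+ *	}
+ *
+ *	vdo_schedule_operation(manager, VDO_ADMIN_STATE_SUSPENDING,
+ *			       NULL, suspend_zone, check_suspend, parent);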
+ */ +bool vdo_schedule_operation(struct action_manager *manager, + const struct admin_state_code *operation, + vdo_action_preamble *preamble, + vdo_zone_action *action, + vdo_action_conclusion *conclusion, + struct vdo_completion *parent) +{ + return vdo_schedule_operation_with_context(manager, + operation, + preamble, + action, + conclusion, + NULL, + parent); +} + +/** + * vdo_schedule_operation_with_context() - Schedule an operation on all zones. + * @manager: The action manager to schedule the action on. + * @operation: The operation this action will perform. + * @preamble: A method to be invoked on the initiator thread once this action + * is started but before applying to each zone; may be NULL. + * @action: The action to apply to each zone; may be NULL. + * @conclusion: A method to be invoked back on the initiator thread once the + * action has been applied to all zones; may be NULL. + * @context: An action-specific context which may be retrieved via + * vdo_get_current_action_context(); may be NULL. + * @parent: The object to notify once the action is complete or if the action + * can not be scheduled; may be NULL. + * + * The operation's action will be launched immediately if there is no + * current action, or as soon as the current action completes. If + * there is already a pending action, this operation will not be + * scheduled, and, if it has a parent, that parent will be notified. + * At least one of the preamble, action, or conclusion must not be + * NULL. + * + * Return: true if the action was scheduled + */ +bool +vdo_schedule_operation_with_context(struct action_manager *manager, + const struct admin_state_code *operation, + vdo_action_preamble *preamble, + vdo_zone_action *action, + vdo_action_conclusion *conclusion, + void *context, + struct vdo_completion *parent) +{ + struct action *current_action; + + ASSERT_LOG_ONLY((vdo_get_callback_thread_id() == + manager->initiator_thread_id), + "action initiated from correct thread"); + if (!manager->current_action->in_use) { + current_action = manager->current_action; + } else if (!manager->current_action->next->in_use) { + current_action = manager->current_action->next; + } else { + if (parent != NULL) { + vdo_finish_completion(parent, VDO_COMPONENT_BUSY); + } + + return false; + } + + *current_action = (struct action) { + .in_use = true, + .operation = operation, + .preamble = (preamble == NULL) ? no_preamble : preamble, + .zone_action = action, + .conclusion = (conclusion == NULL) ? no_conclusion : conclusion, + .context = context, + .parent = parent, + .next = current_action->next, + }; + + if (current_action == manager->current_action) { + launch_current_action(manager); + } + + return true; +} diff --git a/vdo/action-manager.h b/vdo/action-manager.h new file mode 100644 index 00000000..d89685e1 --- /dev/null +++ b/vdo/action-manager.h @@ -0,0 +1,118 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright Red Hat + */ + +#ifndef ACTION_MANAGER_H +#define ACTION_MANAGER_H + +#include "admin-state.h" +#include "completion.h" +#include "types.h" + +/* + * An action_manager provides a generic mechanism for applying actions to + * multi-zone entities (such as the block map or slab depot). Each action + * manager is tied to a specific context for which it manages actions. The + * manager ensures that only one action is active on that context at a time, + * and supports at most one pending action. Calls to schedule an action when + * there is already a pending action will result in VDO_COMPONENT_BUSY errors. 
+ * Actions may only be submitted to the action manager from a single thread
+ * (which thread is determined when the action manager is constructed).
+ *
+ * A scheduled action consists of four components:
+ *
+ * preamble
+ *      an optional method to be run on the initiator thread before applying
+ *      the action to all zones
+ * zone_action
+ *      an optional method to be applied to each of the zones
+ * conclusion
+ *      an optional method to be run on the initiator thread once the per-zone
+ *      method has been applied to all zones
+ * parent
+ *      an optional completion to be finished once the conclusion is done
+ *
+ * At least one of the three methods must be provided.
+ */
+
+/*
+ * A function which is to be applied asynchronously to a set of zones.
+ * @context: The object which holds the per-zone context for the action
+ * @zone_number: The number of the zone to which the action is being applied
+ * @parent: The object to notify when the action is complete
+ */
+typedef void vdo_zone_action(void *context,
+			     zone_count_t zone_number,
+			     struct vdo_completion *parent);
+
+/*
+ * A function which is to be applied asynchronously on an action manager's
+ * initiator thread as the preamble of an action.
+ * @context: The object which holds the per-zone context for the action
+ * @parent: The object to notify when the action is complete
+ */
+typedef void vdo_action_preamble(void *context, struct vdo_completion *parent);
+
+/*
+ * A function which will run on the action manager's initiator thread as the
+ * conclusion of an action.
+ * @context: The object which holds the per-zone context for the action
+ *
+ * Return: VDO_SUCCESS or an error
+ */
+typedef int vdo_action_conclusion(void *context);
+
+/*
+ * A function to schedule an action.
+ * @context: The object which holds the per-zone context for the action
+ *
+ * Return: true if an action was scheduled
+ */
+typedef bool vdo_action_scheduler(void *context);
+
+/*
+ * A function to get the id of the thread associated with a given zone.
+ * @context: The action context + * @zone_number: The number of the zone for which the thread ID is desired + */ +typedef thread_id_t vdo_zone_thread_getter(void *context, zone_count_t zone_number); + +int __must_check +vdo_make_action_manager(zone_count_t zones, + vdo_zone_thread_getter *get_zone_thread_id, + thread_id_t initiator_thread_id, + void *context, + vdo_action_scheduler *scheduler, + struct vdo *vdo, + struct action_manager **manager_ptr); + +const struct admin_state_code *__must_check +vdo_get_current_manager_operation(struct action_manager *manager); + +void * __must_check vdo_get_current_action_context(struct action_manager *manager); + +bool vdo_schedule_default_action(struct action_manager *manager); + +bool vdo_schedule_action(struct action_manager *manager, + vdo_action_preamble *preamble, + vdo_zone_action *action, + vdo_action_conclusion *conclusion, + struct vdo_completion *parent); + +bool vdo_schedule_operation(struct action_manager *manager, + const struct admin_state_code *operation, + vdo_action_preamble *preamble, + vdo_zone_action *action, + vdo_action_conclusion *conclusion, + struct vdo_completion *parent); + +bool vdo_schedule_operation_with_context(struct action_manager *manager, + const struct admin_state_code *operation, + vdo_action_preamble *preamble, + vdo_zone_action *action, + vdo_action_conclusion *conclusion, + void *context, + struct vdo_completion *parent); + +#endif /* ACTION_MANAGER_H */ diff --git a/vdo/actionManager.c b/vdo/actionManager.c deleted file mode 100644 index 3cd90c7f..00000000 --- a/vdo/actionManager.c +++ /dev/null @@ -1,418 +0,0 @@ -/* - * Copyright Red Hat - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/sulfur/src/c++/vdo/base/actionManager.c#19 $ - */ - -#include "actionManager.h" - -#include "memoryAlloc.h" -#include "permassert.h" - -#include "adminState.h" -#include "completion.h" -#include "statusCodes.h" -#include "types.h" -#include "vdo.h" - -/** An action to be performed in each of a set of zones */ -struct action { - /** Whether this structure is in use */ - bool in_use; - /** The admin operation associated with this action */ - const struct admin_state_code *operation; - /** - * The method to run on the initiator thread before the action is - * applied to each zone. 
- **/ - vdo_action_preamble *preamble; - /** The action to be performed in each zone */ - vdo_zone_action *zone_action; - /** - * The method to run on the initiator thread after the action has been - * applied to each zone - **/ - vdo_action_conclusion *conclusion; - /** The object to notify when the action is complete */ - struct vdo_completion *parent; - /** The action specific context */ - void *context; - /** The action to perform after this one */ - struct action *next; -}; - -struct action_manager { - /** The completion for performing actions */ - struct vdo_completion completion; - /** The state of this action manager */ - struct admin_state state; - /** The two action slots */ - struct action actions[2]; - /** The current action slot */ - struct action *current_action; - /** The number of zones in which an action is to be applied */ - zone_count_t zones; - /** A function to schedule a default next action */ - vdo_action_scheduler *scheduler; - /** - * A function to get the id of the thread on which to apply an action - * to a zone - **/ - vdo_zone_thread_getter *get_zone_thread_id; - /** The ID of the thread on which actions may be initiated */ - thread_id_t initiator_thread_id; - /** Opaque data associated with this action manager */ - void *context; - /** The zone currently being acted upon */ - zone_count_t acting_zone; -}; - -/** - * Convert a generic vdo_completion to a action_manager. - * - * @param completion The completion to convert - * - * @return The completion as an action_manager - **/ -static inline struct action_manager * -as_action_manager(struct vdo_completion *completion) -{ - assert_vdo_completion_type(completion->type, VDO_ACTION_COMPLETION); - return container_of(completion, struct action_manager, completion); -} - -/** - * An action scheduler which does not schedule an action. - * - *

- * Implements vdo_action_scheduler.
- **/
-static bool no_default_action(void *context __always_unused)
-{
-	return false;
-}
-
-/**
- * A default preamble which does nothing.
- *
- *

- * Implements vdo_action_preamble
- **/
-static void no_preamble(void *context __always_unused,
-			struct vdo_completion *completion)
-{
-	complete_vdo_completion(completion);
-}
-
-/**
- * A default conclusion which does nothing.
- *
- *

Implements vdo_action_conclusion. - **/ -static int no_conclusion(void *context __always_unused) -{ - return VDO_SUCCESS; -} - -/**********************************************************************/ -int make_vdo_action_manager(zone_count_t zones, - vdo_zone_thread_getter *get_zone_thread_id, - thread_id_t initiator_thread_id, - void *context, - vdo_action_scheduler *scheduler, - struct vdo *vdo, - struct action_manager **manager_ptr) -{ - struct action_manager *manager; - int result = UDS_ALLOCATE(1, struct action_manager, __func__, &manager); - if (result != VDO_SUCCESS) { - return result; - } - - *manager = (struct action_manager) { - .zones = zones, - .scheduler = - ((scheduler == NULL) ? no_default_action : scheduler), - .get_zone_thread_id = get_zone_thread_id, - .initiator_thread_id = initiator_thread_id, - .context = context, - }; - - manager->actions[0].next = &manager->actions[1]; - manager->current_action = manager->actions[1].next = - &manager->actions[0]; - set_vdo_admin_state_code(&manager->state, - VDO_ADMIN_STATE_NORMAL_OPERATION); - initialize_vdo_completion(&manager->completion, vdo, - VDO_ACTION_COMPLETION); - *manager_ptr = manager; - return VDO_SUCCESS; -} - -/**********************************************************************/ -const struct admin_state_code * -get_current_vdo_manager_operation(struct action_manager *manager) -{ - return get_vdo_admin_state_code(&manager->state); -} - -/**********************************************************************/ -void *get_current_vdo_action_context(struct action_manager *manager) -{ - return (manager->current_action->in_use ? - manager->current_action->context : - NULL); -} - -/**********************************************************************/ -static void finish_action_callback(struct vdo_completion *completion); -static void apply_to_zone(struct vdo_completion *completion); - -/** - * Get the thread ID for the current zone. - * - * @param manager The action manager - * - * @return The ID of the thread on which to run actions for the current zone - **/ -static thread_id_t get_acting_zone_thread_id(struct action_manager *manager) -{ - return manager->get_zone_thread_id(manager->context, - manager->acting_zone); -} - -/** - * Prepare the manager's completion to run on the next zone. - * - * @param manager The action manager - **/ -static void prepare_for_next_zone(struct action_manager *manager) -{ - prepare_vdo_completion_for_requeue(&manager->completion, - apply_to_zone, - preserve_vdo_completion_error_and_continue, - get_acting_zone_thread_id(manager), - manager->current_action->parent); -} - -/** - * Prepare the manager's completion to run the conclusion on the initiator - * thread. - * - * @param manager The action manager - **/ -static void prepare_for_conclusion(struct action_manager *manager) -{ - prepare_vdo_completion_for_requeue(&manager->completion, - finish_action_callback, - preserve_vdo_completion_error_and_continue, - manager->initiator_thread_id, - manager->current_action->parent); -} - -/** - * Perform an action on the next zone if there is one. 
- * - * @param completion The action completion - **/ -static void apply_to_zone(struct vdo_completion *completion) -{ - zone_count_t zone; - struct action_manager *manager = as_action_manager(completion); - ASSERT_LOG_ONLY((vdo_get_callback_thread_id() == - get_acting_zone_thread_id(manager)), - "apply_to_zone() called on acting zones's thread"); - - zone = manager->acting_zone++; - if (manager->acting_zone == manager->zones) { - // We are about to apply to the last zone. Once that is - // finished, we're done, so go back to the initiator thread and - // finish up. - prepare_for_conclusion(manager); - } else { - // Prepare to come back on the next zone - prepare_for_next_zone(manager); - } - - manager->current_action->zone_action(manager->context, zone, completion); -} - -/** - * The error handler for preamble errors. - * - * @param completion The manager completion - **/ -static void handle_preamble_error(struct vdo_completion *completion) -{ - // Skip the zone actions since the preamble failed. - completion->callback = finish_action_callback; - preserve_vdo_completion_error_and_continue(completion); -} - -/** - * Launch the current action. - * - * @param manager The action manager - **/ -static void launch_current_action(struct action_manager *manager) -{ - struct action *action = manager->current_action; - int result = start_vdo_operation(&manager->state, action->operation); - if (result != VDO_SUCCESS) { - if (action->parent != NULL) { - set_vdo_completion_result(action->parent, result); - } - - // We aren't going to run the preamble, so don't run the - // conclusion - action->conclusion = no_conclusion; - finish_action_callback(&manager->completion); - return; - } - - if (action->zone_action == NULL) { - prepare_for_conclusion(manager); - } else { - manager->acting_zone = 0; - prepare_vdo_completion_for_requeue(&manager->completion, - apply_to_zone, - handle_preamble_error, - get_acting_zone_thread_id(manager), - manager->current_action->parent); - } - - action->preamble(manager->context, &manager->completion); -} - -/**********************************************************************/ -bool schedule_vdo_default_action(struct action_manager *manager) -{ - // Don't schedule a default action if we are operating or not in normal - // operation. - const struct admin_state_code *code - = get_current_vdo_manager_operation(manager); - return ((code == VDO_ADMIN_STATE_NORMAL_OPERATION) - && manager->scheduler(manager->context)); -} - -/** - * Finish an action now that it has been applied to all zones. This - * callback is registered in apply_to_zone(). - * - * @param completion The action manager completion - **/ -static void finish_action_callback(struct vdo_completion *completion) -{ - bool has_next_action; - int result; - struct action_manager *manager = as_action_manager(completion); - struct action action = *(manager->current_action); - manager->current_action->in_use = false; - manager->current_action = manager->current_action->next; - - /* - * We need to check this now to avoid use-after-free issues if running - * the conclusion or notifying the parent results in the manager being - * freed. 
- */ - has_next_action = (manager->current_action->in_use - || schedule_vdo_default_action(manager)); - result = action.conclusion(manager->context); - finish_vdo_operation(&manager->state, VDO_SUCCESS); - if (action.parent != NULL) { - finish_vdo_completion(action.parent, result); - } - - if (has_next_action) { - launch_current_action(manager); - } -} - -/**********************************************************************/ -bool schedule_vdo_action(struct action_manager *manager, - vdo_action_preamble *preamble, - vdo_zone_action *action, - vdo_action_conclusion *conclusion, - struct vdo_completion *parent) -{ - return schedule_vdo_operation(manager, - VDO_ADMIN_STATE_OPERATING, - preamble, - action, - conclusion, - parent); -} - -/**********************************************************************/ -bool schedule_vdo_operation(struct action_manager *manager, - const struct admin_state_code *operation, - vdo_action_preamble *preamble, - vdo_zone_action *action, - vdo_action_conclusion *conclusion, - struct vdo_completion *parent) -{ - return schedule_vdo_operation_with_context(manager, - operation, - preamble, - action, - conclusion, - NULL, - parent); -} - -/**********************************************************************/ -bool -schedule_vdo_operation_with_context(struct action_manager *manager, - const struct admin_state_code *operation, - vdo_action_preamble *preamble, - vdo_zone_action *action, - vdo_action_conclusion *conclusion, - void *context, - struct vdo_completion *parent) -{ - struct action *current_action; - ASSERT_LOG_ONLY((vdo_get_callback_thread_id() == - manager->initiator_thread_id), - "action initiated from correct thread"); - if (!manager->current_action->in_use) { - current_action = manager->current_action; - } else if (!manager->current_action->next->in_use) { - current_action = manager->current_action->next; - } else { - if (parent != NULL) { - finish_vdo_completion(parent, VDO_COMPONENT_BUSY); - } - - return false; - } - - *current_action = (struct action) { - .in_use = true, - .operation = operation, - .preamble = (preamble == NULL) ? no_preamble : preamble, - .zone_action = action, - .conclusion = (conclusion == NULL) ? no_conclusion : conclusion, - .context = context, - .parent = parent, - .next = current_action->next, - }; - - if (current_action == manager->current_action) { - launch_current_action(manager); - } - - return true; -} diff --git a/vdo/actionManager.h b/vdo/actionManager.h deleted file mode 100644 index dc42d37d..00000000 --- a/vdo/actionManager.h +++ /dev/null @@ -1,241 +0,0 @@ -/* - * Copyright Red Hat - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. 
- * - * $Id: //eng/vdo-releases/sulfur/src/c++/vdo/base/actionManager.h#9 $ - */ - -#ifndef ACTION_MANAGER_H -#define ACTION_MANAGER_H - -#include "adminState.h" -#include "completion.h" -#include "types.h" - -/** - * An action_manager provides a generic mechanism for applying actions to - * multi-zone entities (such as the block map or slab depot). Each action - * manager is tied to a specific context for which it manages actions. The - * manager ensures that only one action is active on that context at a time, - * and supports at most one pending action. Calls to schedule an action when - * there is already a pending action will result in VDO_COMPONENT_BUSY errors. - * Actions may only be submitted to the action manager from a single thread - * (which thread is determined when the action manager is constructed). - * - * A scheduled action consists of four components: - * preamble: an optional method to be run on the initator thread before - * applying the action to all zones - * zone_action: an optional method to be applied to each of the zones - * conclusion: an optional method to be run on the initiator thread once the - * per-zone method has been applied to all zones - * parent: an optional completion to be finished once the conclusion - * is done - * - * At least one of the three methods must be provided. - **/ - -/** - * A function which is to be applied asynchronously to a set of zones. - * - * @param context The object which holds the per-zone context for the - * action - * @param zone_number The number of zone to which the action is being applied - * @param parent The object to notify when the action is complete - **/ -typedef void vdo_zone_action(void *context, - zone_count_t zone_number, - struct vdo_completion *parent); - -/** - * A function which is to be applied asynchronously on an action manager's - * initiator thread as the preamble of an action. - * - * @param context The object which holds the per-zone context for the action - * @param parent The object to notify when the action is complete - **/ -typedef void vdo_action_preamble(void *context, struct vdo_completion *parent); - -/** - * A function which will run on the action manager's initiator thread as the - * conclusion of an action. - * - * @param context The object which holds the per-zone context for the action - * - * @return VDO_SUCCESS or an error - **/ -typedef int vdo_action_conclusion(void *context); - -/** - * A function to schedule an action. - * - * @param context The object which holds the per-zone context for the action - * - * @return true if an action was scheduled - **/ -typedef bool vdo_action_scheduler(void *context); - -/** - * Get the id of the thread associated with a given zone. - * - * @param context The action context - * @param zone_number The number of the zone for which the thread ID is desired - **/ -typedef thread_id_t vdo_zone_thread_getter(void *context, zone_count_t zone_number); - -/** - * Make an action manager. 
- * - * @param [in] zones The number of zones to which actions will - * be applied - * @param [in] get_zone_thread_id A function to get the thread id associated - * with a zone - * @param [in] initiator_thread_id The thread on which actions may initiated - * @param [in] context The object which holds the per-zone context - * for the action - * @param [in] scheduler A function to schedule a next action after - * an action concludes if there is no pending - * action (may be NULL) - * @param [in] vdo The vdo used to initialize completions - * @param [out] manager_ptr A pointer to hold the new action manager - * - * @return VDO_SUCCESS or an error code - **/ -int __must_check -make_vdo_action_manager(zone_count_t zones, - vdo_zone_thread_getter *get_zone_thread_id, - thread_id_t initiator_thread_id, - void *context, - vdo_action_scheduler *scheduler, - struct vdo *vdo, - struct action_manager **manager_ptr); - -/** - * Get the current operation an action manager is performing. - * - * @param manager The manager to query - * - * @return The manager's current operation - **/ -const struct admin_state_code *__must_check -get_current_vdo_manager_operation(struct action_manager *manager); - -/** - * Get the action-specific context for the operation an action manager is - * currently performing. - * - * @param manager The manager to query - * - * @return The action-specific context for the manager's current action or - * NULL if there is no context or no current action - **/ -void * __must_check get_current_vdo_action_context(struct action_manager *manager); - -/** - * Attempt to schedule the default action. If the manager is not operating - * normally, the action will not be scheduled. - * - * @param manager The action manager - * - * @return true if an action was scheduled. - **/ -bool schedule_vdo_default_action(struct action_manager *manager); - -/** - * Schedule an action to be applied to all zones. The action will be launched - * immediately if there is no current action, or as soon as the current action - * completes. If there is already a pending action, this action will not be - * scheduled, and, if it has a parent, that parent will be notified. At least - * one of the preamble, action, or conclusion must not be NULL. - * - * @param manager The action manager to schedule the action on - * @param preamble A method to be invoked on the initiator thread once this - * action is started but before applying to each zone; may - * be NULL - * @param action The action to apply to each zone; may be NULL - * @param conclusion A method to be invoked back on the initiator thread once - * the action has been applied to all zones; may be NULL - * @param parent The object to notify once the action is complete or if - * the action can not be scheduled; may be NULL - * - * @return true if the action was scheduled - **/ -bool schedule_vdo_action(struct action_manager *manager, - vdo_action_preamble *preamble, - vdo_zone_action *action, - vdo_action_conclusion *conclusion, - struct vdo_completion *parent); - -/** - * Schedule an operation to be applied to all zones. The operation's action - * will be launched immediately if there is no current action, or as soon as - * the current action completes. If there is already a pending action, this - * operation will not be scheduled, and, if it has a parent, that parent will - * be notified. At least one of the preamble, action, or conclusion must not - * be NULL. 
- * - * @param manager The action manager to schedule the action on - * @param operation The operation this action will perform - * @param preamble A method to be invoked on the initiator thread once this - * action is started but before applying to each zone; may - * be NULL - * @param action The action to apply to each zone; may be NULL - * @param conclusion A method to be invoked back on the initiator thread once - * the action has been applied to all zones; may be NULL - * @param parent The object to notify once the action is complete or if - * the action can not be scheduled; may be NULL - * - * @return true if the action was scheduled - **/ -bool schedule_vdo_operation(struct action_manager *manager, - const struct admin_state_code *operation, - vdo_action_preamble *preamble, - vdo_zone_action *action, - vdo_action_conclusion *conclusion, - struct vdo_completion *parent); - -/** - * Schedule an operation to be applied to all zones. The operation's action - * will be launched immediately if there is no current action, or as soon as - * the current action completes. If there is already a pending action, this - * operation will not be scheduled, and, if it has a parent, that parent will - * be notified. At least one of the preamble, action, or conclusion must not - * be NULL. - * - * @param manager The action manager to schedule the action on - * @param operation The operation this action will perform - * @param preamble A method to be invoked on the initiator thread once this - * action is started but before applying to each zone; may - * be NULL - * @param action The action to apply to each zone; may be NULL - * @param conclusion A method to be invoked back on the initiator thread once - * the action has been applied to all zones; may be NULL - * @param context An action-specific context which may be retrieved via - * get_current_vdo_action_context(); may be NULL - * @param parent The object to notify once the action is complete or if - * the action can not be scheduled; may be NULL - * - * @return true if the action was scheduled - **/ -bool schedule_vdo_operation_with_context(struct action_manager *manager, - const struct admin_state_code *operation, - vdo_action_preamble *preamble, - vdo_zone_action *action, - vdo_action_conclusion *conclusion, - void *context, - struct vdo_completion *parent); - -#endif // ACTION_MANAGER_H diff --git a/vdo/adminCompletion.c b/vdo/admin-completion.c similarity index 50% rename from vdo/adminCompletion.c rename to vdo/admin-completion.c index 3db3c34b..e55533de 100644 --- a/vdo/adminCompletion.c +++ b/vdo/admin-completion.c @@ -1,39 +1,23 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * Copyright Red Hat - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. 
- * - * $Id: //eng/vdo-releases/sulfur/src/c++/vdo/base/adminCompletion.c#25 $ */ -#include "adminCompletion.h" +#include "admin-completion.h" #include #include #include "logger.h" -#include "memoryAlloc.h" +#include "memory-alloc.h" #include "permassert.h" #include "completion.h" +#include "thread-config.h" #include "types.h" -#include "vdoInternal.h" +#include "vdo.h" -/**********************************************************************/ -void assert_vdo_admin_operation_type(struct admin_completion *completion, +void vdo_assert_admin_operation_type(struct admin_completion *completion, enum admin_operation_type expected) { ASSERT_LOG_ONLY(completion->type == expected, @@ -42,18 +26,20 @@ void assert_vdo_admin_operation_type(struct admin_completion *completion, expected); } -/**********************************************************************/ struct admin_completion * vdo_admin_completion_from_sub_task(struct vdo_completion *completion) { struct vdo_completion *parent = completion->parent; - assert_vdo_completion_type(completion->type, VDO_SUB_TASK_COMPLETION); - assert_vdo_completion_type(parent->type, VDO_ADMIN_COMPLETION); + + vdo_assert_completion_type(completion->type, VDO_SUB_TASK_COMPLETION); + vdo_assert_completion_type(parent->type, VDO_ADMIN_COMPLETION); return container_of(parent, struct admin_completion, completion); } -/**********************************************************************/ -void assert_vdo_admin_phase_thread(struct admin_completion *admin_completion, +/* + * Assert that we are operating on the correct thread for the current phase. + */ +void vdo_assert_admin_phase_thread(struct admin_completion *admin_completion, const char *what, const char *phase_names[]) { @@ -65,84 +51,87 @@ void assert_vdo_admin_phase_thread(struct admin_completion *admin_completion, phase_names[admin_completion->phase]); } -/**********************************************************************/ struct vdo *vdo_from_admin_sub_task(struct vdo_completion *completion, enum admin_operation_type expected) { struct admin_completion *admin_completion = vdo_admin_completion_from_sub_task(completion); - assert_vdo_admin_operation_type(admin_completion, expected); + vdo_assert_admin_operation_type(admin_completion, expected); return admin_completion->vdo; } -/**********************************************************************/ -void initialize_vdo_admin_completion(struct vdo *vdo, +void vdo_initialize_admin_completion(struct vdo *vdo, struct admin_completion *admin_completion) { admin_completion->vdo = vdo; - initialize_vdo_completion(&admin_completion->completion, vdo, + vdo_initialize_completion(&admin_completion->completion, vdo, VDO_ADMIN_COMPLETION); - initialize_vdo_completion(&admin_completion->sub_task_completion, vdo, + vdo_initialize_completion(&admin_completion->sub_task_completion, vdo, VDO_SUB_TASK_COMPLETION); init_completion(&admin_completion->callback_sync); atomic_set(&admin_completion->busy, 0); } -/**********************************************************************/ struct vdo_completion * -reset_vdo_admin_sub_task(struct vdo_completion *completion) +vdo_reset_admin_sub_task(struct vdo_completion *completion) { struct admin_completion *admin_completion = vdo_admin_completion_from_sub_task(completion); - reset_vdo_completion(completion); + vdo_reset_completion(completion); completion->callback_thread_id = admin_completion->get_thread_id(admin_completion); + completion->requeue = true; return completion; } 
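+
+/*
+ * A hedged sketch of how the admin completion is typically driven
+ * (illustrative only; the callback names below are hypothetical, while the
+ * functions and fields they use appear elsewhere in this patch):
+ *
+ *	static thread_id_t get_admin_thread_id(struct admin_completion *ac)
+ *	{
+ *		return ac->vdo->thread_config->admin_thread;
+ *	}
+ *
+ *	static void suspend_callback(struct vdo_completion *completion)
+ *	{
+ *		struct vdo *vdo = vdo_from_admin_sub_task(completion,
+ *						VDO_ADMIN_OPERATION_SUSPEND);
+ *		// ... perform this phase, then reset the sub-task to move
+ *		// on to the thread chosen for the next phase ...
+ *	}
+ *
+ *	result = vdo_perform_admin_operation(vdo, VDO_ADMIN_OPERATION_SUSPEND,
+ *					     get_admin_thread_id,
+ *					     suspend_callback, handle_error);
+ */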
-/**********************************************************************/ -void prepare_vdo_admin_sub_task_on_thread(struct vdo *vdo, - vdo_action *callback, - vdo_action *error_handler, - thread_id_t thread_id) +static void vdo_prepare_admin_sub_task_on_thread(struct vdo *vdo, + vdo_action *callback, + vdo_action *error_handler, + thread_id_t thread_id) { - prepare_vdo_completion_for_requeue(&vdo->admin_completion.sub_task_completion, + vdo_prepare_completion_for_requeue(&vdo->admin_completion.sub_task_completion, callback, error_handler, thread_id, &vdo->admin_completion.completion); } -/**********************************************************************/ -void prepare_vdo_admin_sub_task(struct vdo *vdo, +/* + * Prepare the sub-task completion to run on the same thread as its enclosing + * completion. + */ +void vdo_prepare_admin_sub_task(struct vdo *vdo, vdo_action *callback, vdo_action *error_handler) { struct admin_completion *admin_completion = &vdo->admin_completion; - prepare_vdo_admin_sub_task_on_thread(vdo, + + vdo_prepare_admin_sub_task_on_thread(vdo, callback, error_handler, admin_completion->completion.callback_thread_id); } -/** - * Callback for admin operations which will notify the layer that the operation - * is complete. - * - * @param vdo_completion The vdo_completion within the admin completion - **/ static void admin_operation_callback(struct vdo_completion *vdo_completion) { struct admin_completion *completion; - assert_vdo_completion_type(vdo_completion->type, VDO_ADMIN_COMPLETION); + + vdo_assert_completion_type(vdo_completion->type, VDO_ADMIN_COMPLETION); completion = container_of(vdo_completion, struct admin_completion, completion); complete(&completion->callback_sync); } -/**********************************************************************/ +/* + * Perform an administrative operation (load, suspend, grow logical, or grow + * physical). This method should not be called from base threads unless it is + * certain the calling thread won't be needed to perform the operation. It may + * (and should) be called from non-base threads. + * + * FIXME: does this base thread note apply anymore? + */ int -perform_vdo_admin_operation(struct vdo *vdo, +vdo_perform_admin_operation(struct vdo *vdo, enum admin_operation_type type, vdo_thread_id_getter_for_phase *thread_id_getter, vdo_action *action, @@ -150,30 +139,35 @@ perform_vdo_admin_operation(struct vdo *vdo, { int result; struct admin_completion *admin_completion = &vdo->admin_completion; + if (atomic_cmpxchg(&admin_completion->busy, 0, 1) != 0) { return uds_log_error_strerror(VDO_COMPONENT_BUSY, "Can't start admin operation of type %u, another operation is already in progress", type); } - prepare_vdo_completion(&admin_completion->completion, + vdo_prepare_completion(&admin_completion->completion, admin_operation_callback, admin_operation_callback, - get_vdo_thread_config(vdo)->admin_thread, + vdo->thread_config->admin_thread, NULL); admin_completion->type = type; admin_completion->get_thread_id = thread_id_getter; admin_completion->phase = 0; - prepare_vdo_admin_sub_task(vdo, action, error_handler); + vdo_prepare_admin_sub_task(vdo, action, error_handler); reinit_completion(&admin_completion->callback_sync); - enqueue_vdo_completion(&admin_completion->sub_task_completion); + vdo_enqueue_completion(&admin_completion->sub_task_completion); - // Using the "interruptible" interface means that Linux will not log a - // message when we wait for more than 120 seconds. 
+ /* + * Using the "interruptible" interface means that Linux will not log a + * message when we wait for more than 120 seconds. + */ while (wait_for_completion_interruptible(&admin_completion->callback_sync) != 0) { - // However, if we get a signal in a user-mode process, we could - // spin... - msleep(1); + /* + * However, if we get a signal in a user-mode process, we could + * spin... + */ + fsleep(1000); } result = admin_completion->completion.result; diff --git a/vdo/admin-completion.h b/vdo/admin-completion.h new file mode 100644 index 00000000..29507c1a --- /dev/null +++ b/vdo/admin-completion.h @@ -0,0 +1,78 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright Red Hat + */ + +#ifndef ADMIN_COMPLETION_H +#define ADMIN_COMPLETION_H + +#include +#include + +#include "uds-threads.h" + +#include "completion.h" +#include "types.h" + +enum admin_operation_type { + VDO_ADMIN_OPERATION_UNKNOWN, + VDO_ADMIN_OPERATION_GROW_LOGICAL, + VDO_ADMIN_OPERATION_GROW_PHYSICAL, + VDO_ADMIN_OPERATION_PREPARE_GROW_PHYSICAL, + VDO_ADMIN_OPERATION_LOAD, + VDO_ADMIN_OPERATION_PRE_LOAD, + VDO_ADMIN_OPERATION_RESUME, + VDO_ADMIN_OPERATION_SUSPEND, +}; + +struct admin_completion; + +typedef thread_id_t +vdo_thread_id_getter_for_phase(struct admin_completion *admin_completion); + +struct admin_completion { + /* + * FIXME this field should be replaced by container_of() when + * enqueuables go away and this becomes a field of struct vdo. + */ + struct vdo *vdo; + struct vdo_completion completion; + struct vdo_completion sub_task_completion; + atomic_t busy; + enum admin_operation_type type; + vdo_thread_id_getter_for_phase *get_thread_id; + uint32_t phase; + struct completion callback_sync; +}; + +void vdo_assert_admin_operation_type(struct admin_completion *completion, + enum admin_operation_type expected); + +struct admin_completion * __must_check +vdo_admin_completion_from_sub_task(struct vdo_completion *completion); + +void vdo_assert_admin_phase_thread(struct admin_completion *admin_completion, + const char *what, + const char *phase_names[]); + +struct vdo * __must_check +vdo_from_admin_sub_task(struct vdo_completion *completion, + enum admin_operation_type expected); + +void vdo_initialize_admin_completion(struct vdo *vdo, + struct admin_completion *admin_completion); + +struct vdo_completion *vdo_reset_admin_sub_task(struct vdo_completion *completion); + +void vdo_prepare_admin_sub_task(struct vdo *vdo, + vdo_action *callback, + vdo_action *error_handler); + +int __must_check +vdo_perform_admin_operation(struct vdo *vdo, + enum admin_operation_type type, + vdo_thread_id_getter_for_phase *thread_id_getter, + vdo_action *action, + vdo_action *error_handler); + +#endif /* ADMIN_COMPLETION_H */ diff --git a/vdo/adminState.c b/vdo/admin-state.c similarity index 50% rename from vdo/adminState.c rename to vdo/admin-state.c index 88fad8b0..eb42259b 100644 --- a/vdo/adminState.c +++ b/vdo/admin-state.c @@ -1,36 +1,20 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * Copyright Red Hat - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. 
- * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/sulfur/src/c++/vdo/base/adminState.c#26 $ */ -#include "adminState.h" +#include "admin-state.h" #include "logger.h" -#include "memoryAlloc.h" +#include "memory-alloc.h" #include "permassert.h" #include "completion.h" #include "types.h" -/** +/* * The state codes. - **/ + */ static const struct admin_state_code VDO_CODE_NORMAL_OPERATION = { .name = "VDO_ADMIN_STATE_NORMAL_OPERATION", .normal = true, @@ -93,7 +77,6 @@ const struct admin_state_code *VDO_ADMIN_STATE_WAITING_FOR_RECOVERY = &VDO_CODE_WAITING_FOR_RECOVERY; static const struct admin_state_code VDO_CODE_NEW = { .name = "VDO_ADMIN_STATE_NEW", - .normal = true, .quiescent = true, }; const struct admin_state_code *VDO_ADMIN_STATE_NEW = @@ -146,6 +129,20 @@ static const struct admin_state_code VDO_CODE_SAVE_FOR_SCRUBBING = { }; const struct admin_state_code *VDO_ADMIN_STATE_SAVE_FOR_SCRUBBING = &VDO_CODE_SAVE_FOR_SCRUBBING; +static const struct admin_state_code VDO_CODE_STOPPING = { + .name = "VDO_ADMIN_STATE_STOPPING", + .draining = true, + .quiescing = true, + .operating = true, +}; +const struct admin_state_code *VDO_ADMIN_STATE_STOPPING = + &VDO_CODE_STOPPING; +static const struct admin_state_code VDO_CODE_STOPPED = { + .name = "VDO_ADMIN_STATE_STOPPED", + .quiescent = true, +}; +const struct admin_state_code *VDO_ADMIN_STATE_STOPPED = + &VDO_CODE_STOPPED; static const struct admin_state_code VDO_CODE_SUSPENDING = { .name = "VDO_ADMIN_STATE_SUSPENDING", .draining = true, @@ -173,60 +170,34 @@ static const struct admin_state_code VDO_CODE_RESUMING = { const struct admin_state_code *VDO_ADMIN_STATE_RESUMING = &VDO_CODE_RESUMING; -/**********************************************************************/ -const char *get_vdo_admin_state_code_name(const struct admin_state_code *code) -{ - return code->name; -} - -/**********************************************************************/ -const char *get_vdo_admin_state_name(const struct admin_state *state) -{ - return get_vdo_admin_state_code_name(get_vdo_admin_state_code(state)); -} - /** - * Check whether an admin_state is operating. + * is_vdo_state_operating() - Check whether an admin_state is operating. + * @state The admin_state to query. * - * @param state The admin_state to query - * - * @return true if the state is operating - **/ + * Return: true if the state is operating. + */ static inline bool __must_check is_vdo_state_operating(const struct admin_state *state) { - return get_vdo_admin_state_code(state)->operating; + return vdo_get_admin_state_code(state)->operating; } /** - * Check whether an admin_state_code is a quiescent operation. - * - * @param code The code to check + * get_next_state() - Determine the state which should be set after a given + * operation completes based on the operation and the + * current state. + * @state The admin_state. + * @operation The operation to be started. * - * @return true if the code is a quiescent operation - **/ -static inline bool __must_check -is_vdo_quiescent_operation(const struct admin_state_code *code) -{ - return (code->quiescent && code->operating); -} - -/** - * Determine the state which should be set after a given operation completes - * based on the operation and the current state. 
- * - * @param state The admin_state - * @param operation The operation to be started - * - * @return The state to set when the operation completes or NULL if the - * operation can not be started in the current state - **/ + * Return: The state to set when the operation completes or NULL if the + * operation can not be started in the current state. + */ static const struct admin_state_code * get_next_state(const struct admin_state *state, const struct admin_state_code *operation) { const struct admin_state_code *code - = get_vdo_admin_state_code(state); + = vdo_get_admin_state_code(state); if (code->operating) { return NULL; @@ -244,6 +215,12 @@ get_next_state(const struct admin_state *state, : NULL); } + if (operation == VDO_ADMIN_STATE_STOPPING) { + return (code == VDO_ADMIN_STATE_NORMAL_OPERATION + ? VDO_ADMIN_STATE_STOPPED + : NULL); + } + if (operation == VDO_ADMIN_STATE_PRE_LOADING) { return (code == VDO_ADMIN_STATE_INITIALIZED ? VDO_ADMIN_STATE_PRE_LOADED @@ -260,8 +237,19 @@ get_next_state(const struct admin_state *state, return VDO_ADMIN_STATE_NORMAL_OPERATION; } -/**********************************************************************/ -bool finish_vdo_operation(struct admin_state *state, int result) +/** + * vdo_finish_operation() - Finish the current operation. + * @state The state whose operation is to be finished. + * @result The result of the operation. + * + * Will notify the operation waiter if there is one. This method + * should be used for operations started with vdo_start_operation(). + * For operations which were started with vdo_start_draining(), use + * vdo_finish_draining() instead. + * + * Return: true if there was an operation to finish. + */ +bool vdo_finish_operation(struct admin_state *state, int result) { if (!is_vdo_state_operating(state)) { return false; @@ -269,13 +257,13 @@ bool finish_vdo_operation(struct admin_state *state, int result) state->complete = state->starting; if (state->waiter != NULL) { - set_vdo_completion_result(state->waiter, result); + vdo_set_completion_result(state->waiter, result); } if (!state->starting) { - set_vdo_admin_state_code(state, state->next_state); + vdo_set_admin_state_code(state, state->next_state); if (state->waiter != NULL) { - complete_vdo_completion(UDS_FORGET(state->waiter)); + vdo_complete_completion(UDS_FORGET(state->waiter)); } } @@ -283,17 +271,16 @@ bool finish_vdo_operation(struct admin_state *state, int result) } /** - * Begin an operation if it may be started given the current state. + * begin_operation() - Begin an operation if it may be started given the + * current state. + * @state The admin_state. + * @operation The operation to begin. + * @waiter A completion to notify when the operation is complete; may be NULL. + * @initiator The vdo_admin_initiator to call if the operation may + * begin; may be NULL. * - * @param state The admin_state - * @param operation The operation to begin - * @param waiter A completion to notify when the operation is complete; may - * be NULL - * @param initiator The vdo_admin_initiator to call if the operation may - * begin; may be NULL - * - * @return VDO_SUCCESS or an error - **/ + * Return: VDO_SUCCESS or an error. 
+ */ static int __must_check begin_operation(struct admin_state *state, const struct admin_state_code *operation, @@ -308,23 +295,23 @@ begin_operation(struct admin_state *state, result = uds_log_error_strerror(VDO_INVALID_ADMIN_STATE, "Can't start %s from %s", - get_vdo_admin_state_code_name(operation), - get_vdo_admin_state_name(state)); + operation->name, + vdo_get_admin_state_code(state)->name); } else if (state->waiter != NULL) { result = uds_log_error_strerror(VDO_COMPONENT_BUSY, "Can't start %s with extant waiter", - get_vdo_admin_state_code_name(operation)); + operation->name); } else { state->waiter = waiter; state->next_state = next_state; - set_vdo_admin_state_code(state, operation); + vdo_set_admin_state_code(state, operation); if (initiator != NULL) { state->starting = true; initiator(state); state->starting = false; if (state->complete) { - finish_vdo_operation(state, VDO_SUCCESS); + vdo_finish_operation(state, VDO_SUCCESS); } } @@ -332,23 +319,23 @@ begin_operation(struct admin_state *state, } if (waiter != NULL) { - finish_vdo_completion(waiter, result); + vdo_finish_completion(waiter, result); } return result; } /** - * Start an operation if it may be started given the current state. - * - * @param state The admin_state - * @param operation The operation to begin - * @param waiter A completion to notify when the operation is complete - * @param initiator The vdo_admin_initiator to call if the operation may - * begin; may be NULL + * start_operation() - Start an operation if it may be started given + * the current state. + * @state The admin_state. + * @operation The operation to begin. + * @waiter A completion to notify when the operation is complete. + * @initiator The vdo_admin_initiator to call if the operation may + * begin; may be NULL. * - * @return true if the operation was started - **/ + * Return: true if the operation was started. + */ static inline bool __must_check start_operation(struct admin_state *state, const struct admin_state_code *operation, @@ -360,16 +347,17 @@ start_operation(struct admin_state *state, } /** - * Check the result of a state validation. If the result failed, log an invalid - * state error and, if there is a waiter, notify it. + * check_code() - Check the result of a state validation. + * @valid true if the code is of an appropriate type. + * @code The code which failed to be of the correct type. + * @what What the code failed to be, for logging. + * @waiter The completion to notify of the error; may be NULL. * - * @param valid true if the code is of an appropriate type - * @param code The code which failed to be of the correct type - * @param what What the code failed to be, for logging - * @param waiter The completion to notify of the error; may be NULL + * If the result failed, log an invalid state error and, if there is a + * waiter, notify it. * - * @return The result of the check - **/ + * Return: The result of the check. + */ static bool check_code(bool valid, const struct admin_state_code *code, const char *what, @@ -382,25 +370,22 @@ static bool check_code(bool valid, } result = uds_log_error_strerror(VDO_INVALID_ADMIN_STATE, - "%s is not a %s", - get_vdo_admin_state_code_name(code), - what); + "%s is not a %s", code->name, what); if (waiter != NULL) { - finish_vdo_completion(waiter, result); + vdo_finish_completion(waiter, result); } return false; } /** - * Check that an operation is a drain. 
- * - * @param operation The operation to check - * @param waiter The completion to finish with an error if the operation is - * not a drain + * assert_vdo_drain_operation() - Check that an operation is a drain. + * @operation The operation to check. + * @waiter The completion to finish with an error if the operation is + * not a drain. * - * @return true if the specified operation is a drain - **/ + * Return: true if the specified operation is a drain. + */ static bool __must_check assert_vdo_drain_operation(const struct admin_state_code *operation, struct vdo_completion *waiter) @@ -411,19 +396,31 @@ assert_vdo_drain_operation(const struct admin_state_code *operation, waiter); } -/**********************************************************************/ -bool start_vdo_draining(struct admin_state *state, +/** + * vdo_start_draining() - Initiate a drain operation if the current state + * permits it. + * @state The admin_state. + * @operation The type of drain to initiate. + * @waiter The completion to notify when the drain is complete (may be NULL). + * @initiator The vdo_admin_initiator to call if the operation may + * begin; may be NULL. + * + * Return: true if the drain was initiated, if not the waiter + * will be notified. + */ +bool vdo_start_draining(struct admin_state *state, const struct admin_state_code *operation, struct vdo_completion *waiter, vdo_admin_initiator *initiator) { - const struct admin_state_code *code = get_vdo_admin_state_code(state); + const struct admin_state_code *code = vdo_get_admin_state_code(state); + if (!assert_vdo_drain_operation(operation, waiter)) { return false; } if (code->quiescent) { - complete_vdo_completion(waiter); + vdo_complete_completion(waiter); return false; } @@ -432,28 +429,47 @@ bool start_vdo_draining(struct admin_state *state, "can't start %s from %s", operation->name, code->name); - finish_vdo_completion(waiter, VDO_INVALID_ADMIN_STATE); + vdo_finish_completion(waiter, VDO_INVALID_ADMIN_STATE); return false; } return start_operation(state, operation, waiter, initiator); } -/**********************************************************************/ -bool finish_vdo_draining(struct admin_state *state) +/** + * vdo_finish_draining() - Finish a drain operation if one was in progress. + * @state The admin_state to query. + * + * Return: true if the state was draining; will notify the waiter if so. + */ +bool vdo_finish_draining(struct admin_state *state) { - return finish_vdo_draining_with_result(state, VDO_SUCCESS); + return vdo_finish_draining_with_result(state, VDO_SUCCESS); } -/**********************************************************************/ -bool finish_vdo_draining_with_result(struct admin_state *state, int result) +/** + * vdo_finish_draining_with_result() - Finish a drain operation with a status + * code. + * @state The admin_state to query. + * @result The result of the drain operation. + * + * Return: true if the state was draining; will notify the waiter if so. + */ +bool vdo_finish_draining_with_result(struct admin_state *state, int result) { - return (is_vdo_state_draining(state) - && finish_vdo_operation(state, result)); + return (vdo_is_state_draining(state) + && vdo_finish_operation(state, result)); } -/**********************************************************************/ -bool assert_vdo_load_operation(const struct admin_state_code *operation, +/** + * vdo_assert_load_operation() - Check that an operation is a load. + * @operation The operation to check.
+ * @waiter The completion to finish with an error if the operation is + * not a load. + * + * Return: true if the specified operation is a load. + */ +bool vdo_assert_load_operation(const struct admin_state_code *operation, struct vdo_completion *waiter) { return check_code(operation->loading, @@ -462,38 +478,60 @@ bool assert_vdo_load_operation(const struct admin_state_code *operation, waiter); } -/**********************************************************************/ -bool start_vdo_loading(struct admin_state *state, +/** + * vdo_start_loading() - Initiate a load operation if the current state + * permits it. + * @state The admin_state. + * @operation The type of load to initiate. + * @waiter The completion to notify when the load is complete (may be NULL). + * @initiator The vdo_admin_initiator to call if the operation may + * begin; may be NULL. + * + * Return: true if the load was initiated, if not the waiter will be notified. + */ +bool vdo_start_loading(struct admin_state *state, const struct admin_state_code *operation, struct vdo_completion *waiter, vdo_admin_initiator *initiator) { - return (assert_vdo_load_operation(operation, waiter) && + return (vdo_assert_load_operation(operation, waiter) && start_operation(state, operation, waiter, initiator)); } -/**********************************************************************/ -bool finish_vdo_loading(struct admin_state *state) +/** + * vdo_finish_loading() - Finish a load operation if one was in progress. + * @state The admin_state to query. + * + * Return: true if the state was loading; will notify the waiter if so. + */ +bool vdo_finish_loading(struct admin_state *state) { - return finish_vdo_loading_with_result(state, VDO_SUCCESS); + return vdo_finish_loading_with_result(state, VDO_SUCCESS); } -/**********************************************************************/ -bool finish_vdo_loading_with_result(struct admin_state *state, int result) +/** + * vdo_finish_loading_with_result() - Finish a load operation with a status + * code. + * @state The admin_state to query. + * @result The result of the load operation. + * + * Return: true if the state was loading; will notify the waiter if so. + */ +bool vdo_finish_loading_with_result(struct admin_state *state, int result) { - return (is_vdo_state_loading(state) - && finish_vdo_operation(state, result)); + return (vdo_is_state_loading(state) + && vdo_finish_operation(state, result)); } /** - * Check whether an admin_state_code is a resume operation. - * - * @param operation The operation to check - * @param waiter The completion to notify if the operation is not a resume - * operation; may be NULL + * assert_vdo_resume_operation() - Check whether an admin_state_code is a + * resume operation. + * @operation The operation to check. + * @waiter The completion to notify if the operation is not a resume + * operation; may be NULL. * - * @return true if the code is a resume operation - **/ + * Return: true if the code is a resume operation. + */ static bool __must_check assert_vdo_resume_operation(const struct admin_state_code *operation, struct vdo_completion *waiter) @@ -504,8 +542,19 @@ assert_vdo_resume_operation(const struct admin_state_code *operation, waiter); } -/**********************************************************************/ -bool start_vdo_resuming(struct admin_state *state, +/** + * vdo_start_resuming() - Initiate a resume operation if the current state + * permits it. + * @state The admin_state. + * @operation The type of resume to start. 
+ * @waiter The completion to notify when the resume is complete (may be NULL). + * @initiator The vdo_admin_initiator to call if the operation may + * begin; may be NULL. + * + * Return: true if the resume was initiated, if not the waiter will be + * notified. + */ +bool vdo_start_resuming(struct admin_state *state, const struct admin_state_code *operation, struct vdo_completion *waiter, vdo_admin_initiator *initiator) @@ -514,47 +563,71 @@ bool start_vdo_resuming(struct admin_state *state, start_operation(state, operation, waiter, initiator)); } -/**********************************************************************/ -bool finish_vdo_resuming(struct admin_state *state) +/** + * vdo_finish_resuming() - Finish a resume operation if one was in progress. + * @state The admin_state to query. + * + * Return: true if the state was resuming; will notify the waiter if so. + */ +bool vdo_finish_resuming(struct admin_state *state) { - return finish_vdo_resuming_with_result(state, VDO_SUCCESS); + return vdo_finish_resuming_with_result(state, VDO_SUCCESS); } -/**********************************************************************/ -bool finish_vdo_resuming_with_result(struct admin_state *state, int result) +/** + * vdo_finish_resuming_with_result() - Finish a resume operation with a + * status code. + * @state The admin_state to query. + * @result The result of the resume operation. + * + * Return: true if the state was resuming; will notify the waiter if so. + */ +bool vdo_finish_resuming_with_result(struct admin_state *state, int result) { - return (is_vdo_state_resuming(state) - && finish_vdo_operation(state, result)); + return (vdo_is_state_resuming(state) + && vdo_finish_operation(state, result)); } -/**********************************************************************/ -int resume_vdo_if_quiescent(struct admin_state *state) +/** + * vdo_resume_if_quiescent() - Change the state to normal operation if the + * current state is quiescent. + * @state The admin_state to resume. + * + * Return: VDO_SUCCESS if the state resumed, VDO_INVALID_ADMIN_STATE otherwise. + */ +int vdo_resume_if_quiescent(struct admin_state *state) { - if (!is_vdo_state_quiescent(state)) { + if (!vdo_is_state_quiescent(state)) { return VDO_INVALID_ADMIN_STATE; } - set_vdo_admin_state_code(state, VDO_ADMIN_STATE_NORMAL_OPERATION); + vdo_set_admin_state_code(state, VDO_ADMIN_STATE_NORMAL_OPERATION); return VDO_SUCCESS; } /** - * Check whether an admin_state_code is an operation. - * - * @param code The operation to check - * @param waiter The completion to notify if the code is not an operation; may - * be NULL + * assert_operation() - Check whether an admin_state_code is an operation. + * @code The operation to check. + * @waiter The completion to notify if the code is not an operation; may + * be NULL. * - * @return true if the code is an operation - **/ + * Return: true if the code is an operation. + */ static bool assert_operation(const struct admin_state_code *code, struct vdo_completion *waiter) { return check_code(code->operating, code, "operation", waiter); } -/**********************************************************************/ -int start_vdo_operation(struct admin_state *state, +/** + * vdo_start_operation() - Attempt to start an operation. + * @state the admin_state. + * @operation the operation to start. 
+ * + * Return: VDO_SUCCESS if the operation was started, VDO_INVALID_ADMIN_STATE + * if not + */ +int vdo_start_operation(struct admin_state *state, const struct admin_state_code *operation) { return (assert_operation(operation, NULL) ? @@ -562,8 +635,18 @@ int start_vdo_operation(struct admin_state *state, VDO_INVALID_ADMIN_STATE); } -/**********************************************************************/ -bool start_vdo_operation_with_waiter(struct admin_state *state, +/** + * vdo_start_operation_with_waiter() - Attempt to start an operation. + * @state the admin_state. + * @operation the operation to start. + * @waiter the completion to notify when the operation completes or + * fails to start; may be NULL. + * @initiator The vdo_admin_initiator to call if the operation may + * begin; may be NULL. + * + * Return: true if the operation was started. + */ +bool vdo_start_operation_with_waiter(struct admin_state *state, const struct admin_state_code *operation, struct vdo_completion *waiter, vdo_admin_initiator *initiator) @@ -572,4 +655,3 @@ bool start_vdo_operation_with_waiter(struct admin_state *state, (begin_operation(state, operation, waiter, initiator) == VDO_SUCCESS)); } - diff --git a/vdo/admin-state.h b/vdo/admin-state.h new file mode 100644 index 00000000..88ed70bc --- /dev/null +++ b/vdo/admin-state.h @@ -0,0 +1,287 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright Red Hat + */ + +#ifndef ADMIN_STATE_H +#define ADMIN_STATE_H + +#include "completion.h" +#include "types.h" + +struct admin_state_code { + const char *name; + /* Normal operation, data_vios may be active */ + bool normal; + /* I/O is draining, new requests should not start */ + bool draining; + /* This is a startup time operation */ + bool loading; + /* The next state will be quiescent */ + bool quiescing; + /* The VDO is quiescent, there should be no I/O */ + bool quiescent; + /* + * Whether an operation is in progress and so no other operation may be + * started + */ + bool operating; +}; + +/* + * The state codes. 
+ */ +extern const struct admin_state_code *VDO_ADMIN_STATE_NORMAL_OPERATION; +extern const struct admin_state_code *VDO_ADMIN_STATE_OPERATING; +extern const struct admin_state_code *VDO_ADMIN_STATE_FORMATTING; +extern const struct admin_state_code *VDO_ADMIN_STATE_PRE_LOADING; +extern const struct admin_state_code *VDO_ADMIN_STATE_PRE_LOADED; +extern const struct admin_state_code *VDO_ADMIN_STATE_LOADING; +extern const struct admin_state_code *VDO_ADMIN_STATE_LOADING_FOR_RECOVERY; +extern const struct admin_state_code *VDO_ADMIN_STATE_LOADING_FOR_REBUILD; +extern const struct admin_state_code *VDO_ADMIN_STATE_WAITING_FOR_RECOVERY; +extern const struct admin_state_code *VDO_ADMIN_STATE_NEW; +extern const struct admin_state_code *VDO_ADMIN_STATE_INITIALIZED; +extern const struct admin_state_code *VDO_ADMIN_STATE_RECOVERING; +extern const struct admin_state_code *VDO_ADMIN_STATE_REBUILDING; +extern const struct admin_state_code *VDO_ADMIN_STATE_SAVING; +extern const struct admin_state_code *VDO_ADMIN_STATE_SAVED; +extern const struct admin_state_code *VDO_ADMIN_STATE_SCRUBBING; +extern const struct admin_state_code *VDO_ADMIN_STATE_SAVE_FOR_SCRUBBING; +extern const struct admin_state_code *VDO_ADMIN_STATE_STOPPING; +extern const struct admin_state_code *VDO_ADMIN_STATE_STOPPED; +extern const struct admin_state_code *VDO_ADMIN_STATE_SUSPENDING; +extern const struct admin_state_code *VDO_ADMIN_STATE_SUSPENDED; +extern const struct admin_state_code *VDO_ADMIN_STATE_SUSPENDED_OPERATION; +extern const struct admin_state_code *VDO_ADMIN_STATE_RESUMING; + +struct admin_state { + /* The current administrative state */ + const struct admin_state_code *current_state; + /* + * The next administrative state (when the current operation finishes) + */ + const struct admin_state_code *next_state; + /* A completion waiting on a state change */ + struct vdo_completion *waiter; + /* Whether an operation is being initiated */ + bool starting; + /* Whether an operation has completed in the initiator */ + bool complete; +}; + +/** + * typedef vdo_admin_initiator - A method to be called once an admin operation + * may be initiated. + */ +typedef void vdo_admin_initiator(struct admin_state *state); + +/** + * vdo_get_admin_state_code() - Get the current admin state code. + * @state: The admin_state to query. + * + * Return: The current state. + */ +static inline const struct admin_state_code * __must_check +vdo_get_admin_state_code(const struct admin_state *state) +{ + return READ_ONCE(state->current_state); +} + +/** + * vdo_set_admin_state_code() - Set the current admin state code. + * @state: The admin_state to modify. + * @code: The code to set. + * + * This function should be used primarily for initialization and by adminState + * internals. Most uses should go through the operation interfaces. + */ +static inline void +vdo_set_admin_state_code(struct admin_state *state, + const struct admin_state_code *code) +{ + WRITE_ONCE(state->current_state, code); +} + +/** + * vdo_is_state_normal() - Check whether an admin_state is in normal + * operation. + * @state: The admin_state to query. + * + * Return: true if the state is normal. + */ +static inline bool __must_check +vdo_is_state_normal(const struct admin_state *state) +{ + return vdo_get_admin_state_code(state)->normal; +} + +/** + * vdo_is_state_suspending() - Check whether an admin_state is suspending. + * @state: The admin_state to query. + * + * Return: true if the state is suspending. 
+ */ +static inline bool __must_check +vdo_is_state_suspending(const struct admin_state *state) +{ + return (vdo_get_admin_state_code(state) == VDO_ADMIN_STATE_SUSPENDING); +} + +/** + * vdo_is_state_saving() - Check whether an admin_state is saving. + * @state: The admin_state to query. + * + * Return: true if the state is saving. + */ +static inline bool __must_check +vdo_is_state_saving(const struct admin_state *state) +{ + return (vdo_get_admin_state_code(state) == VDO_ADMIN_STATE_SAVING); +} + +/** + * vdo_is_state_saved() - Check whether an admin_state is saved. + * @state: The admin_state to query. + * + * Return: true if the state is saved. + */ +static inline bool __must_check +vdo_is_state_saved(const struct admin_state *state) +{ + return (vdo_get_admin_state_code(state) == VDO_ADMIN_STATE_SAVED); +} + +/** + * vdo_is_state_draining() - Check whether an admin_state is draining. + * @state: The admin_state to query. + * + * Return: true if the state is draining. + */ +static inline bool __must_check +vdo_is_state_draining(const struct admin_state *state) +{ + return vdo_get_admin_state_code(state)->draining; +} + +/** + * vdo_is_state_loading() - Check whether an admin_state is loading. + * @state: The admin_state to query. + * + * Return: true if the state is loading. + */ +static inline bool __must_check +vdo_is_state_loading(const struct admin_state *state) +{ + return vdo_get_admin_state_code(state)->loading; +} + +/** + * vdo_is_state_resuming() - Check whether an admin_state is resuming. + * @state: The admin_state to query. + * + * Return: true if the state is resuming. + */ +static inline bool __must_check +vdo_is_state_resuming(const struct admin_state *state) +{ + return (vdo_get_admin_state_code(state) == VDO_ADMIN_STATE_RESUMING); +} + +/** + * vdo_is_state_clean_load() - Check whether an admin_state is doing a clean + * load. + * @state: The admin_state to query. + * + * Return: true if the state is a clean load. + */ +static inline bool __must_check +vdo_is_state_clean_load(const struct admin_state *state) +{ + const struct admin_state_code *code = vdo_get_admin_state_code(state); + + return ((code == VDO_ADMIN_STATE_FORMATTING) || + (code == VDO_ADMIN_STATE_LOADING)); +} + +/** + * vdo_is_state_quiescing() - Check whether an admin_state is quiescing. + * @state: The admin_state to check. + * + * Return: true if the state is quiescing. + */ +static inline bool __must_check +vdo_is_state_quiescing(const struct admin_state *state) +{ + return vdo_get_admin_state_code(state)->quiescing; +} + +/** + * vdo_is_state_quiescent() - Check whether an admin_state is quiescent. + * @state: The admin_state to query. + * + * Return: true if the state is quiescent.
+ */ +static inline bool __must_check +vdo_is_state_quiescent(const struct admin_state *state) +{ + return vdo_get_admin_state_code(state)->quiescent; +} + +bool vdo_start_draining(struct admin_state *state, + const struct admin_state_code *operation, + struct vdo_completion *waiter, + vdo_admin_initiator * initiator); + +bool vdo_finish_draining(struct admin_state *state); + +bool vdo_finish_draining_with_result(struct admin_state *state, int result); + +bool __must_check +vdo_assert_load_operation(const struct admin_state_code *operation, + struct vdo_completion *waiter); + +bool vdo_start_loading(struct admin_state *state, + const struct admin_state_code *operation, + struct vdo_completion *waiter, + vdo_admin_initiator *initiator); + +bool vdo_finish_loading(struct admin_state *state); + +bool vdo_finish_loading_with_result(struct admin_state *state, int result); + +bool vdo_start_resuming(struct admin_state *state, + const struct admin_state_code *operation, + struct vdo_completion *waiter, + vdo_admin_initiator *initiator); + +bool vdo_finish_resuming(struct admin_state *state); + +bool vdo_finish_resuming_with_result(struct admin_state *state, int result); + +int vdo_resume_if_quiescent(struct admin_state *state); + +int vdo_start_operation(struct admin_state *state, + const struct admin_state_code *operation); + +bool vdo_start_operation_with_waiter(struct admin_state *state, + const struct admin_state_code *operation, + struct vdo_completion *waiter, + vdo_admin_initiator *initiator); + +bool vdo_finish_operation(struct admin_state *state, int result); + +/** + * vdo_set_operation_result() - Set a result for the current operation. + * @state: the admin_state. + * @result: the result to set; if there is no waiter, this is a no-op. + */ +static inline void vdo_set_operation_result(struct admin_state *state, + int result) +{ + if (state->waiter != NULL) { + vdo_set_completion_result(state->waiter, result); + } +} + +#endif /* ADMIN_STATE_H */ diff --git a/vdo/adminCompletion.h b/vdo/adminCompletion.h deleted file mode 100644 index d42c739a..00000000 --- a/vdo/adminCompletion.h +++ /dev/null @@ -1,184 +0,0 @@ -/* - * Copyright Red Hat - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. 
- * - * $Id: //eng/vdo-releases/sulfur/src/c++/vdo/base/adminCompletion.h#14 $ - */ - -#ifndef ADMIN_COMPLETION_H -#define ADMIN_COMPLETION_H - -#include -#include - -#include "uds-threads.h" - -#include "completion.h" -#include "types.h" - -enum admin_operation_type { - VDO_ADMIN_OPERATION_UNKNOWN = 0, - VDO_ADMIN_OPERATION_GROW_LOGICAL, - VDO_ADMIN_OPERATION_GROW_PHYSICAL, - VDO_ADMIN_OPERATION_PREPARE_GROW_PHYSICAL, - VDO_ADMIN_OPERATION_LOAD, - VDO_ADMIN_OPERATION_RESUME, - VDO_ADMIN_OPERATION_SUSPEND, -}; - -struct admin_completion; - -/** - * A function which gets the ID of the thread on which the current phase of an - * admin operation should be run. - * - * @param admin_completion The admin_completion - * - * @return The ID of the thread on which the current phase should be performed - **/ -typedef thread_id_t -vdo_thread_id_getter_for_phase(struct admin_completion *admin_completion); - -struct admin_completion { - // XXX should be replaced by container_of() when enqueuables go away - // and this becomes a field of struct vdo. - struct vdo *vdo; - /** The completion */ - struct vdo_completion completion; - /** The sub-task completion */ - struct vdo_completion sub_task_completion; - /** Whether this completion is in use */ - atomic_t busy; - /** The operation type */ - enum admin_operation_type type; - /** Method to get the thread id for the current phase */ - vdo_thread_id_getter_for_phase *get_thread_id; - /** The current phase of the operation */ - uint32_t phase; - /** The struct completion for waiting on the operation */ - struct completion callback_sync; -}; - -/** - * Check that an admin_completion's type is as expected. - * - * @param completion The admin_completion to check - * @param expected The expected type - **/ -void assert_vdo_admin_operation_type(struct admin_completion *completion, - enum admin_operation_type expected); - -/** - * Convert the sub-task completion of an admin_completion to an - * admin_completion. - * - * @param completion the admin_completion's sub-task completion - * - * @return The sub-task completion as its enclosing admin_completion - **/ -struct admin_completion * __must_check -vdo_admin_completion_from_sub_task(struct vdo_completion *completion); - -/** - * Assert that we are operating on the correct thread for the current phase. - * - * @param admin_completion The admin_completion to check - * @param what The method doing the phase check - * @param phase_names The names of the phases of the current operation - **/ -void assert_vdo_admin_phase_thread(struct admin_completion *admin_completion, - const char *what, - const char *phase_names[]); - -/** - * Get the vdo from the sub-task completion of its admin_completion. - * - * @param completion the sub-task completion - * @param expected the expected operation type of the admin_completion - * - * @return The vdo - **/ -struct vdo * __must_check -vdo_from_admin_sub_task(struct vdo_completion *completion, - enum admin_operation_type expected); - -/** - * Initialize an admin completion. - * - * @param vdo The vdo which owns the completion - * @param admin_completion The admin_completion to initialize - **/ -void initialize_vdo_admin_completion(struct vdo *vdo, - struct admin_completion *admin_completion); - -/** - * Reset an admin_completion's sub-task completion. 
- * - * @param completion The admin_completion's sub-task completion - * - * @return The sub-task completion for the convenience of callers - **/ -struct vdo_completion *reset_vdo_admin_sub_task(struct vdo_completion *completion); - -/** - * Prepare the sub-task completion of a vdo's admin_completion - * - * @param vdo The vdo - * @param callback The callback for the sub-task - * @param error_handler The error handler for the sub-task - * @param thread_id The ID of the thread on which to run the callback - **/ -void prepare_vdo_admin_sub_task_on_thread(struct vdo *vdo, - vdo_action *callback, - vdo_action *error_handler, - thread_id_t thread_id); - -/** - * Prepare the sub-task completion of a vdo's admin_completion to run on the - * same thread as the admin_completion's main completion. - * - * @param vdo The vdo - * @param callback The callback for the sub-task - * @param error_handler The error handler for the sub-task - **/ -void prepare_vdo_admin_sub_task(struct vdo *vdo, - vdo_action *callback, - vdo_action *error_handler); - -/** - * Perform an administrative operation (load, suspend, grow logical, or grow - * physical). This method should not be called from base threads unless it is - * certain the calling thread won't be needed to perform the operation. It may - * (and should) be called from non-base threads. - * - * @param vdo The vdo on which to perform the operation - * @param type The type of operation to perform - * @param thread_id_getter A function for getting the ID of the thread on - * which a given phase should be run - * @param action The action which starts the operation - * @param error_handler The error handler for the operation - * - * @return The result of the operation - **/ -int __must_check -perform_vdo_admin_operation(struct vdo *vdo, - enum admin_operation_type type, - vdo_thread_id_getter_for_phase *thread_id_getter, - vdo_action *action, - vdo_action *error_handler); - -#endif /* ADMIN_COMPLETION_H */ diff --git a/vdo/adminState.h b/vdo/adminState.h deleted file mode 100644 index c4f17a62..00000000 --- a/vdo/adminState.h +++ /dev/null @@ -1,467 +0,0 @@ -/* - * Copyright Red Hat - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. 
- * - * $Id: //eng/vdo-releases/sulfur/src/c++/vdo/base/adminState.h#20 $ - */ - -#ifndef ADMIN_STATE_H -#define ADMIN_STATE_H - -#include "completion.h" -#include "types.h" - -struct admin_state_code { - const char *name; - /** Normal operation, data_vios may be active */ - bool normal; - /** I/O is draining, new requests should not start */ - bool draining; - /** This is a startup time operation */ - bool loading; - /** The next state will be quiescent */ - bool quiescing; - /** The VDO is quiescent, there should be no I/O */ - bool quiescent; - /** - * Whether an operation is in progress and so no other operation may be - * started - */ - bool operating; -}; - -/** - * The state codes. - **/ -extern const struct admin_state_code *VDO_ADMIN_STATE_NORMAL_OPERATION; -extern const struct admin_state_code *VDO_ADMIN_STATE_OPERATING; -extern const struct admin_state_code *VDO_ADMIN_STATE_FORMATTING; -extern const struct admin_state_code *VDO_ADMIN_STATE_PRE_LOADING; -extern const struct admin_state_code *VDO_ADMIN_STATE_PRE_LOADED; -extern const struct admin_state_code *VDO_ADMIN_STATE_LOADING; -extern const struct admin_state_code *VDO_ADMIN_STATE_LOADING_FOR_RECOVERY; -extern const struct admin_state_code *VDO_ADMIN_STATE_LOADING_FOR_REBUILD; -extern const struct admin_state_code *VDO_ADMIN_STATE_WAITING_FOR_RECOVERY; -extern const struct admin_state_code *VDO_ADMIN_STATE_NEW; -extern const struct admin_state_code *VDO_ADMIN_STATE_INITIALIZED; -extern const struct admin_state_code *VDO_ADMIN_STATE_RECOVERING; -extern const struct admin_state_code *VDO_ADMIN_STATE_REBUILDING; -extern const struct admin_state_code *VDO_ADMIN_STATE_SAVING; -extern const struct admin_state_code *VDO_ADMIN_STATE_SAVED; -extern const struct admin_state_code *VDO_ADMIN_STATE_SCRUBBING; -extern const struct admin_state_code *VDO_ADMIN_STATE_SAVE_FOR_SCRUBBING; -extern const struct admin_state_code *VDO_ADMIN_STATE_SUSPENDING; -extern const struct admin_state_code *VDO_ADMIN_STATE_SUSPENDED; -extern const struct admin_state_code *VDO_ADMIN_STATE_SUSPENDED_OPERATION; -extern const struct admin_state_code *VDO_ADMIN_STATE_RESUMING; - -struct admin_state { - /** The current administrative state */ - const struct admin_state_code *current_state; - /** - * The next administrative state (when the current operation finishes) - */ - const struct admin_state_code *next_state; - /** A completion waiting on a state change */ - struct vdo_completion *waiter; - /** Whether an operation is being initiated */ - bool starting; - /** Whether an operation has completed in the initiator */ - bool complete; -}; - -/** - * A method to be called once an admin operation may be initiated. - **/ -typedef void vdo_admin_initiator(struct admin_state *state); - -/** - * Get the name of an admin_state_code for logging purposes. - * - * @param code The admin_state_code - * - * @return The name of the state's code - **/ -const char * __must_check -get_vdo_admin_state_code_name(const struct admin_state_code *code); - -/** - * Get the name of an admin_state's code for logging purposes. - * - * @param state The admin_state - * - * @return The name of the state's code - **/ -const char * __must_check -get_vdo_admin_state_name(const struct admin_state *state); - -/** - * Get the current admin state code. 
- * - * @param state The admin_state to query - * - * @return The current state - **/ -static inline const struct admin_state_code * __must_check -get_vdo_admin_state_code(const struct admin_state *state) -{ - return READ_ONCE(state->current_state); -} - -/** - * Set the current admin state code. This function should be used primarily for - * initialization and by adminState internals. Most uses should go through the - * operation interfaces. - * - * @param state The admin_state to modify - * @param code The code to set - **/ -static inline void -set_vdo_admin_state_code(struct admin_state *state, - const struct admin_state_code *code) -{ - WRITE_ONCE(state->current_state, code); -} - -/** - * Check whether an admin_state is in normal operation. - * - * @param state The admin_state to query - * - * @return true if the state is normal - **/ -static inline bool __must_check -is_vdo_state_normal(const struct admin_state *state) -{ - return get_vdo_admin_state_code(state)->normal; -} - -/** - * Check whether an admin_state is suspending. - * - * @param state The admin_state to query - * - * @return true if the state is suspending - **/ -static inline bool __must_check -is_vdo_state_suspending(const struct admin_state *state) -{ - return (get_vdo_admin_state_code(state) == VDO_ADMIN_STATE_SUSPENDING); -} - -/** - * Check whether an admin_state is saving. - * - * @param state The admin_state to query - * - * @return true if the state is saving - **/ -static inline bool __must_check -is_vdo_state_saving(const struct admin_state *state) -{ - return (get_vdo_admin_state_code(state) == VDO_ADMIN_STATE_SAVING); -} - -/** - * Check whether an admin_state is saved. - * - * @param state The admin_state to query - * - * @return true if the state is saved - **/ -static inline bool __must_check -is_vdo_state_saved(const struct admin_state *state) -{ - return (get_vdo_admin_state_code(state) == VDO_ADMIN_STATE_SAVED); -} - -/** - * Check whether an admin_state is draining. - * - * @param state The admin_state to query - * - * @return true if the state is draining - **/ -static inline bool __must_check -is_vdo_state_draining(const struct admin_state *state) -{ - return get_vdo_admin_state_code(state)->draining; -} - -/** - * Check whether an admin_state is loading. - * - * @param state The admin_state to query - * - * @return true if the state is loading - **/ -static inline bool __must_check -is_vdo_state_loading(const struct admin_state *state) -{ - return get_vdo_admin_state_code(state)->loading; -} - -/** - * Check whether an admin_state is resumeing. - * - * @param state The admin_state to query - * - * @return true if the state is resumeing - **/ -static inline bool __must_check -is_vdo_state_resuming(const struct admin_state *state) -{ - return (get_vdo_admin_state_code(state) == VDO_ADMIN_STATE_RESUMING); -} - -/** - * Check whether an admin_state is doing a clean load. - * - * @param state The admin_state to query - * - * @return true if the state is a clean load - **/ -static inline bool __must_check -is_vdo_state_clean_load(const struct admin_state *state) -{ - const struct admin_state_code *code = get_vdo_admin_state_code(state); - return ((code == VDO_ADMIN_STATE_FORMATTING) || - (code == VDO_ADMIN_STATE_LOADING)); -} - -/** - * Check whether an admin_state is quiescing. 
- * - * @param state The admin_state to check - * - * @return true if the state is quiescing - **/ -static inline bool __must_check -is_vdo_state_quiescing(const struct admin_state *state) -{ - return get_vdo_admin_state_code(state)->quiescing; -} - -/** - * Check whether an admin_state is quiescent. - * - * @param state The admin_state to query - * - * @return true is the state is quiescent - **/ -static inline bool __must_check -is_vdo_state_quiescent(const struct admin_state *state) -{ - return get_vdo_admin_state_code(state)->quiescent; -} - -/** - * Initiate a drain operation if the current state permits it. - * - * @param state The admin_state - * @param operation The type of drain to initiate - * @param waiter The completion to notify when the drain is complete (may - * be NULL) - * @param initiator The vdo_admin_initiator to call if the operation may - * begin; may be NULL - * - * @return true if the drain was initiated, if not the waiter - * will be notified - **/ -bool start_vdo_draining(struct admin_state *state, - const struct admin_state_code *operation, - struct vdo_completion *waiter, - vdo_admin_initiator *initiator); - -/** - * Finish a drain operation if one was in progress. - * - * @param state The admin_state to query - * - * @return true if the state was draining; will notify the waiter - * if so - **/ -bool finish_vdo_draining(struct admin_state *state); - -/** - * Finish a drain operation with a status code. - * - * @param state The admin_state to query - * @param result The result of the drain operation - * - * @return true if the state was draining; will notify the - * waiter if so - **/ -bool finish_vdo_draining_with_result(struct admin_state *state, int result); - -/** - * Check that an operation is a load. - * - * @param operation The operation to check - * @param waiter The completion to finish with an error if the operation is - * not a load - * - * @return true if the specified operation is a load - **/ -bool __must_check -assert_vdo_load_operation(const struct admin_state_code *operation, - struct vdo_completion *waiter); - -/** - * Initiate a load operation if the current state permits it. - * - * @param state The admin_state - * @param operation The type of load to initiate - * @param waiter The completion to notify when the load is complete (may be - * NULL) - * @param initiator The vdo_admin_initiator to call if the operation may - * begin; may be NULL - * - * @return true if the load was initiated, if not the waiter - * will be notified - **/ -bool start_vdo_loading(struct admin_state *state, - const struct admin_state_code *operation, - struct vdo_completion *waiter, - vdo_admin_initiator *initiator); - -/** - * Finish a load operation if one was in progress. - * - * @param state The admin_state to query - * - * @return true if the state was loading; will notify the waiter - * if so - **/ -bool finish_vdo_loading(struct admin_state *state); - -/** - * Finish a load operation with a status code. - * - * @param state The admin_state to query - * @param result The result of the load operation - * - * @return true if the state was loading; will notify the - * waiter if so - **/ -bool finish_vdo_loading_with_result(struct admin_state *state, int result); - -/** - * Initiate a resume operation if the current state permits it. 
- * - * @param state The admin_state - * @param operation The type of resume to start - * @param waiter The completion to notify when the resume is complete (may - * be NULL) - * @param initiator The vdo_admin_initiator to call if the operation may - * begin; may be NULL - * - * @return true if the resume was initiated, if not the waiter - * will be notified - **/ -bool start_vdo_resuming(struct admin_state *state, - const struct admin_state_code *operation, - struct vdo_completion *waiter, - vdo_admin_initiator *initiator); - -/** - * Finish a resume operation if one was in progress. - * - * @param state The admin_state to query - * - * @return true if the state was resuming; will notify the waiter - * if so - **/ -bool finish_vdo_resuming(struct admin_state *state); - -/** - * Finish a resume operation with a status code. - * - * @param state The admin_state to query - * @param result The result of the resume operation - * - * @return true if the state was resuming; will notify the - * waiter if so - **/ -bool finish_vdo_resuming_with_result(struct admin_state *state, int result); - -/** - * Change the state to normal operation if the current state is quiescent. - * - * @param state The admin_state to resume - * - * @return VDO_SUCCESS if the state resumed, VDO_INVALID_ADMIN_STATE otherwise - **/ -int resume_vdo_if_quiescent(struct admin_state *state); - -/** - * Attempt to start an operation. - * - * @param state the admin_state - * @param operation the operation to start - * - * @return VDO_SUCCESS if the operation was started - * VDO_INVALID_ADMIN_STATE if not - **/ -int start_vdo_operation(struct admin_state *state, - const struct admin_state_code *operation); - -/** - * Attempt to start an operation. - * - * @param state the admin_state - * @param operation the operation to start - * @param waiter the completion to notify when the operation completes or - * fails to start; may be NULL - * @param initiator The vdo_admin_initiator to call if the operation may - * begin; may be NULL - * - * @return true if the operation was started - **/ -bool start_vdo_operation_with_waiter(struct admin_state *state, - const struct admin_state_code *operation, - struct vdo_completion *waiter, - vdo_admin_initiator *initiator); - -/** - * Finish the current operation. Will notify the operation waiter if there is - * one. This method should be used for operations started with - * start_vdo_operation(). For operations which were started with - * start_vdo_draining(), use finish_vdo_draining() instead. - * - * @param state The state whose operation is to be finished - * @param result The result of the operation - * - * @return true if there was an operation to finish - **/ -bool finish_vdo_operation(struct admin_state *state, int result); - -/** - * Set a result for the current operation. 
- * - * @param state the admin_state - * @param result the result to set; if there is no waiter, this is a no-op - **/ -static inline void set_vdo_operation_result(struct admin_state *state, - int result) -{ - if (state->waiter != NULL) { - set_vdo_completion_result(state->waiter, result); - } -} - -#endif // ADMIN_STATE_H diff --git a/vdo/allocatingVIO.c b/vdo/allocatingVIO.c deleted file mode 100644 index 2a4aacde..00000000 --- a/vdo/allocatingVIO.c +++ /dev/null @@ -1,310 +0,0 @@ -/* - * Copyright Red Hat - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/sulfur/src/c++/vdo/base/allocatingVIO.c#34 $ - */ - -#include "allocatingVIO.h" - -#include "logger.h" -#include "memoryAlloc.h" -#include "permassert.h" - -#include "allocationSelector.h" -#include "blockAllocator.h" -#include "dataVIO.h" -#include "pbnLock.h" -#include "slabDepot.h" -#include "types.h" -#include "vdoInternal.h" -#include "vioWrite.h" - -/** - * Make a single attempt to acquire a write lock on a newly-allocated PBN. - * - * @param allocating_vio The allocating_vio that wants a write lock for its - * newly allocated block - * - * @return VDO_SUCCESS or an error code - **/ -static int attempt_pbn_write_lock(struct allocating_vio *allocating_vio) -{ - struct pbn_lock *lock; - int result; - - assert_vio_in_physical_zone(allocating_vio); - - ASSERT_LOG_ONLY(allocating_vio->allocation_lock == NULL, - "must not acquire a lock while already referencing one"); - - result = attempt_vdo_physical_zone_pbn_lock(allocating_vio->zone, - allocating_vio->allocation, - allocating_vio->write_lock_type, - &lock); - if (result != VDO_SUCCESS) { - return result; - } - - if (lock->holder_count > 0) { - // This block is already locked, which should be impossible. - return uds_log_error_strerror(VDO_LOCK_ERROR, - "Newly allocated block %llu was spuriously locked (holder_count=%u)", - (unsigned long long) allocating_vio->allocation, - lock->holder_count); - } - - // We've successfully acquired a new lock, so mark it as ours. - lock->holder_count += 1; - allocating_vio->allocation_lock = lock; - assign_vdo_pbn_lock_provisional_reference(lock); - return VDO_SUCCESS; -} - -/** - * Attempt to allocate and lock a physical block. If successful, continue - * along the write path. 
- * - * @param allocating_vio The allocating_vio which needs an allocation - * - * @return VDO_SUCCESS or an error if a block could not be allocated - **/ -static int allocate_and_lock_block(struct allocating_vio *allocating_vio) -{ - struct vio *vio = allocating_vio_as_vio(allocating_vio); - struct block_allocator *allocator = - get_vdo_physical_zone_block_allocator(allocating_vio->zone); - int result = allocate_vdo_block(allocator, &allocating_vio->allocation); - if (result != VDO_SUCCESS) { - return result; - } - - result = attempt_pbn_write_lock(allocating_vio); - if (result != VDO_SUCCESS) { - return result; - } - - // We got a block! - vio->physical = allocating_vio->allocation; - allocating_vio->allocation_callback(allocating_vio); - return VDO_SUCCESS; -} - -static void allocate_block_for_write(struct vdo_completion *completion); - -/** - * Retry allocating a block for write. - * - * @param waiter The allocating_vio that was waiting to allocate - * @param context The context (unused) - **/ -static void -retry_allocate_block_for_write(struct waiter *waiter, - void *context __always_unused) -{ - struct allocating_vio *allocating_vio = waiter_as_allocating_vio(waiter); - allocate_block_for_write(allocating_vio_as_completion(allocating_vio)); -} - -/** - * Attempt to enqueue an allocating_vio to wait for a slab to be scrubbed in the - * current allocation zone. - * - * @param allocating_vio The struct allocating_vio which wants to allocate a - * block - * - * @return VDO_SUCCESS if the allocating_vio was queued, VDO_NO_SPACE if there - * are no slabs to be scrubbed in the current zone, or some other - * error - **/ -static int wait_for_clean_slab(struct allocating_vio *allocating_vio) -{ - int result; - struct block_allocator *allocator = - get_vdo_physical_zone_block_allocator(allocating_vio->zone); - struct waiter *waiter = allocating_vio_as_waiter(allocating_vio); - - waiter->callback = retry_allocate_block_for_write; - - result = enqueue_for_clean_vdo_slab(allocator, waiter); - if (result != VDO_SUCCESS) { - return result; - } - - // We've successfully enqueued, when we come back, pretend like we've - // never tried this allocation before. - allocating_vio->wait_for_clean_slab = false; - allocating_vio->allocation_attempts = 0; - return VDO_SUCCESS; -} - -/** - * Attempt to allocate a block in an allocating_vio's current allocation zone. - * - * @param allocating_vio The allocating_vio - * - * @return VDO_SUCCESS or an error - **/ -static int allocate_block_in_zone(struct allocating_vio *allocating_vio) -{ - zone_count_t zone_number; - int result; - struct vdo *vdo = get_vdo_from_allocating_vio(allocating_vio); - const struct thread_config *thread_config = get_vdo_thread_config(vdo); - - allocating_vio->allocation_attempts++; - result = allocate_and_lock_block(allocating_vio); - if (result != VDO_NO_SPACE) { - return result; - } - - if (allocating_vio->wait_for_clean_slab) { - result = wait_for_clean_slab(allocating_vio); - if (result != VDO_NO_SPACE) { - return result; - } - } - - if (allocating_vio->allocation_attempts >= - thread_config->physical_zone_count) { - if (allocating_vio->wait_for_clean_slab) { - // There were no free blocks in any zone, and no zone - // had slabs to scrub. 
- allocating_vio->allocation_callback(allocating_vio); - return VDO_SUCCESS; - } - - allocating_vio->wait_for_clean_slab = true; - allocating_vio->allocation_attempts = 0; - } - - // Try the next zone - zone_number = get_vdo_physical_zone_number(allocating_vio->zone) + 1; - if (zone_number == thread_config->physical_zone_count) { - zone_number = 0; - } - allocating_vio->zone = vdo->physical_zones[zone_number]; - vio_launch_physical_zone_callback(allocating_vio, - allocate_block_for_write); - return VDO_SUCCESS; -} - -/** - * Attempt to allocate a block. This callback is registered in - * vio_allocate_data_block() and allocate_block_in_zone(). - * - * @param completion The allocating_vio needing an allocation - **/ -static void allocate_block_for_write(struct vdo_completion *completion) -{ - int result; - struct allocating_vio *allocating_vio = as_allocating_vio(completion); - assert_vio_in_physical_zone(allocating_vio); - result = allocate_block_in_zone(allocating_vio); - if (result != VDO_SUCCESS) { - set_vdo_completion_result(completion, result); - allocating_vio->allocation_callback(allocating_vio); - } -} - -/**********************************************************************/ -void vio_allocate_data_block(struct allocating_vio *allocating_vio, - struct allocation_selector *selector, - enum pbn_lock_type write_lock_type, - allocation_callback *callback) -{ - struct vio *vio = allocating_vio_as_vio(allocating_vio); - - allocating_vio->write_lock_type = write_lock_type; - allocating_vio->allocation_callback = callback; - allocating_vio->allocation_attempts = 0; - allocating_vio->allocation = VDO_ZERO_BLOCK; - - allocating_vio->zone = - vio->vdo->physical_zones[get_next_vdo_allocation_zone(selector)]; - - vio_launch_physical_zone_callback(allocating_vio, - allocate_block_for_write); -} - -/**********************************************************************/ -void vio_release_allocation_lock(struct allocating_vio *allocating_vio) -{ - physical_block_number_t locked_pbn; - - assert_vio_in_physical_zone(allocating_vio); - locked_pbn = allocating_vio->allocation; - if (vdo_pbn_lock_has_provisional_reference(allocating_vio->allocation_lock)) { - allocating_vio->allocation = VDO_ZERO_BLOCK; - } - - release_vdo_physical_zone_pbn_lock(allocating_vio->zone, - locked_pbn, - UDS_FORGET(allocating_vio->allocation_lock)); -} - -/**********************************************************************/ -void vio_reset_allocation(struct allocating_vio *allocating_vio) -{ - ASSERT_LOG_ONLY(allocating_vio->allocation_lock == NULL, - "must not reset allocation while holding a PBN lock"); - - allocating_vio_as_vio(allocating_vio)->physical = VDO_ZERO_BLOCK; - allocating_vio->zone = NULL; - allocating_vio->allocation = VDO_ZERO_BLOCK; - allocating_vio->allocation_attempts = 0; - allocating_vio->wait_for_clean_slab = false; -} - -/**********************************************************************/ -int create_compressed_write_vio(struct vdo *vdo, - void *parent, - char *data, - struct allocating_vio **allocating_vio_ptr) -{ - struct bio *bio; - struct allocating_vio *allocating_vio; - struct vio *vio; - - // Compressed write vios should use direct allocation and not use the - // buffer pool, which is reserved for submissions from the linux block - // layer. 
- int result = UDS_ALLOCATE(1, struct allocating_vio, __func__, - &allocating_vio); - if (result != VDO_SUCCESS) { - uds_log_error("compressed write vio allocation failure %d", - result); - return result; - } - - result = vdo_create_bio(&bio); - if (result != VDO_SUCCESS) { - UDS_FREE(allocating_vio); - return result; - } - - vio = allocating_vio_as_vio(allocating_vio); - initialize_vio(vio, - bio, - VIO_TYPE_COMPRESSED_BLOCK, - VIO_PRIORITY_COMPRESSED_DATA, - parent, - vdo, - data); - *allocating_vio_ptr = allocating_vio; - return VDO_SUCCESS; -} diff --git a/vdo/allocatingVIO.h b/vdo/allocatingVIO.h deleted file mode 100644 index 102eaf5f..00000000 --- a/vdo/allocatingVIO.h +++ /dev/null @@ -1,268 +0,0 @@ -/* - * Copyright Red Hat - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/sulfur/src/c++/vdo/base/allocatingVIO.h#19 $ - */ - -#ifndef ALLOCATING_VIO_H -#define ALLOCATING_VIO_H - -#include "permassert.h" - -#include "pbnLock.h" -#include "physicalZone.h" -#include "types.h" -#include "vdo.h" -#include "vio.h" -#include "waitQueue.h" - -typedef void allocation_callback(struct allocating_vio *allocation_vio); - -/** - * A vio which can receive an allocation from the block allocator. Currently, - * these are used both for servicing external data requests and for compressed - * block writes. - **/ -struct allocating_vio { - /** The underlying vio */ - struct vio vio; - - /** The wait_queue entry structure */ - struct waiter waiter; - - /** The physical zone in which to allocate a physical block */ - struct physical_zone *zone; - - /** The block allocated to this vio */ - physical_block_number_t allocation; - - /** - * If non-NULL, the pooled PBN lock held on the allocated block. Must - * be a write lock until the block has been written, after which it - * will become a read lock. - **/ - struct pbn_lock *allocation_lock; - - /** The type of write lock to obtain on the allocated block */ - enum pbn_lock_type write_lock_type; - - /** The number of zones in which this vio has attempted to allocate */ - zone_count_t allocation_attempts; - - /** Whether this vio should wait for a clean slab */ - bool wait_for_clean_slab; - - /** The function to call once allocation is complete */ - allocation_callback *allocation_callback; -}; - -/** - * Convert a vio to an allocating_vio. - * - * @param vio The vio to convert - * - * @return The vio as an allocating_vio - **/ -static inline struct allocating_vio *vio_as_allocating_vio(struct vio *vio) -{ - ASSERT_LOG_ONLY(((vio->type == VIO_TYPE_DATA) || - (vio->type == VIO_TYPE_COMPRESSED_BLOCK)), - "vio is an allocating_vio"); - return container_of(vio, struct allocating_vio, vio); -} - -/** - * Convert an allocating_vio to a vio. 
- * - * @param allocating_vio The allocating_vio to convert - * - * @return The allocating_vio as a vio - **/ -static inline struct vio * -allocating_vio_as_vio(struct allocating_vio *allocating_vio) -{ - return &allocating_vio->vio; -} - -/** - * Convert a generic vdo_completion to an allocating_vio. - * - * @param completion The completion to convert - * - * @return The completion as an allocating_vio - **/ -static inline struct allocating_vio * -as_allocating_vio(struct vdo_completion *completion) -{ - return vio_as_allocating_vio(as_vio(completion)); -} - -/** - * Convert an allocating_vio to a generic completion. - * - * @param allocating_vio The allocating_vio to convert - * - * @return The allocating_vio as a completion - **/ -static inline struct vdo_completion * -allocating_vio_as_completion(struct allocating_vio *allocating_vio) -{ - return vio_as_completion(allocating_vio_as_vio(allocating_vio)); -} - -/** - * Convert an allocating_vio to a generic wait queue entry. - * - * @param allocating_vio The allocating_vio to convert - * - * @return The allocating_vio as a wait queue entry - **/ -static inline struct waiter * -allocating_vio_as_waiter(struct allocating_vio *allocating_vio) -{ - return &allocating_vio->waiter; -} - -/** - * Convert an allocating_vio's generic wait queue entry back to the - * allocating_vio. - * - * @param waiter The wait queue entry to convert - * - * @return The wait queue entry as an allocating_vio - **/ -static inline struct allocating_vio * -waiter_as_allocating_vio(struct waiter *waiter) -{ - if (waiter == NULL) { - return NULL; - } - - return container_of(waiter, struct allocating_vio, waiter); -} - -/** - * Get the vdo from an allocating_vio. - * - * @param allocating_vio The allocating_vio from which to get the vdo - * - * @return The vdo to which an allocating_vio belongs - **/ -static inline struct vdo * -get_vdo_from_allocating_vio(struct allocating_vio *allocating_vio) -{ - return allocating_vio_as_vio(allocating_vio)->vdo; -} - -/** - * Check that an allocating_vio is running on the physical zone thread in - * which it did its allocation. - * - * @param allocating_vio The allocating_vio in question - **/ -static inline void -assert_vio_in_physical_zone(struct allocating_vio *allocating_vio) -{ - thread_id_t expected = - get_vdo_physical_zone_thread_id(allocating_vio->zone); - thread_id_t thread_id = vdo_get_callback_thread_id(); - ASSERT_LOG_ONLY((expected == thread_id), - "struct allocating_vio for allocated physical block %llu on thread %u, should be on thread %u", - (unsigned long long) allocating_vio->allocation, - thread_id, - expected); -} - -/** - * Set a callback as a physical block operation in an allocating_vio's - * allocated zone. - * - * @param allocating_vio The allocating_vio - * @param callback The callback to set - **/ -static inline void -vio_set_physical_zone_callback(struct allocating_vio *allocating_vio, - vdo_action *callback) -{ - set_vdo_completion_callback(allocating_vio_as_completion(allocating_vio), - callback, - get_vdo_physical_zone_thread_id(allocating_vio->zone)); -} - -/** - * Set a callback as a physical block operation in an allocating_vio's - * allocated zone and invoke it immediately. 
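All of the conversion helpers above rely on the same embedded-structure idiom: an allocating_vio physically contains its vio (and, through it, a completion and a waiter), so each view of the object is reached with container_of() or a plain member reference rather than a lookup. The userspace sketch below illustrates that idiom with invented stand-in types (inner_vio, outer_allocating_vio); it is only a model of the pattern, not the VDO structures themselves.

#include <stddef.h>
#include <stdio.h>

/* Userspace stand-in for the kernel's container_of() macro. */
#define container_of(ptr, type, member) \
	((type *) ((char *) (ptr) - offsetof(type, member)))

/* Invented stand-ins for a vio embedded in an allocating_vio. */
struct inner_vio {
	int priority;
};

struct outer_allocating_vio {
	struct inner_vio vio;		/* the embedded sub-object */
	unsigned long long allocation;
};

/* Downcast: outer to embedded inner, like allocating_vio_as_vio(). */
static struct inner_vio *outer_as_inner(struct outer_allocating_vio *outer)
{
	return &outer->vio;
}

/* Upcast: inner back to the enclosing outer, like vio_as_allocating_vio(). */
static struct outer_allocating_vio *inner_as_outer(struct inner_vio *inner)
{
	return container_of(inner, struct outer_allocating_vio, vio);
}

int main(void)
{
	struct outer_allocating_vio outer = { .allocation = 42 };
	struct inner_vio *inner = outer_as_inner(&outer);

	/* The round trip recovers the original enclosing structure. */
	printf("allocation=%llu\n", inner_as_outer(inner)->allocation);
	return 0;
}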
- * - * @param allocating_vio The allocating_vio - * @param callback The callback to invoke - **/ -static inline void -vio_launch_physical_zone_callback(struct allocating_vio *allocating_vio, - vdo_action *callback) -{ - vio_set_physical_zone_callback(allocating_vio, callback); - invoke_vdo_completion_callback(allocating_vio_as_completion(allocating_vio)); -} - -/** - * Allocate a data block to an allocating_vio. - * - * @param allocating_vio The allocating_vio which needs an allocation - * @param selector The allocation selector for deciding which physical - * zone to allocate from - * @param write_lock_type The type of write lock to obtain on the block - * @param callback The function to call once the allocation is complete - **/ -void vio_allocate_data_block(struct allocating_vio *allocating_vio, - struct allocation_selector *selector, - enum pbn_lock_type write_lock_type, - allocation_callback *callback); - -/** - * Release the PBN lock on the allocated block. If the reference to the locked - * block is still provisional, it will be released as well. - * - * @param allocating_vio The lock holder - **/ -void vio_release_allocation_lock(struct allocating_vio *allocating_vio); - -/** - * Reset an allocating_vio after it has done an allocation. - * - * @param allocating_vio The allocating_vio - **/ -void vio_reset_allocation(struct allocating_vio *allocating_vio); - -/** - * Create a new allocating_vio for compressed writes. - * - * @param [in] vdo The vdo - * @param [in] parent The parent to assign to the allocating_vio's - * completion - * @param [in] data The buffer - * @param [out] allocating_vio_ptr A pointer to hold new allocating_vio - * - * @return VDO_SUCCESS or an error - **/ -int __must_check -create_compressed_write_vio(struct vdo *vdo, - void *parent, - char *data, - struct allocating_vio **allocating_vio_ptr); - -#endif // ALLOCATING_VIO_H diff --git a/vdo/allocation-selector.c b/vdo/allocation-selector.c new file mode 100644 index 00000000..6abfe4f9 --- /dev/null +++ b/vdo/allocation-selector.c @@ -0,0 +1,70 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright Red Hat + */ + +#include "allocation-selector.h" + +#include "memory-alloc.h" + +#include "types.h" + +enum { + ALLOCATIONS_PER_ZONE = 128, +}; + +/** + * vdo_make_allocation_selector() - Make a new allocation selector. + * @physical_zone_count [in] The number of physical zones. + * @thread_id [in] The ID of the thread using this selector. + * @selector_ptr [out] A pointer to receive the new selector. + * + * Return: VDO_SUCCESS or an error. + */ +int vdo_make_allocation_selector(zone_count_t physical_zone_count, + thread_id_t thread_id, + struct allocation_selector **selector_ptr) +{ + struct allocation_selector *selector; + int result = UDS_ALLOCATE(1, + struct allocation_selector, + __func__, + &selector); + if (result != VDO_SUCCESS) { + return result; + } + + *selector = (struct allocation_selector) { + .next_allocation_zone = thread_id % physical_zone_count, + .last_physical_zone = physical_zone_count - 1, + }; + + *selector_ptr = selector; + return VDO_SUCCESS; +} + +/** + * vdo_get_next_allocation_zone() - Get number of the physical zone from + * which to allocate next. + * @selector: The selector to query. + * + * Return: The number of the physical zone from which to allocate. 
+ */
+zone_count_t vdo_get_next_allocation_zone(struct allocation_selector *selector)
+{
+	if (selector->last_physical_zone > 0) {
+		if (selector->allocation_count < ALLOCATIONS_PER_ZONE) {
+			selector->allocation_count++;
+		} else {
+			selector->allocation_count = 1;
+			if (selector->next_allocation_zone <
+			    selector->last_physical_zone) {
+				selector->next_allocation_zone++;
+			} else {
+				selector->next_allocation_zone = 0;
+			}
+		}
+	}
+
+	return selector->next_allocation_zone;
+}
diff --git a/vdo/allocation-selector.h b/vdo/allocation-selector.h
new file mode 100644
index 00000000..528aae61
--- /dev/null
+++ b/vdo/allocation-selector.h
@@ -0,0 +1,44 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright Red Hat
+ */
+
+#ifndef ALLOCATION_SELECTOR_H
+#define ALLOCATION_SELECTOR_H
+
+#include "completion.h"
+
+/**
+ * DOC: Allocation selectors
+ *
+ * An allocation_selector is used by any zone which does data block allocations.
+ * The selector is used to round-robin allocation requests to different
+ * physical zones. Currently, 128 allocations will be made to a given physical
+ * zone before switching to the next.
+ */
+
+/**
+ * struct allocation_selector: Structure used to select which physical zone to
+ * allocate from.
+ */
+struct allocation_selector {
+	/**
+	 * @allocation_count: The number of allocations done in the current
+	 * zone.
+	 */
+	block_count_t allocation_count;
+	/** @next_allocation_zone: The physical zone to allocate from next. */
+	zone_count_t next_allocation_zone;
+	/** @last_physical_zone: The number of the last physical zone. */
+	zone_count_t last_physical_zone;
+};
+
+int __must_check
+vdo_make_allocation_selector(zone_count_t physical_zone_count,
+			     thread_id_t thread_id,
+			     struct allocation_selector **selector_ptr);
+
+zone_count_t __must_check
+vdo_get_next_allocation_zone(struct allocation_selector *selector);
+
+#endif /* ALLOCATION_SELECTOR_H */
diff --git a/vdo/allocationSelector.c b/vdo/allocationSelector.c
deleted file mode 100644
index c1a46b95..00000000
--- a/vdo/allocationSelector.c
+++ /dev/null
@@ -1,74 +0,0 @@
-/*
- * Copyright Red Hat
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version 2
- * of the License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
- * 02110-1301, USA.
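For illustration, the round-robin behavior of the selector can be modeled in plain userspace C. The names below (selector_model, next_zone, and a shortened ALLOCATIONS_PER_ZONE of 4) are invented for this sketch; the control flow mirrors vdo_get_next_allocation_zone() above.

#include <stdio.h>

#define ALLOCATIONS_PER_ZONE 4	/* the kernel code uses 128 */

struct selector_model {
	unsigned int allocation_count;		/* allocations made in the current zone */
	unsigned int next_allocation_zone;	/* zone to hand out next */
	unsigned int last_physical_zone;	/* highest zone number */
};

/* Same control flow as vdo_get_next_allocation_zone(), with plain ints. */
static unsigned int next_zone(struct selector_model *s)
{
	if (s->last_physical_zone > 0) {
		if (s->allocation_count < ALLOCATIONS_PER_ZONE) {
			s->allocation_count++;
		} else {
			s->allocation_count = 1;
			if (s->next_allocation_zone < s->last_physical_zone) {
				s->next_allocation_zone++;
			} else {
				s->next_allocation_zone = 0;
			}
		}
	}

	return s->next_allocation_zone;
}

int main(void)
{
	/* Three physical zones: numbers 0, 1, and 2. */
	struct selector_model s = { .last_physical_zone = 2 };
	int i;

	/* Prints four allocations from zone 0, then four from zone 1, and so on. */
	for (i = 0; i < 12; i++) {
		printf("%u ", next_zone(&s));
	}
	printf("\n");
	return 0;
}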
- * - * $Id: //eng/vdo-releases/sulfur/src/c++/vdo/base/allocationSelector.c#10 $ - */ - -#include "allocationSelector.h" -#include "allocationSelectorInternals.h" - -#include "memoryAlloc.h" - -#include "types.h" - -enum { - ALLOCATIONS_PER_ZONE = 128, -}; - -/**********************************************************************/ -int make_vdo_allocation_selector(zone_count_t physical_zone_count, - thread_id_t thread_id, - struct allocation_selector **selector_ptr) -{ - struct allocation_selector *selector; - int result = UDS_ALLOCATE(1, - struct allocation_selector, - __func__, - &selector); - if (result != VDO_SUCCESS) { - return result; - } - - *selector = (struct allocation_selector) { - .next_allocation_zone = thread_id % physical_zone_count, - .last_physical_zone = physical_zone_count - 1, - }; - - *selector_ptr = selector; - return VDO_SUCCESS; -} - -/**********************************************************************/ -zone_count_t get_next_vdo_allocation_zone(struct allocation_selector *selector) -{ - if (selector->last_physical_zone > 0) { - if (selector->allocation_count < ALLOCATIONS_PER_ZONE) { - selector->allocation_count++; - } else { - selector->allocation_count = 1; - if (selector->next_allocation_zone < - selector->last_physical_zone) { - selector->next_allocation_zone++; - } else { - selector->next_allocation_zone = 0; - } - } - } - - return selector->next_allocation_zone; -} diff --git a/vdo/allocationSelector.h b/vdo/allocationSelector.h deleted file mode 100644 index 524371e6..00000000 --- a/vdo/allocationSelector.h +++ /dev/null @@ -1,58 +0,0 @@ -/* - * Copyright Red Hat - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/sulfur/src/c++/vdo/base/allocationSelector.h#7 $ - */ - -#ifndef ALLOCATION_SELECTOR_H -#define ALLOCATION_SELECTOR_H - -#include "completion.h" - -/** - * An allocation_selector is used by any zone which does data block allocations. - * The selector is used to round-robin allocation requests to different - * physical zones. Currently, 128 allocations will be made to a given physical - * zone before switching to the next. - **/ - -/** - * Make a new allocation selector. - * - * @param [in] physical_zone_count The number of physical zones - * @param [in] thread_id The ID of the thread using this selector - * @param [out] selector_ptr A pointer to receive the new selector - * - * @return VDO_SUCCESS or an error - **/ -int __must_check -make_vdo_allocation_selector(zone_count_t physical_zone_count, - thread_id_t thread_id, - struct allocation_selector **selector_ptr); - -/** - * Get number of the physical zone from which to allocate next. 
- * - * @param selector The selector to query - * - * @return The number of the physical zone from which to allocate - **/ -zone_count_t __must_check -get_next_vdo_allocation_zone(struct allocation_selector *selector); - -#endif /* ALLOCATION_SELECTOR_H */ diff --git a/vdo/allocationSelectorInternals.h b/vdo/allocationSelectorInternals.h deleted file mode 100644 index 7f0246be..00000000 --- a/vdo/allocationSelectorInternals.h +++ /dev/null @@ -1,37 +0,0 @@ -/* - * Copyright Red Hat - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/sulfur/src/c++/vdo/base/allocationSelectorInternals.h#2 $ - */ - -#ifndef ALLOCATION_SELECTOR_INTERNALS_H -#define ALLOCATION_SELECTOR_INTERNALS_H - -#include "types.h" - -/** Structure used to select which physical zone to allocate from */ -struct allocation_selector { - /** The number of allocations done in the current zone */ - block_count_t allocation_count; - /** The physical zone to allocate from next */ - zone_count_t next_allocation_zone; - /** The number of the last physical zone */ - zone_count_t last_physical_zone; -}; - -#endif /* ALLOCATION_SELECTOR_INTERNALS_H */ diff --git a/vdo/atomic-stats.h b/vdo/atomic-stats.h new file mode 100644 index 00000000..329788f2 --- /dev/null +++ b/vdo/atomic-stats.h @@ -0,0 +1,49 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright Red Hat + */ + +#ifndef ATOMIC_STATS_H +#define ATOMIC_STATS_H + +#include + +#include "statistics.h" + +/* Keep struct bio statistics atomically */ +struct atomic_bio_stats { + atomic64_t read; /* Number of not REQ_WRITE bios */ + atomic64_t write; /* Number of REQ_WRITE bios */ + atomic64_t discard; /* Number of REQ_DISCARD bios */ + atomic64_t flush; /* Number of REQ_FLUSH bios */ + atomic64_t empty_flush; /* Number of REQ_PREFLUSH bios without data */ + atomic64_t fua; /* Number of REQ_FUA bios */ +}; + +/* + * Counters are atomic since updates can arrive concurrently from arbitrary + * threads. 
+ */ +struct atomic_statistics { + atomic64_t bios_submitted; + atomic64_t bios_completed; + atomic64_t dedupe_context_busy; + atomic64_t flush_out; + atomic64_t invalid_advice_pbn_count; + atomic64_t no_space_error_count; + atomic64_t read_only_error_count; + struct atomic_bio_stats bios_in; + struct atomic_bio_stats bios_in_partial; + struct atomic_bio_stats bios_out; + struct atomic_bio_stats bios_out_completed; + struct atomic_bio_stats bios_acknowledged; + struct atomic_bio_stats bios_acknowledged_partial; + struct atomic_bio_stats bios_meta; + struct atomic_bio_stats bios_meta_completed; + struct atomic_bio_stats bios_journal; + struct atomic_bio_stats bios_journal_completed; + struct atomic_bio_stats bios_page_cache; + struct atomic_bio_stats bios_page_cache_completed; +}; + +#endif /* ATOMIC_STATS_H */ diff --git a/vdo/atomicStats.h b/vdo/atomicStats.h deleted file mode 100644 index cdb458b6..00000000 --- a/vdo/atomicStats.h +++ /dev/null @@ -1,65 +0,0 @@ -/* - * Copyright Red Hat - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/sulfur/src/c++/vdo/base/atomicStats.h#6 $ - */ - -#ifndef ATOMIC_STATS_H -#define ATOMIC_STATS_H - -#include - -#include "statistics.h" - -/* Keep struct bio statistics atomically */ -struct atomic_bio_stats { - atomic64_t read; // Number of not REQ_WRITE bios - atomic64_t write; // Number of REQ_WRITE bios - atomic64_t discard; // Number of REQ_DISCARD bios - atomic64_t flush; // Number of REQ_FLUSH bios - atomic64_t empty_flush; // Number of REQ_PREFLUSH bios without data - atomic64_t fua; // Number of REQ_FUA bios -}; - -/** - * Counters are atomic since updates can arrive concurrently from arbitrary - * threads. 
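The point of the comments above is simply that these counters are bumped from many threads with no lock held. Below is a minimal userspace model of the same idea, using C11 atomics in place of the kernel's atomic64_t; the type and helper names (bio_stats_model, count_read) are invented for the sketch.

#include <stdatomic.h>
#include <stdio.h>

/*
 * Simplified stand-ins for the atomic64_t members of struct
 * atomic_bio_stats; C11 atomics play the role of atomic64_inc()
 * and atomic64_read().
 */
struct bio_stats_model {
	atomic_ullong read;
	atomic_ullong write;
};

/* Any thread may call this without additional locking. */
static void count_read(struct bio_stats_model *stats)
{
	atomic_fetch_add(&stats->read, 1);
}

int main(void)
{
	struct bio_stats_model stats = { 0 };

	count_read(&stats);
	count_read(&stats);
	printf("reads=%llu\n",
	       (unsigned long long) atomic_load(&stats.read));
	return 0;
}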
- **/ -struct atomic_statistics { - atomic64_t bios_submitted; - atomic64_t bios_completed; - atomic64_t dedupe_context_busy; - atomic64_t flush_out; - atomic64_t invalid_advice_pbn_count; - atomic64_t no_space_error_count; - atomic64_t read_only_error_count; - struct atomic_bio_stats bios_in; - struct atomic_bio_stats bios_in_partial; - struct atomic_bio_stats bios_out; - struct atomic_bio_stats bios_out_completed; - struct atomic_bio_stats bios_acknowledged; - struct atomic_bio_stats bios_acknowledged_partial; - struct atomic_bio_stats bios_meta; - struct atomic_bio_stats bios_meta_completed; - struct atomic_bio_stats bios_journal; - struct atomic_bio_stats bios_journal_completed; - struct atomic_bio_stats bios_page_cache; - struct atomic_bio_stats bios_page_cache_completed; -}; - -#endif /* ATOMIC_STATS_H */ diff --git a/vdo/batchProcessor.c b/vdo/batchProcessor.c deleted file mode 100644 index c62f65f0..00000000 --- a/vdo/batchProcessor.c +++ /dev/null @@ -1,236 +0,0 @@ -/* - * Copyright Red Hat - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/sulfur/src/c++/vdo/kernel/batchProcessor.c#19 $ - */ - -#include "batchProcessor.h" - -#include - -#include "memoryAlloc.h" - -#include "constants.h" -#include "vdoInternal.h" - -#include "kernelLayer.h" - -/* - * On memory ordering: - * - * The producer thread does: enqueue item on queue (xchg, which is - * implicitly interlocked, then a store), memory barrier, then atomic - * cmpxchg of the state field. The x86 architecture spec says the - * xchg, store, lock-cmpxchg sequence cannot be reordered, but on - * architectures using load-linked and store-conditional for the - * cmpxchg, like AArch64, the LL can be reordered with the store, so - * we add a barrier. - * - * The consumer thread, when it is running out of work, does: read - * queue (find empty), set state, mfence, read queue again just to be - * sure. The set-state and read-queue cannot be reordered with respect - * to the mfence (but without the mfence, the read could be moved - * before the set). - * - * The xchg and mfence impose a total order across processors, and - * each processor sees the stores done by the other processor in the - * required order. If the xchg happens before the mfence, the - * consumer's "read queue again" operation will see the update. If the - * mfence happens first, the producer's "cmpxchg state" will see its - * updated value. - * - * These are the semantics implemented by memory set to WB (write-back - * caching) mode on x86-64. So, the simple analysis is that no wakeups - * should be missed. 
- * - * It's a little subtler with funnel queues, since one interrupted or - * delayed enqueue operation (see the commentary in funnel_queue_put) - * can cause another, concurrent enqueue operation to complete without - * actually making the entry visible to the consumer. In essence, one - * update makes no new work items visible to the consumer, and the - * other (when it eventually completes) makes two (or more) work items - * visible, and each one ensures that the consumer will process what - * it has made visible. - */ - -enum batch_processor_state { - BATCH_PROCESSOR_IDLE, - BATCH_PROCESSOR_ENQUEUED, -}; - -struct batch_processor { - struct mutex consumer_mutex; - struct funnel_queue *queue; - struct vdo_work_item work_item; - atomic_t state; - batch_processor_callback callback; - void *closure; - struct vdo *vdo; -}; - -static void schedule_batch_processing(struct batch_processor *batch); - -/** - * Apply the batch processing function to the accumulated set of - * objects. - * - * Runs in a "CPU queue". - * - * @param [in] item The work item embedded in the batch_processor - **/ -static void batch_processor_work(struct vdo_work_item *item) -{ - struct batch_processor *batch = - container_of(item, struct batch_processor, work_item); - bool need_reschedule; - - mutex_lock(&batch->consumer_mutex); - while (!is_funnel_queue_empty(batch->queue)) { - batch->callback(batch, batch->closure); - } - atomic_set(&batch->state, BATCH_PROCESSOR_IDLE); - // Pairs with the barrier in schedule_batch_processing(); see header - // comment on memory ordering. - smp_mb(); - need_reschedule = !is_funnel_queue_empty(batch->queue); - - mutex_unlock(&batch->consumer_mutex); - if (need_reschedule) { - schedule_batch_processing(batch); - } -} - -/** - * Ensure that the batch-processing function is scheduled to run. - * - * If we're the thread that switches the batch_processor state from - * idle to enqueued, we're the thread responsible for actually - * enqueueing it. If some other thread got there first, or it was - * already enqueued, it's not our problem. - * - * @param [in] batch The batch_processor control data - **/ -static void schedule_batch_processing(struct batch_processor *batch) -{ - enum batch_processor_state old_state; - bool do_schedule; - - /* - * We want this to be very fast in the common cases. - * - * In testing on our "mgh" class machines (HP ProLiant DL380p - * Gen8, Intel Xeon E5-2690, 2.9GHz), it appears that under - * some conditions it's a little faster to use a memory fence - * and then read the "state" field, skipping the cmpxchg if - * the state is already set to BATCH_PROCESSOR_ENQUEUED. - * (Sometimes slightly faster still if we prefetch the state - * field first.) Note that the read requires the fence, - * otherwise it could be executed before the preceding store - * by the funnel queue code to the "next" pointer, which can, - * very rarely, result in failing to issue a wakeup when - * needed. - * - * However, the gain is small, and in testing on our older - * "harvard" class machines (Intel Xeon X5680, 3.33GHz) it was - * a clear win to skip all of that and go right for the - * cmpxchg. - * - * Of course, the tradeoffs may be sensitive to the particular - * work going on, cache pressure, etc. - */ - - // Pairs with the barrier in batch_processor_work(); see header - // comment on memory ordering. 
- smp_mb__before_atomic(); - old_state = atomic_cmpxchg(&batch->state, BATCH_PROCESSOR_IDLE, - BATCH_PROCESSOR_ENQUEUED); - do_schedule = (old_state == BATCH_PROCESSOR_IDLE); - - if (do_schedule) { - enqueue_work_queue(batch->vdo->cpu_queue, &batch->work_item); - } -} - -/**********************************************************************/ -int make_batch_processor(struct vdo *vdo, - batch_processor_callback callback, - void *closure, - struct batch_processor **batch_ptr) -{ - struct batch_processor *batch; - - int result = - UDS_ALLOCATE(1, struct batch_processor, "batch_processor", &batch); - if (result != UDS_SUCCESS) { - return result; - } - result = make_funnel_queue(&batch->queue); - if (result != UDS_SUCCESS) { - UDS_FREE(batch); - return result; - } - - mutex_init(&batch->consumer_mutex); - setup_work_item(&batch->work_item, - batch_processor_work, - callback, - CPU_Q_ACTION_COMPLETE_VIO); - atomic_set(&batch->state, BATCH_PROCESSOR_IDLE); - batch->callback = callback; - batch->closure = closure; - batch->vdo = vdo; - - *batch_ptr = batch; - return UDS_SUCCESS; -} - -/**********************************************************************/ -void add_to_batch_processor(struct batch_processor *batch, - struct vdo_work_item *item) -{ - funnel_queue_put(batch->queue, &item->work_queue_entry_link); - schedule_batch_processing(batch); -} - -/**********************************************************************/ -struct vdo_work_item *next_batch_item(struct batch_processor *batch) -{ - struct funnel_queue_entry *fq_entry = funnel_queue_poll(batch->queue); - - if (fq_entry == NULL) { - return NULL; - } - return container_of(fq_entry, - struct vdo_work_item, - work_queue_entry_link); -} - -void free_batch_processor(struct batch_processor *batch) -{ - if (batch == NULL) { - return; - } - - // Pairs with the barrier in schedule_batch_processing(). Possibly not - // needed since it caters to an enqueue vs. free race. - smp_mb(); - BUG_ON(atomic_read(&batch->state) == BATCH_PROCESSOR_ENQUEUED); - - free_funnel_queue(UDS_FORGET(batch->queue)); - UDS_FREE(batch); -} diff --git a/vdo/batchProcessor.h b/vdo/batchProcessor.h deleted file mode 100644 index 80b364c2..00000000 --- a/vdo/batchProcessor.h +++ /dev/null @@ -1,93 +0,0 @@ -/* - * Copyright Red Hat - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/sulfur/src/c++/vdo/kernel/batchProcessor.h#9 $ - */ - -#ifndef BATCHPROCESSOR_H -#define BATCHPROCESSOR_H - -#include "kernelTypes.h" - -/** - * Control data for managing collections of objects to be operated on - * by a specified function. May be used when the work function is - * lightweight enough or cache-contentious enough that it makes sense - * to try to accumulate multiple objects and operate on them all at - * once in one thread. 
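The hand-off described in the memory-ordering commentary above reduces to a tiny state machine: a producer publishes work and then tries to flip the state from idle to enqueued, while the consumer drains the queue, marks itself idle, and re-checks for work that raced in. The sketch below models that handshake with sequentially consistent C11 atomics standing in for the xchg/smp_mb()/cmpxchg sequence; all names (schedule_model, pending, MODEL_IDLE) are invented for the illustration, and it is not the removed batchProcessor code itself.

#include <stdatomic.h>
#include <stdio.h>

/* Stand-ins for the two batch_processor states. */
enum { MODEL_IDLE, MODEL_ENQUEUED };

/* A toy "queue": just a count of pending items. */
static atomic_int pending;
static atomic_int state = MODEL_IDLE;
static int scheduled_runs;

/* Only the thread that wins the IDLE to ENQUEUED transition schedules work. */
static void schedule_model(void)
{
	int expected = MODEL_IDLE;

	if (atomic_compare_exchange_strong(&state, &expected, MODEL_ENQUEUED)) {
		scheduled_runs++;	/* would enqueue the work item here */
	}
}

/* Producer side, modeling add_to_batch_processor(). */
static void producer_add(void)
{
	atomic_fetch_add(&pending, 1);	/* publish the work first */
	schedule_model();
}

/* Consumer side, modeling batch_processor_work(). */
static void consumer_work(void)
{
	while (atomic_load(&pending) > 0) {
		atomic_fetch_sub(&pending, 1);	/* "process" one item */
	}

	atomic_store(&state, MODEL_IDLE);
	/*
	 * The re-check after publishing the idle state plays the role of
	 * the smp_mb() in the original: a producer that enqueued just
	 * before the state went idle is either seen here or saw IDLE
	 * itself, so a wakeup is never lost.
	 */
	if (atomic_load(&pending) > 0) {
		schedule_model();
	}
}

int main(void)
{
	producer_add();
	producer_add();		/* second add finds the state already ENQUEUED */
	consumer_work();
	printf("scheduled %d time(s), %d item(s) left\n",
	       scheduled_runs, atomic_load(&pending));
	return 0;
}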
- * - * The work function is run in one of the kernel layer's "CPU queues", - * and care is taken to ensure that only one invocation can be running - * or scheduled at any given time. It can loop calling next_batch_item - * repeatedly until there are no more objects to operate on. It should - * also call cond_resched_batch_processor now and then, to play nicely - * with the OS scheduler. - * - * Objects to operate on are manipulated through a funnel_queue_entry - * object which must be contained within them. - **/ -struct batch_processor; - -typedef void (*batch_processor_callback)(struct batch_processor *batch, - void *closure); - -/** - * Creates a batch-processor control structure. - * - * @param [in] vdo The vdo, used to enqueue work items - * @param [in] callback A function to process the accumulated objects - * @param [in] closure A private data pointer for use by the callback - * @param [out] batch_ptr Where to store the pointer to the new object - * - * @return UDS_SUCCESS or an error code - **/ -int make_batch_processor(struct vdo *vdo, - batch_processor_callback callback, - void *closure, - struct batch_processor **batch_ptr); - -/** - * Adds an object to the processing queue. - * - * If the callback function is not currently running or scheduled to be run, - * it gets queued up to run. - * - * @param [in] batch The batch-processor data - * @param [in] item The handle on the new object to add - **/ -void add_to_batch_processor(struct batch_processor *batch, - struct vdo_work_item *item); - -/** - * Fetches the next object in the processing queue. - * - * @param [in] batch The batch-processor data - * - * @return An object pointer or NULL - **/ -struct vdo_work_item * __must_check -next_batch_item(struct batch_processor *batch); - -/** - * Free the batch-processor data. - * - * @param [in] batch The batch-processor data - **/ -void free_batch_processor(struct batch_processor *batch); - -#endif /* BATCHPROCESSOR_H */ diff --git a/vdo/bio.c b/vdo/bio.c index ef2142fd..5bab8034 100644 --- a/vdo/bio.c +++ b/vdo/bio.c @@ -1,41 +1,23 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * Copyright Red Hat - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. 
- * - * $Id: //eng/vdo-releases/sulfur/src/c++/vdo/kernel/bio.c#38 $ */ #include "bio.h" -#include - #include "logger.h" -#include "memoryAlloc.h" +#include "memory-alloc.h" #include "numeric.h" #include "permassert.h" -#include "atomicStats.h" -#include "kernelLayer.h" -#include "kvio.h" -#include "vdoInternal.h" +#include "atomic-stats.h" +#include "kernel-types.h" +#include "vdo.h" +#include "vio.h" -enum { INLINE_BVEC_COUNT = 2 }; - -/**********************************************************************/ +/* + * Copy bio data to a buffer + */ void vdo_bio_copy_data_in(struct bio *bio, char *data_ptr) { struct bio_vec biovec; @@ -47,7 +29,9 @@ void vdo_bio_copy_data_in(struct bio *bio, char *data_ptr) } } -/**********************************************************************/ +/* + * Copy a buffer into a bio's data + */ void vdo_bio_copy_data_out(struct bio *bio, char *data_ptr) { struct bio_vec biovec; @@ -59,7 +43,6 @@ void vdo_bio_copy_data_out(struct bio *bio, char *data_ptr) } } -/**********************************************************************/ void vdo_free_bio(struct bio *bio) { if (bio == NULL) { @@ -70,7 +53,11 @@ void vdo_free_bio(struct bio *bio) UDS_FREE(UDS_FORGET(bio)); } -/**********************************************************************/ +/*----------------------------------------------------------------*/ +/* + * Various counting functions for statistics. + * These are used for bios coming into VDO, as well as bios generated by VDO. + */ void vdo_count_bios(struct atomic_bio_stats *bio_stats, struct bio *bio) { if (((bio->bi_opf & REQ_PREFLUSH) != 0) && @@ -81,20 +68,22 @@ void vdo_count_bios(struct atomic_bio_stats *bio_stats, struct bio *bio) } switch (bio_op(bio)) { - case REQ_OP_WRITE: - atomic64_inc(&bio_stats->write); - break; - case REQ_OP_READ: - atomic64_inc(&bio_stats->read); - break; - case REQ_OP_DISCARD: - atomic64_inc(&bio_stats->discard); - break; - // All other operations are filtered out in kernelLayer.c, or - // not created by VDO, so shouldn't exist. - default: - ASSERT_LOG_ONLY(0, "Bio operation %d not a write, read, discard," - " or empty flush", bio_op(bio)); + case REQ_OP_WRITE: + atomic64_inc(&bio_stats->write); + break; + case REQ_OP_READ: + atomic64_inc(&bio_stats->read); + break; + case REQ_OP_DISCARD: + atomic64_inc(&bio_stats->discard); + break; + /* + * All other operations are filtered out in dmvdo.c, or + * not created by VDO, so shouldn't exist. 
+ */ + default: + ASSERT_LOG_ONLY(0, "Bio operation %d not a write, read, discard, or empty flush", + bio_op(bio)); } if ((bio->bi_opf & REQ_PREFLUSH) != 0) { @@ -105,15 +94,9 @@ void vdo_count_bios(struct atomic_bio_stats *bio_stats, struct bio *bio) } } -/** - * Increments appropriate counters for bio completions - * - * @param vio the vio associated with the bio - * @param bio the bio to count - **/ static void count_all_bios_completed(struct vio *vio, struct bio *bio) { - struct atomic_statistics *stats = &vio->vdo->stats; + struct atomic_statistics *stats = &vdo_from_vio(vio)->stats; if (is_data_vio(vio)) { vdo_count_bios(&stats->bios_out_completed, bio); @@ -128,47 +111,56 @@ static void count_all_bios_completed(struct vio *vio, struct bio *bio) } } -/**********************************************************************/ void vdo_count_completed_bios(struct bio *bio) { struct vio *vio = (struct vio *) bio->bi_private; - atomic64_inc(&vio->vdo->stats.bios_completed); + atomic64_inc(&vdo_from_vio(vio)->stats.bios_completed); count_all_bios_completed(vio, bio); } -/**********************************************************************/ +/*----------------------------------------------------------------*/ + +/* + * Completes a bio relating to a vio, causing the vio completion callback + * to be invoked. + * + * This is used as the bi_end_io function for most of the bios created within + * VDO and submitted to the storage device. Exceptions are the flush code and + * the read-block code, both of which need to do work after the I/O completes. + */ void vdo_complete_async_bio(struct bio *bio) { struct vio *vio = (struct vio *) bio->bi_private; + vdo_count_completed_bios(bio); continue_vio(vio, vdo_get_bio_result(bio)); } -/** - * Set bio properties for a VDO read or write. - * - * @param bio The bio to reset - * @param vio The vio to which the bio belongs (may be NULL) - * @param callback The callback the bio should call when IO finishes - * @param bi_opf The operation and flags for the bio - * @param pbn The physical block number to write to - **/ -static void vdo_set_bio_properties(struct bio *bio, - struct vio *vio, - bio_end_io_t callback, - unsigned int bi_opf, - physical_block_number_t pbn) +/* + * Set bio properties for a VDO read or write. The vio associated with the bio + * may be NULL. + */ +void vdo_set_bio_properties(struct bio *bio, + struct vio *vio, + bio_end_io_t callback, + unsigned int bi_opf, + physical_block_number_t pbn) { bio->bi_private = vio; bio->bi_end_io = callback; bio->bi_opf = bi_opf; - if ((vio != NULL) && (pbn != GEOMETRY_BLOCK_LOCATION)) { - pbn -= vio->vdo->geometry.bio_offset; + if ((vio != NULL) && (pbn != VDO_GEOMETRY_BLOCK_LOCATION)) { + pbn -= vdo_from_vio(vio)->geometry.bio_offset; } - bio->bi_iter.bi_sector = block_to_sector(pbn); + bio->bi_iter.bi_sector = pbn * VDO_SECTORS_PER_BLOCK; } -/**********************************************************************/ +/* + * Prepares the bio to perform IO with the specified buffer. + * May only be used on a VDO-allocated bio, as it assumes the bio + * wraps a 4k buffer that is 4k aligned, but there does not have + * to be a vio associated with the bio. 
+ */ int vdo_reset_bio_with_buffer(struct bio *bio, char *data, struct vio *vio, @@ -176,99 +168,86 @@ int vdo_reset_bio_with_buffer(struct bio *bio, unsigned int bi_opf, physical_block_number_t pbn) { - int bvec_count, result; -#if LINUX_VERSION_CODE >= KERNEL_VERSION(5,1,0) - struct page *page; - int bytes_added; + int bvec_count, result, offset, len, i; + unsigned short blocks; + + if (vio == NULL) { + blocks = 1; + } else if (vio->type == VIO_TYPE_DATA) { + result = ASSERT((vio->block_count == 1), + "Data vios may not span multiple blocks"); + if (result != VDO_SUCCESS) { + return result; + } + + blocks = 1; + } else { + blocks = vio->block_count; + } + +#ifdef RHEL_RELEASE_CODE +#define USE_ALTERNATE (RHEL_RELEASE_CODE < RHEL_RELEASE_VERSION(9,1)) #else - int len = VDO_BLOCK_SIZE; - int offset = offset_in_page(data); - unsigned int i; -#endif // >= 5.1.0 +#define USE_ALTERNATE (LINUX_VERSION_CODE < KERNEL_VERSION(5,18,0)) +#endif - bio_reset(bio); // Memsets most of the bio to reset most fields. +#if USE_ALTERNATE + bio_reset(bio); +#else + bio_reset(bio, bio->bi_bdev, bi_opf); +#endif vdo_set_bio_properties(bio, vio, callback, bi_opf, pbn); if (data == NULL) { return VDO_SUCCESS; } - // Make sure we use our own inlined iovecs. bio->bi_io_vec = bio->bi_inline_vecs; - bio->bi_max_vecs = INLINE_BVEC_COUNT; - - bvec_count = (offset_in_page(data) + VDO_BLOCK_SIZE + - PAGE_SIZE - 1) >> PAGE_SHIFT; - result = ASSERT(bvec_count <= INLINE_BVEC_COUNT, - "VDO-allocated buffers lie on max %d pages, not %d", - INLINE_BVEC_COUNT, bvec_count); - if (result != UDS_SUCCESS) { - return result; - } - - -#if LINUX_VERSION_CODE >= KERNEL_VERSION(5,1,0) - // bio_add_page() can take any contiguous buffer on any number of - // pages and add it in one shot. - page = is_vmalloc_addr(data) ? vmalloc_to_page(data) : - virt_to_page(data); - bytes_added = bio_add_page(bio, page, VDO_BLOCK_SIZE, - offset_in_page(data)); - - if (bytes_added != VDO_BLOCK_SIZE) { - return uds_log_error_strerror(VDO_BIO_CREATION_FAILED, - "Could only add %i bytes to bio", - bytes_added); - } -#else - // On pre-5.1 kernels, we have to add one page at a time to the bio. + bio->bi_max_vecs = blocks + 1; + len = VDO_BLOCK_SIZE * blocks; + offset = offset_in_page(data); + bvec_count = DIV_ROUND_UP(offset + len, PAGE_SIZE); + + /* + * If we knew that data was always on one page, or contiguous pages, + * we wouldn't need the loop. But if we're using vmalloc, it's not + * impossible that the data is in different pages that can't be + * merged in bio_add_page... + */ for (i = 0; (i < bvec_count) && (len > 0); i++) { - unsigned int bytes = PAGE_SIZE - offset; struct page *page; int bytes_added; + int bytes = PAGE_SIZE - offset; if (bytes > len) { bytes = len; } - page = is_vmalloc_addr(data) ? vmalloc_to_page(data) : - virt_to_page(data); + page = is_vmalloc_addr(data) ? vmalloc_to_page(data) + : virt_to_page(data); bytes_added = bio_add_page(bio, page, bytes, offset); if (bytes_added != bytes) { return uds_log_error_strerror(VDO_BIO_CREATION_FAILED, "Could only add %i bytes to bio", - bytes_added); + bytes_added); } data += bytes; len -= bytes; offset = 0; } -#endif // >= 5.1.0 - return VDO_SUCCESS; -} -/**********************************************************************/ -void vdo_reset_bio_with_user_bio(struct bio *bio, - struct bio *user_bio, - struct vio *vio, - bio_end_io_t callback, - unsigned int bi_opf, - physical_block_number_t pbn) -{ - // Use __bio_clone_fast() to copy over the original bio iovec - // information and opflags. 
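The loop above carves an arbitrarily aligned buffer into page-sized pieces, one bio_vec per piece, which is also why vdo_create_multi_block_bio() reserves blocks + 1 inline vectors. Below is a small userspace model of just that arithmetic; MODEL_PAGE_SIZE, show_segments, and the sample offsets are invented for the sketch.

#include <stdio.h>

#define MODEL_PAGE_SIZE 4096
#define MODEL_BLOCK_SIZE 4096

#define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))

/*
 * Print the page-sized segments that the loop in
 * vdo_reset_bio_with_buffer() would hand to bio_add_page() for a
 * buffer starting at the given offset within its first page.
 */
static void show_segments(unsigned int offset, unsigned int blocks)
{
	unsigned int len = blocks * MODEL_BLOCK_SIZE;
	unsigned int bvec_count = DIV_ROUND_UP(offset + len, MODEL_PAGE_SIZE);
	unsigned int i;

	printf("offset=%u len=%u -> %u bio_vec(s):", offset, len, bvec_count);
	for (i = 0; (i < bvec_count) && (len > 0); i++) {
		unsigned int bytes = MODEL_PAGE_SIZE - offset;

		if (bytes > len) {
			bytes = len;
		}
		printf(" [%u bytes @ page offset %u]", bytes, offset);
		len -= bytes;
		offset = 0;	/* later pages are used from their start */
	}
	printf("\n");
}

int main(void)
{
	show_segments(0, 1);	/* aligned single block: one full page */
	show_segments(512, 2);	/* misaligned two-block buffer: three segments */
	return 0;
}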
- bio_reset(bio); - __bio_clone_fast(bio, user_bio); - vdo_set_bio_properties(bio, vio, callback, bi_opf, pbn); + return VDO_SUCCESS; } -/**********************************************************************/ -int vdo_create_bio(struct bio **bio_ptr) +int vdo_create_multi_block_bio(block_count_t size, struct bio **bio_ptr) { struct bio *bio = NULL; - int result = UDS_ALLOCATE_EXTENDED(struct bio, INLINE_BVEC_COUNT, - struct bio_vec, "bio", &bio); + int result = UDS_ALLOCATE_EXTENDED(struct bio, + size + 1, + struct bio_vec, + "bio", + &bio); if (result != VDO_SUCCESS) { return result; } diff --git a/vdo/bio.h b/vdo/bio.h index 8f471e76..83b1bf67 100644 --- a/vdo/bio.h +++ b/vdo/bio.h @@ -1,22 +1,6 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ /* * Copyright Red Hat - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/sulfur/src/c++/vdo/kernel/bio.h#11 $ */ #ifndef BIO_H @@ -25,98 +9,42 @@ #include #include -#include "kernelTypes.h" +#include "kernel-types.h" -/** - * Copy the bio data to a char array. - * - * @param bio The bio to copy the data from - * @param data_ptr The local array to copy the data to - **/ void vdo_bio_copy_data_in(struct bio *bio, char *data_ptr); - -/** - * Copy a char array to the bio data. - * - * @param bio The bio to copy the data to - * @param data_ptr The local array to copy the data from - **/ void vdo_bio_copy_data_out(struct bio *bio, char *data_ptr); -/** - * Get the error from the bio. - * - * @param bio The bio - * - * @return the bio's error if any - **/ static inline int vdo_get_bio_result(struct bio *bio) { return blk_status_to_errno(bio->bi_status); } -/** - * Tell the kernel we've completed processing of this bio. - * - * @param bio The bio to complete - * @param error A system error code, or 0 for success - **/ static inline void vdo_complete_bio(struct bio *bio, int error) { bio->bi_status = errno_to_blk_status(error); bio_endio(bio); } -/** - * Frees up a bio structure - * - * @param bio The bio to free - **/ +int vdo_create_multi_block_bio(block_count_t size, struct bio **bio_ptr); + +static inline int vdo_create_bio(struct bio **bio_ptr) +{ + return vdo_create_multi_block_bio(1, bio_ptr); +} + void vdo_free_bio(struct bio *bio); -/** - * Count the statistics for the bios. This is used for calls into VDO and - * for calls out of VDO. - * - * @param bio_stats Statistics structure to update - * @param bio The bio - **/ void vdo_count_bios(struct atomic_bio_stats *bio_stats, struct bio *bio); - -/** - * Does all the appropriate accounting for bio completions - * - * @param bio the bio to count - **/ void vdo_count_completed_bios(struct bio *bio); -/** - * Completes a bio relating to a vio, causing the completion callback to be - * invoked. 
- * - * This is used as the bi_end_io function for most of the bios created within - * VDO and submitted to the storage device. Exceptions are the flush code and - * the read-block code, both of which need to regain control in the kernel - * layer after the I/O is completed. - * - * @param bio The bio to complete - **/ void vdo_complete_async_bio(struct bio *bio); -/** - * Reset a bio wholly, preparing it to perform an IO. May only be used on a - * VDO-allocated bio, as it assumes the bio wraps a 4k buffer that is 4k - * aligned. - * - * @param bio The bio to reset - * @param data The data the bio should wrap - * @param vio The vio to which this bio belongs (may be NULL) - * @param callback The callback the bio should call when IO finishes - * @param bi_opf The operation and flags for the bio - * @param pbn The physical block number to write to - * - * @return VDO_SUCCESS or an error - **/ +void vdo_set_bio_properties(struct bio *bio, + struct vio *vio, + bio_end_io_t callback, + unsigned int bi_opf, + physical_block_number_t pbn); + int vdo_reset_bio_with_buffer(struct bio *bio, char *data, struct vio *vio, @@ -124,31 +52,4 @@ int vdo_reset_bio_with_buffer(struct bio *bio, unsigned int bi_opf, physical_block_number_t pbn); -/** - * Clone a user bio, then edit our copy to fit our needs. - * - * @param bio The bio to reset - * @param user_bio The user bio to clone - * @param vio The vio to which our bio belongs (may be NULL) - * @param callback The callback our bio should call when IO finishes - * @param bi_opf The operation and flags for our bio - * @param pbn The physical block number to write to - **/ -void vdo_reset_bio_with_user_bio(struct bio *bio, - struct bio *user_bio, - struct vio *vio, - bio_end_io_t callback, - unsigned int bi_opf, - physical_block_number_t pbn); - -/** - * Create a new bio structure, which is guaranteed to be able to wrap any - * contiguous buffer for IO. - * - * @param [out] bio_ptr A pointer to hold new bio - * - * @return VDO_SUCCESS or an error - **/ -int vdo_create_bio(struct bio **bio_ptr); - #endif /* BIO_H */ diff --git a/vdo/blockAllocator.c b/vdo/block-allocator.c similarity index 53% rename from vdo/blockAllocator.c rename to vdo/block-allocator.c index 70100482..135fbb71 100644 --- a/vdo/blockAllocator.c +++ b/vdo/block-allocator.c @@ -1,56 +1,41 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * Copyright Red Hat - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. 
- * - * $Id: //eng/vdo-releases/sulfur/src/c++/vdo/base/blockAllocator.c#45 $ */ -#include "blockAllocatorInternals.h" +#include "block-allocator.h" #include "logger.h" -#include "memoryAlloc.h" +#include "memory-alloc.h" #include "permassert.h" -#include "adminState.h" -#include "actionManager.h" +#include "admin-state.h" +#include "action-manager.h" #include "completion.h" +#include "constants.h" #include "heap.h" -#include "numUtils.h" -#include "priorityTable.h" -#include "readOnlyNotifier.h" -#include "refCounts.h" +#include "num-utils.h" +#include "priority-table.h" +#include "read-only-notifier.h" +#include "ref-counts.h" #include "slab.h" -#include "slabDepotInternals.h" -#include "slabIterator.h" -#include "slabJournalEraser.h" -#include "slabJournalInternals.h" -#include "slabScrubber.h" -#include "slabSummary.h" +#include "slab-depot.h" +#include "slab-iterator.h" +#include "slab-journal.h" +#include "slab-scrubber.h" +#include "slab-summary.h" #include "vdo.h" -#include "vdoRecovery.h" +#include "vdo-recovery.h" #include "vio.h" -#include "vioPool.h" +#include "vio-pool.h" + +struct slab_journal_eraser { + struct vdo_completion *parent; + struct dm_kcopyd_client *client; + block_count_t blocks; + struct slab_iterator slabs; +}; -/** - * Assert that a block allocator function was called from the correct thread. - * - * @param thread_id The allocator's thread id - * @param function_name The name of the function - **/ static inline void assert_on_allocator_thread(thread_id_t thread_id, const char *function_name) { @@ -59,16 +44,11 @@ static inline void assert_on_allocator_thread(thread_id_t thread_id, function_name); } -/** - * Get the priority for a slab in the allocator's slab queue. Slabs are - * essentially prioritized by an approximation of the number of free blocks in - * the slab so slabs with lots of free blocks with be opened for allocation - * before slabs that have few free blocks. - * - * @param slab The slab whose queue priority is desired - * - * @return the queue priority of the slab - **/ +/* + * Slabs are essentially prioritized by an approximation of the number of free + * blocks in the slab so slabs with lots of free blocks with be opened for + * allocation before slabs that have few free blocks. + */ static unsigned int calculate_slab_priority(struct vdo_slab *slab) { block_count_t free_blocks = get_slab_free_block_count(slab); @@ -76,41 +56,39 @@ static unsigned int calculate_slab_priority(struct vdo_slab *slab) slab->allocator->unopened_slab_priority; unsigned int priority; - // Slabs that are completely full must be the only ones with the lowest - // priority: zero. - if (free_blocks == 0) { - return 0; - } - - /* - * Slabs that have never been opened (empty, newly initialized, never - * been written to) have lower priority than previously opened slabs - * that have a signficant number of free blocks. This ranking causes - * VDO to avoid writing physical blocks for the first time until there - * are very few free blocks that have been previously written to. That - * policy makes VDO a better client of any underlying storage that is - * thinly-provisioned [VDOSTORY-123]. - */ - if (is_vdo_slab_journal_blank(slab->journal)) { - return unopened_slab_priority; - } - /* + * Wholly full slabs must be the only ones with lowest priority, 0. + * + * Slabs that have never been opened (empty, newly initialized, and + * never been written to) have lower priority than previously opened + * slabs that have a significant number of free blocks. 
This ranking + * causes VDO to avoid writing physical blocks for the first time + * unless there are very few free blocks that have been previously + * written to. + * + * Since VDO doesn't discard blocks currently, reusing previously + * written blocks makes VDO a better client of any underlying storage + * that is thinly-provisioned (though discarding would be better). + * * For all other slabs, the priority is derived from the logarithm of * the number of free blocks. Slabs with the same order of magnitude of * free blocks have the same priority. With 2^23 blocks, the priority * will range from 1 to 25. The reserved unopened_slab_priority divides * the range and is skipped by the logarithmic mapping. */ - priority = (1 + log_base_two(free_blocks)); + + if (free_blocks == 0) { + return 0; + } + + if (vdo_is_slab_journal_blank(slab->journal)) { + return unopened_slab_priority; + } + + priority = (1 + ilog2(free_blocks)); return ((priority < unopened_slab_priority) ? priority : priority + 1); } -/** - * Add a slab to the priority queue of slabs available for allocation. - * - * @param slab The slab to prioritize - **/ static void prioritize_slab(struct vdo_slab *slab) { ASSERT_LOG_ONLY(list_empty(&slab->allocq_entry), @@ -121,21 +99,13 @@ static void prioritize_slab(struct vdo_slab *slab) &slab->allocq_entry); } -/**********************************************************************/ -void register_vdo_slab_with_allocator(struct block_allocator *allocator, +void vdo_register_slab_with_allocator(struct block_allocator *allocator, struct vdo_slab *slab) { allocator->slab_count++; allocator->last_slab = slab->slab_number; } -/** - * Get an iterator over all the slabs in the allocator. - * - * @param allocator The allocator - * - * @return An iterator over the allocator's slabs - **/ static struct slab_iterator get_slab_iterator(const struct block_allocator *allocator) { @@ -145,38 +115,32 @@ get_slab_iterator(const struct block_allocator *allocator) allocator->depot->zone_count); } -/** - * Notify a block allocator that the VDO has entered read-only mode. - * +/* * Implements vdo_read_only_notification. - * - * @param listener The block allocator - * @param parent The completion to notify in order to acknowledge the - * notification - **/ + */ static void notify_block_allocator_of_read_only_mode(void *listener, struct vdo_completion *parent) { struct block_allocator *allocator = listener; struct slab_iterator iterator; + assert_on_allocator_thread(allocator->thread_id, __func__); iterator = get_slab_iterator(allocator); while (vdo_has_next_slab(&iterator)) { struct vdo_slab *slab = vdo_next_slab(&iterator); - abort_vdo_slab_journal_waiters(slab->journal); + + vdo_abort_slab_journal_waiters(slab->journal); } - complete_vdo_completion(parent); + vdo_complete_completion(parent); } -/** - * Construct allocator metadata vios. - * +/* * Implements vio_constructor - **/ + */ static int __must_check -make_vdo_block_allocator_pool_vios(struct vdo *vdo, +vdo_make_block_allocator_pool_vios(struct vdo *vdo, void *parent, void *buffer, struct vio **vio_ptr) @@ -189,30 +153,18 @@ make_vdo_block_allocator_pool_vios(struct vdo *vdo, vio_ptr); } -/** - * Allocate those component of the block allocator which are needed only at - * load time, not at format time. 
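Putting the rules from calculate_slab_priority() together: full slabs get priority 0, never-written slabs get the reserved unopened priority, and every other slab lands at 1 + ilog2(free blocks), with priorities at or above the reserved value shifted up by one so the reserved slot is skipped. A standalone model of that mapping follows; model_ilog2, model_priority, and the example unopened priority of 23 are invented for this sketch (23 is what 1 + ilog2 of three quarters of 2^23 data blocks works out to).

#include <stdio.h>

/* Userspace stand-in for the kernel's ilog2() on a nonzero value. */
static unsigned int model_ilog2(unsigned long long n)
{
	unsigned int log = 0;

	while (n > 1) {
		n >>= 1;
		log++;
	}
	return log;
}

/*
 * Same mapping as calculate_slab_priority(), with the slab state passed
 * in explicitly instead of read from struct vdo_slab.
 */
static unsigned int model_priority(unsigned long long free_blocks,
				   int journal_is_blank,
				   unsigned int unopened_slab_priority)
{
	unsigned int priority;

	if (free_blocks == 0) {
		return 0;	/* full slabs always sort last */
	}

	if (journal_is_blank) {
		return unopened_slab_priority;	/* never-written slabs */
	}

	priority = 1 + model_ilog2(free_blocks);
	return (priority < unopened_slab_priority) ? priority : priority + 1;
}

int main(void)
{
	unsigned int unopened = 23;	/* example reserved priority */

	printf("%u\n", model_priority(0, 0, unopened));	/* 0 */
	printf("%u\n", model_priority(100, 0, unopened));	/* 1 + 6 = 7 */
	printf("%u\n", model_priority(1 << 22, 1, unopened));	/* 23: unopened */
	printf("%u\n", model_priority(1 << 22, 0, unopened));	/* 24: 23 is skipped */
	return 0;
}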
- * - * @param allocator The allocator - * @param vdo The VDO - * @param vio_pool_size The vio pool size - * - * @return VDO_SUCCESS or an error - **/ static int allocate_components(struct block_allocator *allocator, struct vdo *vdo, block_count_t vio_pool_size) { struct slab_depot *depot = allocator->depot; - // The number of data blocks is the maximum number of free blocks that - // could be used in calculate_slab_priority(). block_count_t slab_journal_size = depot->slab_config.slab_journal_blocks; block_count_t max_free_blocks = depot->slab_config.data_blocks; - unsigned int max_priority = (2 + log_base_two(max_free_blocks)); + unsigned int max_priority = (2 + ilog2(max_free_blocks)); int result; - result = register_vdo_read_only_listener(allocator->read_only_notifier, + result = vdo_register_read_only_listener(allocator->read_only_notifier, allocator, notify_block_allocator_of_read_only_mode, allocator->thread_id); @@ -220,22 +172,23 @@ static int allocate_components(struct block_allocator *allocator, return result; } - initialize_vdo_completion(&allocator->completion, vdo, + vdo_initialize_completion(&allocator->completion, vdo, VDO_BLOCK_ALLOCATOR_COMPLETION); allocator->summary = - get_vdo_slab_summary_for_zone(depot, allocator->zone_number); + vdo_get_slab_summary_for_zone(depot->slab_summary, + allocator->zone_number); result = make_vio_pool(vdo, vio_pool_size, allocator->thread_id, - make_vdo_block_allocator_pool_vios, + vdo_make_block_allocator_pool_vios, NULL, &allocator->vio_pool); if (result != VDO_SUCCESS) { return result; } - result = make_vdo_slab_scrubber(vdo, + result = vdo_make_slab_scrubber(vdo, slab_journal_size, allocator->read_only_notifier, &allocator->slab_scrubber); @@ -250,31 +203,29 @@ static int allocate_components(struct block_allocator *allocator, } /* - * VDOSTORY-123 requires that we try to open slabs that already have - * allocated blocks in preference to slabs that have never been opened. - * For reasons we have not been able to fully understand, performance - * tests on SSD harvards have been very sensitive (50% reduction in - * test throughput) to very slight differences in the timing and - * locality of block allocation. Assigning a low priority to unopened - * slabs (max_priority/2, say) would be ideal for the story, but - * anything less than a very high threshold (max_priority - 1) hurts - * PMI results. + * Performing well atop thin provisioned storage requires either that + * VDO discards freed blocks, or that the block allocator try to use + * slabs that already have allocated blocks in preference to slabs that + * have never been opened. For reasons we have not been able to fully + * understand, some SSD machines have been have been very sensitive + * (50% reduction in test throughput) to very slight differences in the + * timing and locality of block allocation. Assigning a low priority + * to unopened slabs (max_priority/2, say) would be ideal for the + * story, but anything less than a very high threshold (max_priority - + * 1) hurts on these machines. * * This sets the free block threshold for preferring to open an * unopened slab to the binary floor of 3/4ths the total number of - * datablocks in a slab, which will generally evaluate to about half - * the slab size, but avoids degenerate behavior in unit tests where - * the number of data blocks is artificially constrained to a power of - * two. + * data blocks in a slab, which will generally evaluate to about half + * the slab size. 
*/ allocator->unopened_slab_priority = - (1 + log_base_two((max_free_blocks * 3) / 4)); + (1 + ilog2((max_free_blocks * 3) / 4)); return VDO_SUCCESS; } -/**********************************************************************/ -int make_vdo_block_allocator(struct slab_depot *depot, +int vdo_make_block_allocator(struct slab_depot *depot, zone_count_t zone_number, thread_id_t thread_id, nonce_t nonce, @@ -285,6 +236,7 @@ int make_vdo_block_allocator(struct slab_depot *depot, { struct block_allocator *allocator; int result = UDS_ALLOCATE(1, struct block_allocator, __func__, &allocator); + if (result != VDO_SUCCESS) { return result; } @@ -295,12 +247,12 @@ int make_vdo_block_allocator(struct slab_depot *depot, allocator->nonce = nonce; allocator->read_only_notifier = read_only_notifier; INIT_LIST_HEAD(&allocator->dirty_slab_journals); - set_vdo_admin_state_code(&allocator->state, + vdo_set_admin_state_code(&allocator->state, VDO_ADMIN_STATE_NORMAL_OPERATION); result = allocate_components(allocator, vdo, vio_pool_size); if (result != VDO_SUCCESS) { - free_vdo_block_allocator(allocator); + vdo_free_block_allocator(allocator); return result; } @@ -308,129 +260,102 @@ int make_vdo_block_allocator(struct slab_depot *depot, return VDO_SUCCESS; } -/**********************************************************************/ -void free_vdo_block_allocator(struct block_allocator *allocator) +void vdo_free_block_allocator(struct block_allocator *allocator) { if (allocator == NULL) { return; } - free_vdo_slab_scrubber(UDS_FORGET(allocator->slab_scrubber)); + if (allocator->eraser != NULL) { + dm_kcopyd_client_destroy(UDS_FORGET(allocator->eraser)); + } + + vdo_free_slab_scrubber(UDS_FORGET(allocator->slab_scrubber)); free_vio_pool(UDS_FORGET(allocator->vio_pool)); free_priority_table(UDS_FORGET(allocator->prioritized_slabs)); UDS_FREE(allocator); } - -/** - * Get the maximum number of data blocks that can be allocated. - * - * @param allocator The block allocator to query - * - * @return The number of data blocks that can be allocated - **/ -static inline block_count_t __must_check -get_data_block_count(const struct block_allocator *allocator) -{ - return (allocator->slab_count * - allocator->depot->slab_config.data_blocks); -} - -/**********************************************************************/ -block_count_t get_vdo_allocated_blocks(const struct block_allocator *allocator) -{ - return READ_ONCE(allocator->allocated_blocks); -} - -/**********************************************************************/ -block_count_t -get_vdo_unrecovered_slab_count(const struct block_allocator *allocator) -{ - return get_scrubber_vdo_slab_count(allocator->slab_scrubber); -} - -/**********************************************************************/ -void queue_vdo_slab(struct vdo_slab *slab) +/* + * Queue a slab for allocation or scrubbing. 
+ */ +void vdo_queue_slab(struct vdo_slab *slab) { struct block_allocator *allocator = slab->allocator; block_count_t free_blocks; int result; + ASSERT_LOG_ONLY(list_empty(&slab->allocq_entry), "a requeued slab must not already be on a ring"); free_blocks = get_slab_free_block_count(slab); result = ASSERT((free_blocks <= allocator->depot->slab_config.data_blocks), - "rebuilt slab %u must have a valid free block count (has %llu, expected maximum %llu)", - slab->slab_number, - (unsigned long long) free_blocks, - (unsigned long long) allocator->depot->slab_config.data_blocks); + "rebuilt slab %u must have a valid free block count (has %llu, expected maximum %llu)", + slab->slab_number, + (unsigned long long) free_blocks, + (unsigned long long) allocator->depot->slab_config.data_blocks); if (result != VDO_SUCCESS) { vdo_enter_read_only_mode(allocator->read_only_notifier, result); return; } - if (is_unrecovered_vdo_slab(slab)) { + if (vdo_is_unrecovered_slab(slab)) { vdo_register_slab_for_scrubbing(allocator->slab_scrubber, slab, false); return; } - if (!is_vdo_slab_resuming(slab)) { - // If the slab is resuming, we've already accounted for it - // here, so don't do it again. + if (!vdo_is_slab_resuming(slab)) { + /* + * If the slab is resuming, we've already accounted for it + * here, so don't do it again. + * FIXME: under what situation would the slab be resuming here? + */ WRITE_ONCE(allocator->allocated_blocks, allocator->allocated_blocks - free_blocks); - if (!is_vdo_slab_journal_blank(slab->journal)) { + if (!vdo_is_slab_journal_blank(slab->journal)) { WRITE_ONCE(allocator->statistics.slabs_opened, allocator->statistics.slabs_opened + 1); } } - // All slabs are kept in a priority queue for allocation. + vdo_resume_slab_journal(slab->journal); prioritize_slab(slab); } -/**********************************************************************/ -void adjust_vdo_free_block_count(struct vdo_slab *slab, bool increment) +/* + * Adjust the free block count and (if needed) reprioritize the slab. + * @increment should be true if the free block count went up. + */ +void vdo_adjust_free_block_count(struct vdo_slab *slab, bool increment) { struct block_allocator *allocator = slab->allocator; - // The sense of increment is reversed since allocations are being - // counted. + WRITE_ONCE(allocator->allocated_blocks, allocator->allocated_blocks + (increment ? -1 : 1)); - // The open slab doesn't need to be reprioritized until it is closed. + /* The open slab doesn't need to be reprioritized until it is closed. */ if (slab == allocator->open_slab) { return; } - // The slab priority rarely changes; if no change, then don't requeue - // it. + /* + * Don't bother adjusting the priority table if unneeded. + */ if (slab->priority == calculate_slab_priority(slab)) { return; } - // Reprioritize the slab to reflect the new free block count by - // removing it from the table and re-enqueuing it with the new - // priority. + /* + * Reprioritize the slab to reflect the new free block count by + * removing it from the table and re-enqueuing it with the new + * priority. + */ priority_table_remove(allocator->prioritized_slabs, &slab->allocq_entry); prioritize_slab(slab); } -/** - * Allocate the next free physical block in a slab. - * - * The block allocated will have a provisional reference and the - * reference must be either confirmed with a subsequent increment - * or vacated with a subsequent decrement of the reference count. 
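/*
 * A minimal usage sketch (the caller here is hypothetical):
 * allocate_slab_block() below reports each allocation with increment ==
 * false, while a path that returns a data block to a slab reports the free
 * count going up.
 */
static void example_return_block_to_slab(struct vdo_slab *slab)
{
	/* The block's reference count has just dropped back to zero. */
	vdo_adjust_free_block_count(slab, true);
}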
- * - * @param [in] slab The slab - * @param [out] block_number_ptr A pointer to receive the allocated block - * number - * - * @return UDS_SUCCESS or an error code - **/ static int allocate_slab_block(struct vdo_slab *slab, physical_block_number_t *block_number_ptr) { @@ -441,48 +366,58 @@ static int allocate_slab_block(struct vdo_slab *slab, return result; } - adjust_vdo_free_block_count(slab, false); + vdo_adjust_free_block_count(slab, false); *block_number_ptr = pbn; return VDO_SUCCESS; } -/**********************************************************************/ -int allocate_vdo_block(struct block_allocator *allocator, +/* + * The block allocated will have a provisional reference and the reference + * must be either confirmed with a subsequent increment or vacated with a + * subsequent decrement via vdo_release_block_reference(). + */ +int vdo_allocate_block(struct block_allocator *allocator, physical_block_number_t *block_number_ptr) { if (allocator->open_slab != NULL) { - // Try to allocate the next block in the currently open slab. + /* Try to allocate the next block in the currently open slab. */ int result = allocate_slab_block(allocator->open_slab, block_number_ptr); if ((result == VDO_SUCCESS) || (result != VDO_NO_SPACE)) { return result; } - // Put the exhausted open slab back into the priority table. + /* Put the exhausted open slab back into the priority table. */ prioritize_slab(allocator->open_slab); } - // Remove the highest priority slab from the priority table and make it - // the open slab. + /* + * Remove the highest priority slab from the priority table and make it + * the open slab. + */ allocator->open_slab = vdo_slab_from_list_entry(priority_table_dequeue(allocator->prioritized_slabs)); - open_vdo_slab(allocator->open_slab); + vdo_open_slab(allocator->open_slab); - // Try allocating again. If we're out of space immediately after - // opening a slab, then every slab must be fully allocated. + /* + * Try allocating again. If we're out of space immediately after + * opening a slab, then every slab must be fully allocated. + */ return allocate_slab_block(allocator->open_slab, block_number_ptr); } -/**********************************************************************/ -void release_vdo_block_reference(struct block_allocator *allocator, +/* + * Release an unused provisional reference. + */ +void vdo_release_block_reference(struct block_allocator *allocator, physical_block_number_t pbn, const char *why) { struct vdo_slab *slab; int result; struct reference_operation operation = { - .type = DATA_DECREMENT, + .type = VDO_JOURNAL_DATA_DECREMENT, .pbn = pbn, }; @@ -490,8 +425,8 @@ void release_vdo_block_reference(struct block_allocator *allocator, return; } - slab = get_vdo_slab(allocator->depot, pbn); - result = modify_vdo_slab_reference_count(slab, NULL, operation); + slab = vdo_get_slab(allocator->depot, pbn); + result = vdo_modify_slab_reference_count(slab, NULL, operation); if (result != VDO_SUCCESS) { uds_log_error_strerror(result, "Failed to release reference to %s physical block %llu", @@ -500,7 +435,7 @@ void release_vdo_block_reference(struct block_allocator *allocator, } } -/** +/* * This is a heap_comparator function that orders slab_status * structures using the 'is_clean' field as the primary key and the * 'emptiness' field as the secondary key. 
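/*
 * A minimal usage sketch (the function and its error path are hypothetical):
 * a successful vdo_allocate_block() leaves a provisional reference which the
 * caller must either confirm with an increment or give back as shown.
 */
static int example_allocate_then_abandon(struct block_allocator *allocator)
{
	physical_block_number_t pbn;
	int result = vdo_allocate_block(allocator, &pbn);

	if (result != VDO_SUCCESS) {
		return result;
	}

	/* Something goes wrong before the block is used. */
	vdo_release_block_reference(allocator, pbn, "abandoned example block");
	return VDO_SUCCESS;
}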
@@ -517,7 +452,7 @@ void release_vdo_block_reference,
 * @return 1 if the first item is cleaner or emptier than the second;
 *         0 if the two items are equally clean and empty; -1 otherwise
- **/
+ */
static int compare_slab_statuses(const void *item1, const void *item2)
{
	const struct slab_status *info1 = (const struct slab_status *) item1;
@@ -529,12 +464,12 @@ static int compare_slab_statuses(const void *item1, const void *item2)
	if (info1->emptiness != info2->emptiness) {
		return ((info1->emptiness > info2->emptiness) ? 1 : -1);
	}
-	return ((info1->slab_number < info2->slab_number) ? 1 : -1);
+	return (info1->slab_number < info2->slab_number) ? 1 : -1;
}

-/**
- * Swap two slab_status structures. Implements heap_swapper.
- **/
+/*
+ * Implements heap_swapper.
+ */
static void swap_slab_statuses(void *item1, void *item2)
{
	struct slab_status *info1 = item1;
@@ -544,76 +479,64 @@ static void swap_slab_statuses(void *item1, void *item2)
	*info2 = temp;
}

-/**
- * Convert a generic vdo_completion to the block_allocator containing it.
- *
- * @param completion The completion to convert
- *
- * @return The block allocator containing the completion
- **/
static struct block_allocator *
as_block_allocator(struct vdo_completion *completion)
{
-	assert_vdo_completion_type(completion->type,
+	vdo_assert_completion_type(completion->type,
				   VDO_BLOCK_ALLOCATOR_COMPLETION);
	return container_of(completion, struct block_allocator, completion);
}

-/**
- * Inform the allocator that a slab action has finished on some slab. This
- * callback is registered in apply_to_slabs().
- *
- * @param completion The allocator completion
- **/
+/*
+ * Inform the slab actor that an action has finished on some slab; used by
+ * apply_to_slabs().
+ */
static void slab_action_callback(struct vdo_completion *completion)
{
	struct block_allocator *allocator = as_block_allocator(completion);
	struct slab_actor *actor = &allocator->slab_actor;
+
	if (--actor->slab_action_count == 0) {
		actor->callback(completion);
		return;
	}

-	reset_vdo_completion(completion);
+	vdo_reset_completion(completion);
}

-/**
- * Preserve the error from part of an administrative action and continue.
- *
- * @param completion The allocator completion
- **/
+/*
+ * Preserve the error from part of an action and continue.
+ */
static void handle_operation_error(struct vdo_completion *completion)
{
	struct block_allocator *allocator = as_block_allocator(completion);
-	set_vdo_operation_result(&allocator->state, completion->result);
+
+	vdo_set_operation_result(&allocator->state, completion->result);
	completion->callback(completion);
}

-/**
- * Perform an administrative action on each of an allocator's slabs in
- * parallel.
- *
- * @param allocator The allocator
- * @param callback The method to call when the action is complete on every
- *                 slab
- **/
+/*
+ * Perform an action on each of an allocator's slabs in parallel.
+ */
static void apply_to_slabs(struct block_allocator *allocator,
			   vdo_action *callback)
{
	struct slab_iterator iterator;

-	prepare_vdo_completion(&allocator->completion,
+	vdo_prepare_completion(&allocator->completion,
			       slab_action_callback,
			       handle_operation_error,
			       allocator->thread_id,
			       NULL);
	allocator->completion.requeue = false;

-	// Since we are going to dequeue all of the slabs, the open slab will
-	// become invalid, so clear it.
+	/*
+	 * Since we are going to dequeue all of the slabs, the open slab will
+	 * become invalid, so clear it.
+ */ allocator->open_slab = NULL; - // Ensure that we don't finish before we're done starting. + /* Ensure that we don't finish before we're done starting. */ allocator->slab_actor = (struct slab_actor) { .slab_action_count = 1, .callback = callback, @@ -622,94 +545,145 @@ static void apply_to_slabs(struct block_allocator *allocator, iterator = get_slab_iterator(allocator); while (vdo_has_next_slab(&iterator)) { const struct admin_state_code *operation = - get_vdo_admin_state_code(&allocator->state); + vdo_get_admin_state_code(&allocator->state); struct vdo_slab *slab = vdo_next_slab(&iterator); list_del_init(&slab->allocq_entry); allocator->slab_actor.slab_action_count++; - start_vdo_slab_action(slab, operation, &allocator->completion); + vdo_start_slab_action(slab, operation, &allocator->completion); } slab_action_callback(&allocator->completion); } -/** - * Inform the allocator that all load I/O has finished. - * - * @param completion The allocator completion - **/ static void finish_loading_allocator(struct vdo_completion *completion) { struct block_allocator *allocator = as_block_allocator(completion); const struct admin_state_code *operation = - get_vdo_admin_state_code(&allocator->state); + vdo_get_admin_state_code(&allocator->state); + + if (allocator->eraser != NULL) { + dm_kcopyd_client_destroy(UDS_FORGET(allocator->eraser)); + } if (operation == VDO_ADMIN_STATE_LOADING_FOR_RECOVERY) { void *context = - get_current_vdo_action_context(allocator->depot->action_manager); + vdo_get_current_action_context(allocator->depot->action_manager); vdo_replay_into_slab_journals(allocator, completion, context); return; } - finish_vdo_loading(&allocator->state); + vdo_finish_loading(&allocator->state); +} + +static void erase_next_slab_journal(struct block_allocator *allocator); + +static void copy_callback(int read_err, unsigned long write_err, void *context) +{ + struct block_allocator *allocator = context; + int result = (((read_err == 0) && (write_err == 0)) + ? VDO_SUCCESS : -EIO); + + if (result != VDO_SUCCESS) { + vdo_finish_completion(&allocator->completion, result); + return; + } + + erase_next_slab_journal(allocator); } /** - * Initiate a load. - * + * erase_next_slab_journal() - Erase the next slab journal. + */ +static void erase_next_slab_journal(struct block_allocator *allocator) +{ + struct vdo_slab *slab; + physical_block_number_t pbn; + struct dm_io_region regions[1]; + struct slab_depot *depot = allocator->depot; + block_count_t blocks = depot->slab_config.slab_journal_blocks; + + if (!vdo_has_next_slab(&allocator->slabs_to_erase)) { + vdo_finish_completion(&allocator->completion, VDO_SUCCESS); + return; + } + + slab = vdo_next_slab(&allocator->slabs_to_erase); + pbn = slab->journal_origin - depot->vdo->geometry.bio_offset; + regions[0] = (struct dm_io_region) { + .bdev = vdo_get_backing_device(depot->vdo), + .sector = pbn * VDO_SECTORS_PER_BLOCK, + .count = blocks * VDO_SECTORS_PER_BLOCK, + }; + dm_kcopyd_zero(allocator->eraser, 1, regions, 0, copy_callback, + allocator); +} + +/* * Implements vdo_admin_initiator. 
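/*
 * Illustrative arithmetic, assuming 4096-byte blocks (VDO_SECTORS_PER_BLOCK
 * == 8) and a slab journal of, say, 224 blocks: a journal whose origin is
 * physical block 1000 with a zero bio_offset becomes the single region
 * { .sector = 8000, .count = 1792 } handed to dm_kcopyd_zero() above.
 */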
- **/ + */ static void initiate_load(struct admin_state *state) { struct block_allocator *allocator = container_of(state, struct block_allocator, state); - const struct admin_state_code *operation = get_vdo_admin_state_code(state); + const struct admin_state_code *operation = vdo_get_admin_state_code(state); if (operation == VDO_ADMIN_STATE_LOADING_FOR_REBUILD) { - prepare_vdo_completion(&allocator->completion, - finish_loading_allocator, - handle_operation_error, - allocator->thread_id, - NULL); - erase_vdo_slab_journals(allocator->depot, - get_slab_iterator(allocator), - &allocator->completion); + /* + * Must requeue because the kcopyd client cannot be freed in + * the same stack frame as the kcopyd callback, lest it + * deadlock. + */ + vdo_prepare_completion_for_requeue(&allocator->completion, + finish_loading_allocator, + handle_operation_error, + allocator->thread_id, + NULL); + allocator->eraser = dm_kcopyd_client_create(NULL); + if (allocator->eraser == NULL) { + vdo_finish_completion(&allocator->completion, -ENOMEM); + return; + } + allocator->slabs_to_erase = get_slab_iterator(allocator); + + erase_next_slab_journal(allocator); return; } apply_to_slabs(allocator, finish_loading_allocator); } -/**********************************************************************/ -void load_vdo_block_allocator(void *context, +/* + * Implements vdo_zone_action. + */ +void vdo_load_block_allocator(void *context, zone_count_t zone_number, struct vdo_completion *parent) { struct block_allocator *allocator = vdo_get_block_allocator_for_zone(context, zone_number); - start_vdo_loading( + vdo_start_loading( &allocator->state, - get_current_vdo_manager_operation(allocator->depot->action_manager), + vdo_get_current_manager_operation(allocator->depot->action_manager), parent, initiate_load); } -/**********************************************************************/ -void notify_vdo_slab_journals_are_recovered(struct block_allocator *allocator, +/* + * Inform a block allocator that its slab journals have been recovered from the + * recovery journal. + */ +void vdo_notify_slab_journals_are_recovered(struct block_allocator *allocator, int result) { - finish_vdo_loading_with_result(&allocator->state, result); + vdo_finish_loading_with_result(&allocator->state, result); } -/** +/* * Prepare slabs for allocation or scrubbing. - * - * @param allocator The allocator to prepare - * - * @return VDO_SUCCESS or an error code - **/ + */ static int __must_check -prepare_vdo_slabs_for_allocation(struct block_allocator *allocator) +vdo_prepare_slabs_for_allocation(struct block_allocator *allocator) { struct slab_status current_slab_status; struct heap heap; @@ -718,8 +692,9 @@ prepare_vdo_slabs_for_allocation(struct block_allocator *allocator) struct slab_depot *depot = allocator->depot; slab_count_t slab_count = depot->slab_count; - WRITE_ONCE(allocator->allocated_blocks, - get_data_block_count(allocator)); + block_count_t allocated_count + = (allocator->slab_count * depot->slab_config.data_blocks); + WRITE_ONCE(allocator->allocated_blocks, allocated_count); result = UDS_ALLOCATE(slab_count, struct slab_status, __func__, &slab_statuses); @@ -730,7 +705,7 @@ prepare_vdo_slabs_for_allocation(struct block_allocator *allocator) vdo_get_summarized_slab_statuses(allocator->summary, slab_count, slab_statuses); - // Sort the slabs by cleanliness, then by emptiness hint. + /* Sort the slabs by cleanliness, then by emptiness hint. 
*/ initialize_heap(&heap, compare_slab_statuses, swap_slab_statuses, @@ -751,11 +726,11 @@ prepare_vdo_slabs_for_allocation(struct block_allocator *allocator) (!vdo_must_load_ref_counts(allocator->summary, slab->slab_number) && current_slab_status.is_clean)) { - queue_vdo_slab(slab); + vdo_queue_slab(slab); continue; } - mark_vdo_slab_unrecovered(slab); + vdo_mark_slab_unrecovered(slab); high_priority = ((current_slab_status.is_clean && (depot->load_type == VDO_SLAB_DEPOT_NORMAL_LOAD)) || vdo_slab_journal_requires_scrubbing(slab->journal)); @@ -768,28 +743,33 @@ prepare_vdo_slabs_for_allocation(struct block_allocator *allocator) return VDO_SUCCESS; } -/**********************************************************************/ -void prepare_vdo_block_allocator_to_allocate(void *context, +/* + * Implements vdo_zone_action. + */ +void vdo_prepare_block_allocator_to_allocate(void *context, zone_count_t zone_number, struct vdo_completion *parent) { struct block_allocator *allocator = vdo_get_block_allocator_for_zone(context, zone_number); - int result = prepare_vdo_slabs_for_allocation(allocator); + int result = vdo_prepare_slabs_for_allocation(allocator); + if (result != VDO_SUCCESS) { - finish_vdo_completion(parent, result); + vdo_finish_completion(parent, result); return; } - scrub_high_priority_vdo_slabs(allocator->slab_scrubber, + vdo_scrub_high_priority_slabs(allocator->slab_scrubber, is_priority_table_empty(allocator->prioritized_slabs), parent, - finish_vdo_completion_parent_callback, - finish_vdo_completion_parent_callback); + vdo_finish_completion_parent_callback, + vdo_finish_completion_parent_callback); } -/**********************************************************************/ -void register_new_vdo_slabs_for_allocator(void *context, +/* + * Implements vdo_zone_action. + */ +void vdo_register_new_slabs_for_allocator(void *context, zone_count_t zone_number, struct vdo_completion *parent) { @@ -797,32 +777,29 @@ void register_new_vdo_slabs_for_allocator(void *context, vdo_get_block_allocator_for_zone(context, zone_number); struct slab_depot *depot = allocator->depot; slab_count_t i; + for (i = depot->slab_count; i < depot->new_slab_count; i++) { struct vdo_slab *slab = depot->new_slabs[i]; + if (slab->allocator == allocator) { - register_vdo_slab_with_allocator(allocator, slab); + vdo_register_slab_with_allocator(allocator, slab); } } - complete_vdo_completion(parent); + vdo_complete_completion(parent); } -/** - * Perform a step in draining the allocator. This method is its own callback. 
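/*
 * An illustrative walk through the preparation above, assuming the heap is
 * popped largest-first: given slab A (clean, emptiness 10), slab B (clean,
 * emptiness 40) and slab C (dirty, emptiness 90), the statuses come out as
 * B, A, then C, since cleanliness is the primary key and emptiness only
 * breaks ties.  The zone starts out counted as fully allocated
 * (slab_count * data_blocks); each slab that is queued goes through
 * vdo_queue_slab(), which subtracts its free blocks, while unrecovered
 * slabs go to the scrubber and stay fully counted for now.
 */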
- * - * @param completion The allocator's completion - **/ static void do_drain_step(struct vdo_completion *completion) { struct block_allocator *allocator = as_block_allocator(completion); - prepare_vdo_completion_for_requeue(&allocator->completion, + vdo_prepare_completion_for_requeue(&allocator->completion, do_drain_step, handle_operation_error, allocator->thread_id, NULL); switch (++allocator->drain_step) { case VDO_DRAIN_ALLOCATOR_STEP_SCRUBBER: - stop_vdo_slab_scrubbing(allocator->slab_scrubber, completion); + vdo_stop_slab_scrubbing(allocator->slab_scrubber, completion); return; case VDO_DRAIN_ALLOCATOR_STEP_SLABS: @@ -830,30 +807,28 @@ static void do_drain_step(struct vdo_completion *completion) return; case VDO_DRAIN_ALLOCATOR_STEP_SUMMARY: - drain_vdo_slab_summary_zone( + vdo_drain_slab_summary_zone( allocator->summary, - get_vdo_admin_state_code(&allocator->state), + vdo_get_admin_state_code(&allocator->state), completion); return; case VDO_DRAIN_ALLOCATOR_STEP_FINISHED: ASSERT_LOG_ONLY(!is_vio_pool_busy(allocator->vio_pool), "vio pool not busy"); - finish_vdo_draining_with_result(&allocator->state, + vdo_finish_draining_with_result(&allocator->state, completion->result); return; default: - finish_vdo_draining_with_result(&allocator->state, + vdo_finish_draining_with_result(&allocator->state, UDS_BAD_STATE); } } -/** - * Initiate a drain. - * +/* * Implements vdo_admin_initiator. - **/ + */ static void initiate_drain(struct admin_state *state) { struct block_allocator *allocator = @@ -862,37 +837,38 @@ static void initiate_drain(struct admin_state *state) do_drain_step(&allocator->completion); } -/**********************************************************************/ -void drain_vdo_block_allocator(void *context, +/* + * Drain all allocator I/O. Depending upon the type of drain, some or all + * dirty metadata may be written to disk. The type of drain will be determined + * from the state of the allocator's depot. + * + * Implements vdo_zone_action. + */ +void vdo_drain_block_allocator(void *context, zone_count_t zone_number, struct vdo_completion *parent) { struct block_allocator *allocator = vdo_get_block_allocator_for_zone(context, zone_number); - start_vdo_draining( + vdo_start_draining( &allocator->state, - get_current_vdo_manager_operation(allocator->depot->action_manager), + vdo_get_current_manager_operation(allocator->depot->action_manager), parent, initiate_drain); } -/** - * Perform a step in resuming a quiescent allocator. This method is its own - * callback. 
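/*
 * An illustrative summary: draining above walks the steps forward (scrubber,
 * then slabs, then summary), while resuming below walks the same enum
 * backward, so the last component quiesced is the first one brought back.
 */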
- * - * @param completion The allocator's completion - **/ static void do_resume_step(struct vdo_completion *completion) { struct block_allocator *allocator = as_block_allocator(completion); - prepare_vdo_completion_for_requeue(&allocator->completion, + + vdo_prepare_completion_for_requeue(&allocator->completion, do_resume_step, handle_operation_error, allocator->thread_id, NULL); switch (--allocator->drain_step) { case VDO_DRAIN_ALLOCATOR_STEP_SUMMARY: - resume_vdo_slab_summary_zone(allocator->summary, completion); + vdo_resume_slab_summary_zone(allocator->summary, completion); return; case VDO_DRAIN_ALLOCATOR_STEP_SLABS: @@ -900,25 +876,23 @@ static void do_resume_step(struct vdo_completion *completion) return; case VDO_DRAIN_ALLOCATOR_STEP_SCRUBBER: - resume_vdo_slab_scrubbing(allocator->slab_scrubber, completion); + vdo_resume_slab_scrubbing(allocator->slab_scrubber, completion); return; case VDO_DRAIN_ALLOCATOR_START: - finish_vdo_resuming_with_result(&allocator->state, + vdo_finish_resuming_with_result(&allocator->state, completion->result); return; default: - finish_vdo_resuming_with_result(&allocator->state, + vdo_finish_resuming_with_result(&allocator->state, UDS_BAD_STATE); } } -/** - * Initiate a resume. - * +/* * Implements vdo_admin_initiator. - **/ + */ static void initiate_resume(struct admin_state *state) { struct block_allocator *allocator = @@ -927,88 +901,75 @@ static void initiate_resume(struct admin_state *state) do_resume_step(&allocator->completion); } -/**********************************************************************/ -void resume_vdo_block_allocator(void *context, +/* + * Implements vdo_zone_action. + */ +void vdo_resume_block_allocator(void *context, zone_count_t zone_number, struct vdo_completion *parent) { struct block_allocator *allocator = vdo_get_block_allocator_for_zone(context, zone_number); - start_vdo_resuming(&allocator->state, - get_current_vdo_manager_operation(allocator->depot->action_manager), + vdo_start_resuming(&allocator->state, + vdo_get_current_manager_operation(allocator->depot->action_manager), parent, initiate_resume); } -/**********************************************************************/ -void release_vdo_tail_block_locks(void *context, +/* + * Request a commit of all dirty tail blocks which are locking the recovery + * journal block the depot is seeking to release. + * + * Implements vdo_zone_action. 
+ */ +void vdo_release_tail_block_locks(void *context, zone_count_t zone_number, struct vdo_completion *parent) { struct block_allocator *allocator = vdo_get_block_allocator_for_zone(context, zone_number); struct list_head *list = &allocator->dirty_slab_journals; + while (!list_empty(list)) { if (!vdo_release_recovery_journal_lock(vdo_slab_journal_from_dirty_entry(list->next), allocator->depot->active_release_request)) { break; } } - complete_vdo_completion(parent); + vdo_complete_completion(parent); } -/**********************************************************************/ -struct slab_summary_zone * -get_vdo_slab_summary_zone(const struct block_allocator *allocator) -{ - return allocator->summary; -} - -/**********************************************************************/ -int acquire_vdo_block_allocator_vio(struct block_allocator *allocator, +int vdo_acquire_block_allocator_vio(struct block_allocator *allocator, struct waiter *waiter) { return acquire_vio_from_pool(allocator->vio_pool, waiter); } -/**********************************************************************/ -void return_vdo_block_allocator_vio(struct block_allocator *allocator, +void vdo_return_block_allocator_vio(struct block_allocator *allocator, struct vio_pool_entry *entry) { return_vio_to_pool(allocator->vio_pool, entry); } -/**********************************************************************/ -void scrub_all_unrecovered_vdo_slabs_in_zone(void *context, +/* + * Implements vdo_zone_action. + */ +void vdo_scrub_all_unrecovered_slabs_in_zone(void *context, zone_count_t zone_number, struct vdo_completion *parent) { struct block_allocator *allocator = vdo_get_block_allocator_for_zone(context, zone_number); - scrub_vdo_slabs(allocator->slab_scrubber, - allocator->depot, - vdo_notify_zone_finished_scrubbing, - noop_vdo_completion_callback); - complete_vdo_completion(parent); -} - -/**********************************************************************/ -int enqueue_for_clean_vdo_slab(struct block_allocator *allocator, - struct waiter *waiter) -{ - return enqueue_clean_vdo_slab_waiter(allocator->slab_scrubber, waiter); -} - -/**********************************************************************/ -void increase_vdo_slab_scrubbing_priority(struct vdo_slab *slab) -{ - vdo_register_slab_for_scrubbing(slab->allocator->slab_scrubber, slab, true); + vdo_scrub_slabs(allocator->slab_scrubber, + allocator->depot, + vdo_notify_zone_finished_scrubbing, + vdo_noop_completion_callback); + vdo_complete_completion(parent); } -/**********************************************************************/ struct block_allocator_statistics -get_vdo_block_allocator_statistics(const struct block_allocator *allocator) +vdo_get_block_allocator_statistics(const struct block_allocator *allocator) { const struct block_allocator_statistics *stats = &allocator->statistics; @@ -1019,9 +980,8 @@ get_vdo_block_allocator_statistics(const struct block_allocator *allocator) }; } -/**********************************************************************/ struct slab_journal_statistics -get_vdo_slab_journal_statistics(const struct block_allocator *allocator) +vdo_get_slab_journal_statistics(const struct block_allocator *allocator) { const struct slab_journal_statistics *stats = &allocator->slab_journal_statistics; @@ -1034,9 +994,8 @@ get_vdo_slab_journal_statistics(const struct block_allocator *allocator) }; } -/**********************************************************************/ struct ref_counts_statistics -get_vdo_ref_counts_statistics(const struct 
block_allocator *allocator) +vdo_get_ref_counts_statistics(const struct block_allocator *allocator) { const struct ref_counts_statistics *stats = &allocator->ref_counts_statistics; @@ -1045,23 +1004,25 @@ get_vdo_ref_counts_statistics(const struct block_allocator *allocator) }; } -/**********************************************************************/ -void dump_vdo_block_allocator(const struct block_allocator *allocator) +void vdo_dump_block_allocator(const struct block_allocator *allocator) { unsigned int pause_counter = 0; struct slab_iterator iterator = get_slab_iterator(allocator); + uds_log_info("block_allocator zone %u", allocator->zone_number); while (vdo_has_next_slab(&iterator)) { - dump_vdo_slab(vdo_next_slab(&iterator)); + vdo_dump_slab(vdo_next_slab(&iterator)); - // Wait for a while after each batch of 32 slabs dumped, - // allowing the kernel log a chance to be flushed instead of - // being overrun. + /* + * Wait for a while after each batch of 32 slabs dumped, an + * arbitrary number, allowing the kernel log a chance to be + * flushed instead of being overrun. + */ if (pause_counter++ == 31) { pause_counter = 0; uds_pause_for_logger(); } } - dump_vdo_slab_scrubber(allocator->slab_scrubber); + vdo_dump_slab_scrubber(allocator->slab_scrubber); } diff --git a/vdo/block-allocator.h b/vdo/block-allocator.h new file mode 100644 index 00000000..6e924f3c --- /dev/null +++ b/vdo/block-allocator.h @@ -0,0 +1,194 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright Red Hat + */ + +#ifndef BLOCK_ALLOCATOR_H +#define BLOCK_ALLOCATOR_H + +#include + +#include "admin-state.h" +#include "priority-table.h" +#include "slab-scrubber.h" +#include "slab-iterator.h" +#include "statistics.h" +#include "types.h" +#include "vio-pool.h" +#include "wait-queue.h" + +enum { + /* + * The number of vios in the vio pool is proportional to the throughput + * of the VDO. + */ + VIO_POOL_SIZE = 128, +}; + +enum block_allocator_drain_step { + VDO_DRAIN_ALLOCATOR_START, + VDO_DRAIN_ALLOCATOR_STEP_SCRUBBER, + VDO_DRAIN_ALLOCATOR_STEP_SLABS, + VDO_DRAIN_ALLOCATOR_STEP_SUMMARY, + VDO_DRAIN_ALLOCATOR_STEP_FINISHED, +}; + +/* + * A sub-structure for applying actions in parallel to all an allocator's + * slabs. 
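/*
 * An illustrative note on how this actor is used: apply_to_slabs() seeds
 * slab_action_count to 1 before launching, adds 1 for each slab it starts,
 * and slab_action_callback() decrements it, so the registered callback can
 * only fire once the launch loop itself has finished and every slab has
 * reported in.
 */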
+ */ +struct slab_actor { + /* The number of slabs performing a slab action */ + slab_count_t slab_action_count; + /* The method to call when a slab action has been completed by all + * slabs + */ + vdo_action *callback; +}; + +struct block_allocator { + struct vdo_completion completion; + /* The slab depot for this allocator */ + struct slab_depot *depot; + /* The slab summary zone for this allocator */ + struct slab_summary_zone *summary; + /* The notifier for entering read-only mode */ + struct read_only_notifier *read_only_notifier; + /* The nonce of the VDO */ + nonce_t nonce; + /* The physical zone number of this allocator */ + zone_count_t zone_number; + /* The thread ID for this allocator's physical zone */ + thread_id_t thread_id; + /* The number of slabs in this allocator */ + slab_count_t slab_count; + /* The number of the last slab owned by this allocator */ + slab_count_t last_slab; + /* The reduced priority level used to preserve unopened slabs */ + unsigned int unopened_slab_priority; + /* The state of this allocator */ + struct admin_state state; + /* The actor for applying an action to all slabs */ + struct slab_actor slab_actor; + + /* The slab from which blocks are currently being allocated */ + struct vdo_slab *open_slab; + /* A priority queue containing all slabs available for allocation */ + struct priority_table *prioritized_slabs; + /* The slab scrubber */ + struct slab_scrubber *slab_scrubber; + /* What phase of the close operation the allocator is to perform */ + enum block_allocator_drain_step drain_step; + + /* + * These statistics are all mutated only by the physical zone thread, + * but are read by other threads when gathering statistics for the + * entire depot. + */ + /* + * The count of allocated blocks in this zone. Not in + * block_allocator_statistics for historical reasons. + */ + uint64_t allocated_blocks; + /* Statistics for this block allocator */ + struct block_allocator_statistics statistics; + /* Cumulative statistics for the slab journals in this zone */ + struct slab_journal_statistics slab_journal_statistics; + /* Cumulative statistics for the ref_counts in this zone */ + struct ref_counts_statistics ref_counts_statistics; + + /* + * This is the head of a queue of slab journals which have entries in + * their tail blocks which have not yet started to commit. When the + * recovery journal is under space pressure, slab journals which have + * uncommitted entries holding a lock on the recovery journal head are + * forced to commit their blocks early. This list is kept in order, + * with the tail containing the slab journal holding the most recent + * recovery journal lock. 
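/*
 * An illustrative note: because the statistics getters return each structure
 * by value, a thread gathering depot-wide totals can simply call, say,
 * vdo_get_block_allocator_statistics() on every zone's allocator and sum
 * fields such as slabs_opened from the returned copies.
 */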
+ */ + struct list_head dirty_slab_journals; + + /* The vio pool for reading and writing block allocator metadata */ + struct vio_pool *vio_pool; + /* The dm_kcopyd client for erasing slab journals */ + struct dm_kcopyd_client *eraser; + /* Iterator over the slabs to be erased */ + struct slab_iterator slabs_to_erase; +}; + +int __must_check +vdo_make_block_allocator(struct slab_depot *depot, + zone_count_t zone_number, + thread_id_t thread_id, + nonce_t nonce, + block_count_t vio_pool_size, + struct vdo *vdo, + struct read_only_notifier *read_only_notifier, + struct block_allocator **allocator_ptr); + +void vdo_free_block_allocator(struct block_allocator *allocator); + +void vdo_queue_slab(struct vdo_slab *slab); + +void vdo_adjust_free_block_count(struct vdo_slab *slab, bool increment); + +int __must_check vdo_allocate_block(struct block_allocator *allocator, + physical_block_number_t *block_number_ptr); + +void vdo_release_block_reference(struct block_allocator *allocator, + physical_block_number_t pbn, + const char *why); + +void vdo_load_block_allocator(void *context, + zone_count_t zone_number, + struct vdo_completion *parent); + +void vdo_notify_slab_journals_are_recovered(struct block_allocator *allocator, + int result); + +void vdo_prepare_block_allocator_to_allocate(void *context, + zone_count_t zone_number, + struct vdo_completion *parent); + +void vdo_register_slab_with_allocator(struct block_allocator *allocator, + struct vdo_slab *slab); + +void vdo_register_new_slabs_for_allocator(void *context, + zone_count_t zone_number, + struct vdo_completion *parent); + +void vdo_drain_block_allocator(void *context, + zone_count_t zone_number, + struct vdo_completion *parent); + +void vdo_resume_block_allocator(void *context, + zone_count_t zone_number, + struct vdo_completion *parent); + +void vdo_release_tail_block_locks(void *context, + zone_count_t zone_number, + struct vdo_completion *parent); + +int __must_check +vdo_acquire_block_allocator_vio(struct block_allocator *allocator, + struct waiter *waiter); + +void vdo_return_block_allocator_vio(struct block_allocator *allocator, + struct vio_pool_entry *entry); + +void vdo_scrub_all_unrecovered_slabs_in_zone(void *context, + zone_count_t zone_number, + struct vdo_completion *parent); + +struct block_allocator_statistics __must_check +vdo_get_block_allocator_statistics(const struct block_allocator *allocator); + +struct slab_journal_statistics __must_check +vdo_get_slab_journal_statistics(const struct block_allocator *allocator); + +struct ref_counts_statistics __must_check +vdo_get_ref_counts_statistics(const struct block_allocator *allocator); + +void vdo_dump_block_allocator(const struct block_allocator *allocator); + +#endif /* BLOCK_ALLOCATOR_H */ diff --git a/vdo/block-map-entry.h b/vdo/block-map-entry.h new file mode 100644 index 00000000..199d39cf --- /dev/null +++ b/vdo/block-map-entry.h @@ -0,0 +1,83 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright Red Hat + */ + +#ifndef BLOCK_MAP_ENTRY_H +#define BLOCK_MAP_ENTRY_H + +#include "block-mapping-state.h" +#include "constants.h" +#include "numeric.h" +#include "types.h" + +/** + * DOC: Block map entries + * + * The entry for each logical block in the block map is encoded into five + * bytes, which saves space in both the on-disk and in-memory layouts. It + * consists of the 36 low-order bits of a physical_block_number_t + * (addressing 256 terabytes with a 4KB block size) and a 4-bit encoding of a + * block_mapping_state. 
+ * + * Of the 8 high bits of the 5-byte structure: + * + * Bits 7..4: The four highest bits of the 36-bit physical block + * number + * Bits 3..0: The 4-bit block_mapping_state + * + * The following 4 bytes are the low order bytes of the physical block number, + * in little-endian order. + * + * Conversion functions to and from a data location are provided. + */ +struct block_map_entry { +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ + unsigned mapping_state : 4; + unsigned pbn_high_nibble : 4; +#else + unsigned pbn_high_nibble : 4; + unsigned mapping_state : 4; +#endif + + __le32 pbn_low_word; +} __packed; + +static inline struct data_location +vdo_unpack_block_map_entry(const struct block_map_entry *entry) +{ + physical_block_number_t low32 = __le32_to_cpu(entry->pbn_low_word); + physical_block_number_t high4 = entry->pbn_high_nibble; + + return (struct data_location) { + .pbn = ((high4 << 32) | low32), + .state = entry->mapping_state, + }; +} + +static inline bool vdo_is_mapped_location(const struct data_location *location) +{ + return (location->state != VDO_MAPPING_STATE_UNMAPPED); +} + +static inline bool vdo_is_valid_location(const struct data_location *location) +{ + if (location->pbn == VDO_ZERO_BLOCK) { + return !vdo_is_state_compressed(location->state); + } else { + return vdo_is_mapped_location(location); + } +} + +/* FIXME: maybe this should be vdo_pack_block_map_entry() */ +static inline struct block_map_entry +vdo_pack_pbn(physical_block_number_t pbn, enum block_mapping_state mapping_state) +{ + return (struct block_map_entry) { + .mapping_state = (mapping_state & 0x0F), + .pbn_high_nibble = ((pbn >> 32) & 0x0F), + .pbn_low_word = __cpu_to_le32(pbn & UINT_MAX), + }; +} + +#endif /* BLOCK_MAP_ENTRY_H */ diff --git a/vdo/blockMapFormat.c b/vdo/block-map-format.c similarity index 62% rename from vdo/blockMapFormat.c rename to vdo/block-map-format.c index 8d44b750..18d2353e 100644 --- a/vdo/blockMapFormat.c +++ b/vdo/block-map-format.c @@ -1,33 +1,17 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * Copyright Red Hat - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/sulfur/src/c++/vdo/base/blockMapFormat.c#11 $ */ -#include "blockMapFormat.h" +#include "block-map-format.h" #include "buffer.h" #include "permassert.h" #include "constants.h" #include "header.h" -#include "numUtils.h" -#include "statusCodes.h" +#include "num-utils.h" +#include "status-codes.h" #include "types.h" const struct header VDO_BLOCK_MAP_HEADER_2_0 = { @@ -39,27 +23,20 @@ const struct header VDO_BLOCK_MAP_HEADER_2_0 = { .size = sizeof(struct block_map_state_2_0), }; -/** - * Decode block map component state version 2.0 from a buffer. 
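/*
 * A minimal round-trip sketch for the entry helpers above; the PBN is
 * arbitrary and VDO_MAPPING_STATE_UNCOMPRESSED is assumed to be the
 * "mapped, not compressed" value from block-mapping-state.h.
 */
static void example_entry_round_trip(void)
{
	physical_block_number_t pbn = 0x123456789;	/* fits in 36 bits */
	struct block_map_entry entry =
		vdo_pack_pbn(pbn, VDO_MAPPING_STATE_UNCOMPRESSED);
	struct data_location location = vdo_unpack_block_map_entry(&entry);

	/* location.pbn is 0x123456789 again; location.state is unchanged. */
	(void) location;
}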
- * - * @param buffer A buffer positioned at the start of the encoding - * @param state The state structure to receive the decoded values - * - * @return UDS_SUCCESS or an error code - **/ -int decode_vdo_block_map_state_2_0(struct buffer *buffer, +int vdo_decode_block_map_state_2_0(struct buffer *buffer, struct block_map_state_2_0 *state) { size_t initial_length, decoded_size; block_count_t flat_page_count, root_count; physical_block_number_t flat_page_origin, root_origin; struct header header; - int result = decode_vdo_header(buffer, &header); + int result = vdo_decode_header(buffer, &header); + if (result != VDO_SUCCESS) { return result; } - result = validate_vdo_header(&VDO_BLOCK_MAP_HEADER_2_0, &header, true, + result = vdo_validate_header(&VDO_BLOCK_MAP_HEADER_2_0, &header, true, __func__); if (result != VDO_SUCCESS) { return result; @@ -119,18 +96,17 @@ int decode_vdo_block_map_state_2_0(struct buffer *buffer, return VDO_SUCCESS; } -/**********************************************************************/ -size_t get_vdo_block_map_encoded_size(void) +size_t vdo_get_block_map_encoded_size(void) { return VDO_ENCODED_HEADER_SIZE + sizeof(struct block_map_state_2_0); } -/**********************************************************************/ -int encode_vdo_block_map_state_2_0(struct block_map_state_2_0 state, +int vdo_encode_block_map_state_2_0(struct block_map_state_2_0 state, struct buffer *buffer) { size_t initial_length, encoded_size; - int result = encode_vdo_header(&VDO_BLOCK_MAP_HEADER_2_0, buffer); + int result = vdo_encode_header(&VDO_BLOCK_MAP_HEADER_2_0, buffer); + if (result != UDS_SUCCESS) { return result; } @@ -162,27 +138,34 @@ int encode_vdo_block_map_state_2_0(struct block_map_state_2_0 state, "encoded block map component size must match header size"); } -/**********************************************************************/ -page_count_t compute_vdo_block_map_page_count(block_count_t entries) +page_count_t vdo_compute_block_map_page_count(block_count_t entries) { - return compute_bucket_count(entries, VDO_BLOCK_MAP_ENTRIES_PER_PAGE); + return DIV_ROUND_UP(entries, VDO_BLOCK_MAP_ENTRIES_PER_PAGE); } -/**********************************************************************/ +/* + * Compute the number of pages which must be allocated at each level in order + * to grow the forest to a new number of entries. 
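/*
 * Illustrative arithmetic, assuming the usual 812 five-byte entries per
 * 4096-byte block map page: a logical space of 1,000,000 blocks needs
 * DIV_ROUND_UP(1000000, 812) = 1232 leaf pages, and each tree level above
 * the leaves shrinks by a further factor of 812.
 */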
+ * @entries: The new number of entries the block map must address + * + * @return: The total number of non-leaf pages required + */ block_count_t vdo_compute_new_forest_pages(root_count_t root_count, struct boundary *old_sizes, block_count_t entries, struct boundary *new_sizes) { page_count_t leaf_pages - = max(compute_vdo_block_map_page_count(entries), 1U); - page_count_t level_size = compute_bucket_count(leaf_pages, root_count); + = max(vdo_compute_block_map_page_count(entries), 1U); + page_count_t level_size = DIV_ROUND_UP(leaf_pages, root_count); block_count_t total_pages = 0; height_t height; + for (height = 0; height < VDO_BLOCK_MAP_TREE_HEIGHT; height++) { block_count_t new_pages; - level_size = compute_bucket_count(level_size, - VDO_BLOCK_MAP_ENTRIES_PER_PAGE); + + level_size = DIV_ROUND_UP(level_size, + VDO_BLOCK_MAP_ENTRIES_PER_PAGE); new_sizes->levels[height] = level_size; new_pages = level_size; if (old_sizes != NULL) { diff --git a/vdo/block-map-format.h b/vdo/block-map-format.h new file mode 100644 index 00000000..d63a9cf1 --- /dev/null +++ b/vdo/block-map-format.h @@ -0,0 +1,46 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright Red Hat + */ + +#ifndef BLOCK_MAP_FORMAT_H +#define BLOCK_MAP_FORMAT_H + +#include "buffer.h" + +#include "constants.h" +#include "header.h" +#include "types.h" + +struct block_map_state_2_0 { + physical_block_number_t flat_page_origin; + block_count_t flat_page_count; + physical_block_number_t root_origin; + block_count_t root_count; +} __packed; + +struct boundary { + page_number_t levels[VDO_BLOCK_MAP_TREE_HEIGHT]; +}; + +extern const struct header VDO_BLOCK_MAP_HEADER_2_0; + +int __must_check +vdo_decode_block_map_state_2_0(struct buffer *buffer, + struct block_map_state_2_0 *state); + +size_t __must_check vdo_get_block_map_encoded_size(void); + +int __must_check +vdo_encode_block_map_state_2_0(struct block_map_state_2_0 state, + struct buffer *buffer); + +page_count_t vdo_compute_block_map_page_count(block_count_t entries); + +block_count_t __must_check +vdo_compute_new_forest_pages(root_count_t root_count, + struct boundary *old_sizes, + block_count_t entries, + struct boundary *new_sizes); + +#endif /* BLOCK_MAP_FORMAT_H */ diff --git a/vdo/block-map-page.c b/vdo/block-map-page.c new file mode 100644 index 00000000..cb109df8 --- /dev/null +++ b/vdo/block-map-page.c @@ -0,0 +1,58 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright Red Hat + */ + +#include "block-map-page.h" + +#include "permassert.h" + +#include "constants.h" +#include "status-codes.h" +#include "types.h" + +enum { + PAGE_HEADER_4_1_SIZE = 8 + 8 + 8 + 1 + 1 + 1 + 1, +}; + +static const struct version_number BLOCK_MAP_4_1 = { + .major_version = 4, + .minor_version = 1, +}; + +struct block_map_page *vdo_format_block_map_page(void *buffer, + nonce_t nonce, + physical_block_number_t pbn, + bool initialized) +{ + struct block_map_page *page = (struct block_map_page *) buffer; + + memset(buffer, 0, VDO_BLOCK_SIZE); + page->version = vdo_pack_version_number(BLOCK_MAP_4_1); + page->header.nonce = __cpu_to_le64(nonce); + page->header.pbn = __cpu_to_le64(pbn); + page->header.initialized = initialized; + return page; +} + +enum block_map_page_validity +vdo_validate_block_map_page(struct block_map_page *page, + nonce_t nonce, + physical_block_number_t pbn) +{ + STATIC_ASSERT_SIZEOF(struct block_map_page_header, + PAGE_HEADER_4_1_SIZE); + + if (!vdo_are_same_version(BLOCK_MAP_4_1, + vdo_unpack_version_number(page->version)) || + 
!vdo_is_block_map_page_initialized(page) || + (nonce != __le64_to_cpu(page->header.nonce))) { + return VDO_BLOCK_MAP_PAGE_INVALID; + } + + if (pbn != vdo_get_block_map_page_pbn(page)) { + return VDO_BLOCK_MAP_PAGE_BAD; + } + + return VDO_BLOCK_MAP_PAGE_VALID; +} diff --git a/vdo/block-map-page.h b/vdo/block-map-page.h new file mode 100644 index 00000000..ee9c013c --- /dev/null +++ b/vdo/block-map-page.h @@ -0,0 +1,80 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright Red Hat + */ + +#ifndef BLOCK_MAP_PAGE_H +#define BLOCK_MAP_PAGE_H + +#include "numeric.h" + +#include "block-map-entry.h" +#include "header.h" +#include "types.h" + +struct block_map_page_header { + __le64 nonce; + __le64 pbn; + + /** May be non-zero on disk */ + byte unused_long_word[8]; + + /* Whether this page has been written twice to disk */ + bool initialized; + + /* Always zero on disk */ + byte unused_byte1; + + /* May be non-zero on disk */ + byte unused_byte2; + byte unused_byte3; +} __packed; + +struct block_map_page { + struct packed_version_number version; + struct block_map_page_header header; + struct block_map_entry entries[]; +} __packed; + +enum block_map_page_validity { + VDO_BLOCK_MAP_PAGE_VALID, + VDO_BLOCK_MAP_PAGE_INVALID, + /* Valid page found in the wrong location on disk */ + VDO_BLOCK_MAP_PAGE_BAD, +}; + +static inline bool __must_check +vdo_is_block_map_page_initialized(const struct block_map_page *page) +{ + return page->header.initialized; +} + +static inline bool +vdo_mark_block_map_page_initialized(struct block_map_page *page, + bool initialized) +{ + if (initialized == page->header.initialized) { + return false; + } + + page->header.initialized = initialized; + return true; +} + +static inline physical_block_number_t __must_check +vdo_get_block_map_page_pbn(const struct block_map_page *page) +{ + return __le64_to_cpu(page->header.pbn); +} + +struct block_map_page *vdo_format_block_map_page(void *buffer, + nonce_t nonce, + physical_block_number_t pbn, + bool initialized); + +enum block_map_page_validity __must_check +vdo_validate_block_map_page(struct block_map_page *page, + nonce_t nonce, + physical_block_number_t pbn); + +#endif /* BLOCK_MAP_PAGE_H */ diff --git a/vdo/blockMapRecovery.c b/vdo/block-map-recovery.c similarity index 61% rename from vdo/blockMapRecovery.c rename to vdo/block-map-recovery.c index e66118f7..8ccdab25 100644 --- a/vdo/blockMapRecovery.c +++ b/vdo/block-map-recovery.c @@ -1,90 +1,69 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * Copyright Red Hat - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. 
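/*
 * A minimal round-trip sketch for the page helpers above; the buffer is
 * assumed to be a full VDO_BLOCK_SIZE allocation and the block numbers are
 * arbitrary.
 */
static void example_format_and_validate(void *buffer, nonce_t nonce)
{
	enum block_map_page_validity validity;
	struct block_map_page *page =
		vdo_format_block_map_page(buffer, nonce, 1024, true);

	/* Same nonce and same location: valid. */
	validity = vdo_validate_block_map_page(page, nonce, 1024);

	/* Same contents checked against the wrong location: bad. */
	validity = vdo_validate_block_map_page(page, nonce, 4096);
	(void) validity;
}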
- * - * $Id: //eng/vdo-releases/sulfur/src/c++/vdo/base/blockMapRecovery.c#28 $ */ -#include "blockMapRecovery.h" +#include "block-map-recovery.h" #include "logger.h" -#include "memoryAlloc.h" +#include "memory-alloc.h" #include "permassert.h" -#include "blockMapInternals.h" -#include "blockMapPage.h" +#include "block-map.h" +#include "block-map-page.h" #include "heap.h" -#include "numUtils.h" -#include "refCounts.h" -#include "slabDepot.h" +#include "num-utils.h" +#include "ref-counts.h" +#include "slab-depot.h" +#include "thread-config.h" #include "types.h" -#include "vdoInternal.h" -#include "vdoPageCache.h" +#include "vdo.h" +#include "vdo-page-cache.h" -/** - * A completion to manage recovering the block map from the recovery journal. +/* + * A structure to manage recovering the block map from the recovery journal. + * * Note that the page completions kept in this structure are not immediately * freed, so the corresponding pages will be locked down in the page cache * until the recovery frees them. **/ struct block_map_recovery_completion { - /** completion header */ struct vdo_completion completion; - /** the completion for flushing the block map */ + /* for flushing the block map */ struct vdo_completion sub_task_completion; - /** the thread from which the block map may be flushed */ + /* the thread from which the block map may be flushed */ thread_id_t admin_thread; - /** the thread on which all block map operations must be done */ + /* the thread on which all block map operations must be done */ thread_id_t logical_thread_id; - /** the block map */ struct block_map *block_map; - /** whether this recovery has been aborted */ + /* whether this recovery has been aborted */ bool aborted; - /** whether we are currently launching the initial round of requests */ bool launching; - // Fields for the journal entries. - /** the journal entries to apply */ + /* Fields for the journal entries. */ struct numbered_block_mapping *journal_entries; - /** + /* * a heap wrapping journal_entries. It re-orders and sorts journal * entries in ascending LBN order, then original journal order. This * permits efficient iteration over the journal entries in order. - **/ + */ struct heap replay_heap; - // Fields tracking progress through the journal entries. - /** a pointer to the next journal entry to apply */ + /* Fields tracking progress through the journal entries. */ + struct numbered_block_mapping *current_entry; - /** next entry for which the block map page has not been requested */ + /* next entry for which the block map page has not been requested */ struct numbered_block_mapping *current_unfetched_entry; - // Fields tracking requested pages. - /** the absolute PBN of the current page being processed */ + /* Fields tracking requested pages. */ + /* current page's absolute PBN */ physical_block_number_t pbn; - /** number of pending (non-ready) requests */ page_count_t outstanding; - /** number of page completions */ page_count_t page_count; - /** array of requested, potentially ready page completions */ struct vdo_page_completion page_completions[]; }; -/** +/* * This is a heap_comparator function that orders numbered_block_mappings using * the 'block_map_slot' field as the primary key and the mapping 'number' field * as the secondary key. Using the mapping number preserves the journal order @@ -92,11 +71,11 @@ struct block_map_recovery_completion { * ensuring we replay all entries with the same slot in the exact order as they * appeared in the journal. * - *

The comparator order is reversed from the usual sense since the + * The comparator order is reversed from the usual sense since the * heap structure is a max-heap, returning larger elements before * smaller ones, but we want to pop entries off the heap in ascending * LBN order. - **/ + */ static int compare_mappings(const void *item1, const void *item2) { const struct numbered_block_mapping *mapping1 = @@ -121,9 +100,9 @@ static int compare_mappings(const void *item1, const void *item2) return 0; } -/** - * Swap two numbered_block_mapping structures. Implements heap_swapper. - **/ +/* + * Implements heap_swapper. + */ static void swap_mappings(void *item1, void *item2) { struct numbered_block_mapping *mapping1 = item1; @@ -133,60 +112,34 @@ static void swap_mappings(void *item1, void *item2) *mapping2 = temp; } -/** - * Convert a vdo_completion to a block_map_recovery_completion. - * - * @param completion The completion to convert - * - * @return The completion as a block_map_recovery_completion - **/ static inline struct block_map_recovery_completion * __must_check as_block_map_recovery_completion(struct vdo_completion *completion) { - assert_vdo_completion_type(completion->type, + vdo_assert_completion_type(completion->type, VDO_BLOCK_MAP_RECOVERY_COMPLETION); return container_of(completion, struct block_map_recovery_completion, completion); } -/** - * Free the block_map_recovery_completion and notify the parent that the - * block map recovery is done. This callback is registered in - * make_vdo_recovery_completion(). - * - * @param completion The block_map_recovery_completion - **/ static void finish_block_map_recovery(struct vdo_completion *completion) { int result = completion->result; struct vdo_completion *parent = completion->parent; + UDS_FREE(as_block_map_recovery_completion(UDS_FORGET(completion))); - finish_vdo_completion(parent, result); + vdo_finish_completion(parent, result); } -/** - * Make a new block map recovery completion. 
- * - * @param [in] vdo The vdo - * @param [in] entry_count The number of journal entries - * @param [in] journal_entries An array of journal entries to process - * @param [in] parent The parent of the recovery completion - * @param [out] recovery_ptr The new block map recovery completion - * - * @return a success or error code - **/ static int -make_vdo_recovery_completion(struct vdo *vdo, +vdo_make_recovery_completion(struct vdo *vdo, block_count_t entry_count, struct numbered_block_mapping *journal_entries, struct vdo_completion *parent, struct block_map_recovery_completion **recovery_ptr) { - const struct thread_config *thread_config = get_vdo_thread_config(vdo); - struct block_map *block_map = get_block_map(vdo); page_count_t page_count = - min(get_vdo_configured_cache_size(vdo) >> 1, + min(vdo->device_config->cache_size >> 1, (page_count_t) MAXIMUM_SIMULTANEOUS_VDO_BLOCK_MAP_RESTORATION_READS); struct block_map_recovery_completion *recovery; @@ -199,22 +152,24 @@ make_vdo_recovery_completion(struct vdo *vdo, return result; } - initialize_vdo_completion(&recovery->completion, vdo, + vdo_initialize_completion(&recovery->completion, vdo, VDO_BLOCK_MAP_RECOVERY_COMPLETION); - initialize_vdo_completion(&recovery->sub_task_completion, vdo, + vdo_initialize_completion(&recovery->sub_task_completion, vdo, VDO_SUB_TASK_COMPLETION); - recovery->block_map = block_map; + recovery->block_map = vdo->block_map; recovery->journal_entries = journal_entries; recovery->page_count = page_count; recovery->current_entry = &recovery->journal_entries[entry_count - 1]; - recovery->admin_thread = thread_config->admin_thread; + recovery->admin_thread = vdo->thread_config->admin_thread; recovery->logical_thread_id = - vdo_get_logical_zone_thread(thread_config, 0); + vdo_get_logical_zone_thread(vdo->thread_config, 0); - // Organize the journal entries into a binary heap so we can iterate - // over them in sorted order incrementally, avoiding an expensive sort - // call. + /* + * Organize the journal entries into a binary heap so we can iterate + * over them in sorted order incrementally, avoiding an expensive sort + * call. + */ initialize_heap(&recovery->replay_heap, compare_mappings, swap_mappings, @@ -229,13 +184,12 @@ make_vdo_recovery_completion(struct vdo *vdo, __func__, recovery->logical_thread_id, vdo_get_callback_thread_id()); - prepare_vdo_completion(&recovery->completion, + vdo_prepare_completion(&recovery->completion, finish_block_map_recovery, finish_block_map_recovery, recovery->logical_thread_id, parent); - // This message must be recognizable by VDOTest::RebuildBase. uds_log_info("Replaying %zu recovery entries into block map", recovery->replay_heap.count); @@ -243,7 +197,6 @@ make_vdo_recovery_completion(struct vdo *vdo, return VDO_SUCCESS; } -/**********************************************************************/ static void flush_block_map(struct vdo_completion *completion) { struct block_map_recovery_completion *recovery = @@ -253,24 +206,16 @@ static void flush_block_map(struct vdo_completion *completion) recovery->admin_thread), "flush_block_map() called on admin thread"); - prepare_vdo_completion_to_finish_parent(completion, completion->parent); - drain_vdo_block_map(recovery->block_map, + vdo_prepare_completion_to_finish_parent(completion, completion->parent); + vdo_drain_block_map(recovery->block_map, VDO_ADMIN_STATE_RECOVERING, completion); } -/** - * Check whether the recovery is done. 
If so, finish it by either flushing the - * block map (if the recovery was successful), or by cleaning up (if it - * wasn't). - * - * @param recovery The recovery completion - * - * @return true if the recovery or recovery is complete - **/ +/* @return true if recovery is done. */ static bool finish_if_done(struct block_map_recovery_completion *recovery) { - // Pages are still being launched or there is still work to do + /* Pages are still being launched or there is still work to do */ if (recovery->launching || (recovery->outstanding > 0) || (!recovery->aborted && (recovery->current_entry >= recovery->journal_entries))) { @@ -284,16 +229,17 @@ static bool finish_if_done(struct block_map_recovery_completion *recovery) * through the ready ones. */ size_t i; + for (i = 0; i < recovery->page_count; i++) { struct vdo_page_completion *page_completion = &recovery->page_completions[i]; if (recovery->page_completions[i].ready) { - release_vdo_page_completion(&page_completion->completion); + vdo_release_page_completion(&page_completion->completion); } } - complete_vdo_completion(&recovery->completion); + vdo_complete_completion(&recovery->completion); } else { - launch_vdo_completion_callback_with_parent(&recovery->sub_task_completion, + vdo_launch_completion_callback_with_parent(&recovery->sub_task_completion, flush_block_map, recovery->admin_thread, &recovery->completion); @@ -302,47 +248,41 @@ static bool finish_if_done(struct block_map_recovery_completion *recovery) return true; } -/** - * Note that there has been an error during the recovery and finish it if there - * is nothing else outstanding. - * - * @param recovery The block_map_recovery_completion - * @param result The error result to use, if one is not already saved - **/ static void abort_recovery(struct block_map_recovery_completion *recovery, int result) { recovery->aborted = true; - set_vdo_completion_result(&recovery->completion, result); + vdo_set_completion_result(&recovery->completion, result); finish_if_done(recovery); } -/** +/* * Find the first journal entry after a given entry which is not on the same * block map page. * - * @param recovery the block_map_recovery_completion - * @param current_entry the entry to search from - * @param needs_sort Whether sorting is needed to proceed + * @current_entry: the entry to search from + * @needs_sort: Whether sorting is needed to proceed * * @return Pointer to the first later journal entry on a different block map * page, or a pointer to just before the journal entries if no * subsequent entry is on a different block map page. - **/ + */ static struct numbered_block_mapping * find_entry_starting_next_page(struct block_map_recovery_completion *recovery, struct numbered_block_mapping *current_entry, bool needs_sort) { size_t current_page; - // If current_entry is invalid, return immediately. + /* If current_entry is invalid, return immediately. */ if (current_entry < recovery->journal_entries) { return current_entry; } current_page = current_entry->block_map_slot.pbn; - // Decrement current_entry until it's out of bounds or on a different - // page. + /* + * Decrement current_entry until it's out of bounds or on a different + * page. + */ while ((current_entry >= recovery->journal_entries) && (current_entry->block_map_slot.pbn == current_page)) { if (needs_sort) { @@ -356,19 +296,17 @@ find_entry_starting_next_page(struct block_map_recovery_completion *recovery, return current_entry; } -/** - * Apply a range of journal entries to a block map page. 
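Once the entries are ordered, find_entry_starting_next_page() above and recover_ready_pages() further down work through them one block map page at a time. A rough user-space sketch of that batching follows; it is not VDO code, the names and data are invented, and the entries are assumed to already be grouped by page.

#include <stdio.h>
#include <stddef.h>

struct demo_entry {
	unsigned long pbn;	/* block map page holding this mapping */
	unsigned int slot;	/* slot to update within that page */
};

int main(void)
{
	/* Entries as they would sit after sorting: grouped by page. */
	struct demo_entry entries[] = {
		{ .pbn = 10, .slot = 1 }, { .pbn = 10, .slot = 4 },
		{ .pbn = 12, .slot = 0 }, { .pbn = 12, .slot = 2 },
		{ .pbn = 12, .slot = 9 }, { .pbn = 30, .slot = 5 },
	};
	struct demo_entry *first = entries;
	struct demo_entry *current =
		&entries[sizeof(entries) / sizeof(entries[0]) - 1];

	/* Walk from the newest entry backwards, one page's run at a time. */
	while (current >= first) {
		unsigned long page = current->pbn;
		struct demo_entry *next_page = current;

		/* Find the first entry belonging to a different page. */
		while ((next_page >= first) && (next_page->pbn == page))
			next_page--;

		printf("apply %td entries to page %lu\n",
		       current - next_page, page);
		current = next_page;
	}
	return 0;
}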
- * - * @param page The block map page being modified - * @param starting_entry The first journal entry to apply - * @param ending_entry The entry just past the last journal entry to apply - **/ +/* + * Apply a range of journal entries [starting_entry, ending_entry) to a block + * map page. + */ static void apply_journal_entries_to_page(struct block_map_page *page, struct numbered_block_mapping *starting_entry, struct numbered_block_mapping *ending_entry) { struct numbered_block_mapping *current_entry = starting_entry; + while (current_entry != ending_entry) { page->entries[current_entry->block_map_slot.slot] = current_entry->block_map_entry; @@ -376,16 +314,9 @@ apply_journal_entries_to_page(struct block_map_page *page, } } -/**********************************************************************/ static void recover_ready_pages(struct block_map_recovery_completion *recovery, struct vdo_completion *completion); -/** - * Note that a page is now ready and attempt to process pages. This callback is - * registered in fetch_page(). - * - * @param completion The vdo_page_completion for the fetched page - **/ static void page_loaded(struct vdo_completion *completion) { struct block_map_recovery_completion *recovery = @@ -396,11 +327,6 @@ static void page_loaded(struct vdo_completion *completion) } } -/** - * Handle an error loading a page. - * - * @param completion The vdo_page_completion - **/ static void handle_page_load_error(struct vdo_completion *completion) { struct block_map_recovery_completion *recovery = @@ -409,28 +335,23 @@ static void handle_page_load_error(struct vdo_completion *completion) abort_recovery(recovery, completion->result); } -/** - * Fetch a page from the block map. - * - * @param recovery the block_map_recovery_completion - * @param completion the page completion to use - **/ static void fetch_page(struct block_map_recovery_completion *recovery, struct vdo_completion *completion) { physical_block_number_t new_pbn; + if (recovery->current_unfetched_entry < recovery->journal_entries) { - // Nothing left to fetch. + /* Nothing left to fetch. */ return; } - // Fetch the next page we haven't yet requested. + /* Fetch the next page we haven't yet requested. */ new_pbn = recovery->current_unfetched_entry->block_map_slot.pbn; recovery->current_unfetched_entry = find_entry_starting_next_page(recovery, recovery->current_unfetched_entry, true); - init_vdo_page_completion(((struct vdo_page_completion *) completion), + vdo_init_page_completion(((struct vdo_page_completion *) completion), recovery->block_map->zones[0].page_cache, new_pbn, true, @@ -438,18 +359,9 @@ static void fetch_page(struct block_map_recovery_completion *recovery, page_loaded, handle_page_load_error); recovery->outstanding++; - get_vdo_page(completion); + vdo_get_page(completion); } -/** - * Get the next page completion to process. If it isn't ready, we'll try again - * when it is. - * - * @param recovery The recovery completion - * @param completion The current page completion - * - * @return The next page completion to process - **/ static struct vdo_page_completion * get_next_page_completion(struct block_map_recovery_completion *recovery, struct vdo_page_completion *completion) @@ -461,12 +373,6 @@ get_next_page_completion(struct block_map_recovery_completion *recovery, return completion; } -/** - * Recover from as many pages as possible. 
- * - * @param recovery The recovery completion - * @param completion The first page completion to process - **/ static void recover_ready_pages(struct block_map_recovery_completion *recovery, struct vdo_completion *completion) { @@ -484,8 +390,9 @@ static void recover_ready_pages(struct block_map_recovery_completion *recovery, while (page_completion->ready) { struct numbered_block_mapping *start_of_next_page; struct block_map_page *page = - dereference_writable_vdo_page(completion); + vdo_dereference_writable_page(completion); int result = ASSERT(page != NULL, "page available"); + if (result != VDO_SUCCESS) { abort_recovery(recovery, result); return; @@ -499,8 +406,8 @@ static void recover_ready_pages(struct block_map_recovery_completion *recovery, recovery->current_entry, start_of_next_page); recovery->current_entry = start_of_next_page; - request_vdo_page_write(completion); - release_vdo_page_completion(completion); + vdo_request_page_write(completion); + vdo_release_page_completion(completion); if (finish_if_done(recovery)) { return; @@ -514,8 +421,10 @@ static void recover_ready_pages(struct block_map_recovery_completion *recovery, } } -/**********************************************************************/ -void recover_vdo_block_map(struct vdo *vdo, +/* + * Recover the block map (normal rebuild). + */ +void vdo_recover_block_map(struct vdo *vdo, block_count_t entry_count, struct numbered_block_mapping *journal_entries, struct vdo_completion *parent) @@ -524,16 +433,16 @@ void recover_vdo_block_map(struct vdo *vdo, page_count_t i; struct block_map_recovery_completion *recovery; - int result = make_vdo_recovery_completion(vdo, entry_count, + int result = vdo_make_recovery_completion(vdo, entry_count, journal_entries, parent, &recovery); if (result != VDO_SUCCESS) { - finish_vdo_completion(parent, result); + vdo_finish_completion(parent, result); return; } if (is_heap_empty(&recovery->replay_heap)) { - finish_vdo_completion(&recovery->completion, VDO_SUCCESS); + vdo_finish_completion(&recovery->completion, VDO_SUCCESS); return; } @@ -541,8 +450,10 @@ void recover_vdo_block_map(struct vdo *vdo, ASSERT_LOG_ONLY(first_sorted_entry == recovery->current_entry, "heap is returning elements in an unexpected order"); - // Prevent any page from being processed until all pages have been - // launched. + /* + * Prevent any page from being processed until all pages have been + * launched. + */ recovery->launching = true; recovery->pbn = recovery->current_entry->block_map_slot.pbn; recovery->current_unfetched_entry = recovery->current_entry; @@ -556,6 +467,6 @@ void recover_vdo_block_map(struct vdo *vdo, } recovery->launching = false; - // Process any ready pages. + /* Process any ready pages. */ recover_ready_pages(recovery, &recovery->page_completions[0].completion); } diff --git a/vdo/block-map-recovery.h b/vdo/block-map-recovery.h new file mode 100644 index 00000000..7a80ce7b --- /dev/null +++ b/vdo/block-map-recovery.h @@ -0,0 +1,30 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright Red Hat + */ + +#ifndef BLOCK_MAP_RECOVERY_H +#define BLOCK_MAP_RECOVERY_H + +#include "block-map.h" +#include "block-mapping-state.h" +#include "types.h" + +/* + * An explicitly numbered block mapping. Numbering the mappings allows them to + * be sorted by logical block number during recovery while still preserving + * the relative order of journal entries with the same logical block number. 
+ */ +struct numbered_block_mapping { + struct block_map_slot block_map_slot; + struct block_map_entry block_map_entry; + /* A serial number to use during replay */ + uint32_t number; +} __packed; + +void vdo_recover_block_map(struct vdo *vdo, + block_count_t entry_count, + struct numbered_block_mapping *journal_entries, + struct vdo_completion *parent); + +#endif /* BLOCK_MAP_RECOVERY_H */ diff --git a/vdo/blockMapTree.c b/vdo/block-map-tree.c similarity index 58% rename from vdo/blockMapTree.c rename to vdo/block-map-tree.c index 1880eb58..52515ca4 100644 --- a/vdo/blockMapTree.c +++ b/vdo/block-map-tree.c @@ -1,47 +1,35 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * Copyright Red Hat - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/sulfur/src/c++/vdo/base/blockMapTree.c#50 $ */ -#include "blockMapTree.h" +#include "block-map-tree.h" + +#include #include "logger.h" -#include "memoryAlloc.h" +#include "memory-alloc.h" #include "permassert.h" -#include "blockMap.h" -#include "blockMapInternals.h" -#include "blockMapPage.h" -#include "blockMapTreeInternals.h" +#include "block-map.h" +#include "block-map-page.h" #include "constants.h" -#include "dataVIO.h" -#include "dirtyLists.h" +#include "data-vio.h" +#include "dirty-lists.h" #include "forest.h" -#include "numUtils.h" -#include "recoveryJournal.h" -#include "referenceOperation.h" -#include "slabDepot.h" -#include "slabJournal.h" +#include "io-submitter.h" +#include "kernel-types.h" +#include "num-utils.h" +#include "physical-zone.h" +#include "recovery-journal.h" +#include "reference-operation.h" +#include "slab-depot.h" +#include "slab-journal.h" #include "types.h" -#include "vdoInternal.h" -#include "vdoPageCache.h" -#include "vioPool.h" +#include "vdo.h" +#include "vdo-page-cache.h" +#include "vio.h" +#include "vio-pool.h" enum { BLOCK_MAP_VIO_POOL_SIZE = 64, @@ -54,44 +42,34 @@ struct page_descriptor { slot_number_t slot; } __packed; -typedef union { +union page_key { struct page_descriptor descriptor; uint64_t key; -} page_key; +}; struct write_if_not_dirtied_context { struct block_map_tree_zone *zone; uint8_t generation; }; -/** - * An invalid PBN used to indicate that the page holding the location of a - * tree root has been "loaded". - **/ +/* + * Used to indicate that the page holding the location of a tree root has been + * "loaded". + */ const physical_block_number_t VDO_INVALID_PBN = 0xFFFFFFFFFFFFFFFF; -/** - * Convert a list entry to a tree_page. 
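The page_descriptor/page_key union above packs the root index, height, page index, and slot into a single 64-bit value so that an in-flight page can be keyed by one integer. Here is a stand-alone sketch of the same trick; it is not VDO code, and the field widths are invented for the example rather than taken from the real typedefs.

#include <stdint.h>
#include <stdio.h>
#include <string.h>

struct demo_descriptor {
	uint8_t root_index;
	uint8_t height;
	uint32_t page_index;
	uint16_t slot;
} __attribute__((packed));

/* The packed struct must be exactly 8 bytes for the union trick to work. */
_Static_assert(sizeof(struct demo_descriptor) == sizeof(uint64_t),
	       "demo_descriptor must fit in a uint64_t");

union demo_key {
	struct demo_descriptor descriptor;
	uint64_t key;
};

int main(void)
{
	union demo_key a, b;

	memset(&a, 0, sizeof(a));
	memset(&b, 0, sizeof(b));
	a.descriptor = (struct demo_descriptor) {
		.root_index = 3, .height = 1, .page_index = 42, .slot = 7,
	};
	b.descriptor = (struct demo_descriptor) {
		.root_index = 3, .height = 1, .page_index = 42, .slot = 7,
	};

	/* Identical descriptors produce identical 64-bit keys, so the key can
	 * index a hash table of in-flight page loads or allocations. */
	printf("same page: keys %s\n", (a.key == b.key) ? "match" : "differ");

	b.descriptor.slot = 8;
	printf("different slot: keys %s\n",
	       (a.key == b.key) ? "match" : "differ");
	return 0;
}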
- * - * @param entry The list entry to convert - * - * @return The tree_page which owns the list entry - **/ static inline struct tree_page * tree_page_from_list_entry(struct list_head *entry) { return list_entry(entry, struct tree_page, entry); } -/**********************************************************************/ static void write_dirty_pages_callback(struct list_head *expired, void *context); -/** - * Make vios for reading, writing, and allocating the arboreal block map. - * +/* * Implements vio_constructor. - **/ + */ static int __must_check make_block_map_vios(struct vdo *vdo, void *parent, @@ -106,7 +84,6 @@ make_block_map_vios(struct vdo *vdo, vio_ptr); } -/**********************************************************************/ int vdo_initialize_tree_zone(struct block_map_zone *zone, struct vdo *vdo, block_count_t era_length) @@ -114,10 +91,11 @@ int vdo_initialize_tree_zone(struct block_map_zone *zone, int result; struct block_map_tree_zone *tree_zone = &zone->tree_zone; + STATIC_ASSERT_SIZEOF(struct page_descriptor, sizeof(uint64_t)); tree_zone->map_zone = zone; - result = make_vdo_dirty_lists(era_length, write_dirty_pages_callback, + result = vdo_make_dirty_lists(era_length, write_dirty_pages_callback, tree_zone, &tree_zone->dirty_lists); if (result != VDO_SUCCESS) { return result; @@ -137,21 +115,7 @@ int vdo_initialize_tree_zone(struct block_map_zone *zone, &tree_zone->vio_pool); } -/**********************************************************************/ -int vdo_replace_tree_zone_vio_pool(struct block_map_tree_zone *zone, - struct vdo *vdo, - size_t pool_size) -{ - free_vio_pool(UDS_FORGET(zone->vio_pool)); - return make_vio_pool(vdo, - pool_size, - zone->map_zone->thread_id, - make_block_map_vios, - zone, - &zone->vio_pool); -} -/**********************************************************************/ void vdo_uninitialize_block_map_tree_zone(struct block_map_tree_zone *tree_zone) { UDS_FREE(UDS_FORGET(tree_zone->dirty_lists)); @@ -159,52 +123,41 @@ void vdo_uninitialize_block_map_tree_zone(struct block_map_tree_zone *tree_zone) free_int_map(UDS_FORGET(tree_zone->loading_pages)); } -/**********************************************************************/ void vdo_set_tree_zone_initial_period(struct block_map_tree_zone *tree_zone, sequence_number_t period) { - set_vdo_dirty_lists_current_period(tree_zone->dirty_lists, period); + vdo_set_dirty_lists_current_period(tree_zone->dirty_lists, period); } -/** - * Get the block_map_tree_zone in which a data_vio is operating. - * - * @param data_vio The data_vio - * - * @return The block_map_tree_zone - **/ static inline struct block_map_tree_zone * __must_check get_block_map_tree_zone(struct data_vio *data_vio) { - return &(get_vdo_logical_zone_block_map(data_vio->logical.zone)->tree_zone); + return &(data_vio->logical.zone->block_map_zone->tree_zone); } -/** - * Get the tree_page for a given lock. This will be the page referred to by the - * lock's tree slot for the lock's current height. - * - * @param zone The tree zone of the tree - * @param lock The lock describing the page to get - * - * @return The requested page - **/ +/* + * Get the page referred to by the lock's tree slot at its current height. 
+ */ static inline struct tree_page * get_tree_page(const struct block_map_tree_zone *zone, const struct tree_lock *lock) { - return get_vdo_tree_page_by_index(zone->map_zone->block_map->forest, + return vdo_get_tree_page_by_index(zone->map_zone->block_map->forest, lock->root_index, lock->height, lock->tree_slots[lock->height].page_index); } -/**********************************************************************/ +/* + * Validate and copy a buffer to a page. + * @pbn: the expected PBN + */ bool vdo_copy_valid_page(char *buffer, nonce_t nonce, physical_block_number_t pbn, struct block_map_page *page) { struct block_map_page *loaded = (struct block_map_page *) buffer; enum block_map_page_validity validity = - validate_vdo_block_map_page(loaded, nonce, pbn); + vdo_validate_block_map_page(loaded, nonce, pbn); if (validity == VDO_BLOCK_MAP_PAGE_VALID) { memcpy(page, loaded, VDO_BLOCK_SIZE); return true; @@ -214,13 +167,12 @@ bool vdo_copy_valid_page(char *buffer, nonce_t nonce, uds_log_error_strerror(VDO_BAD_PAGE, "Expected page %llu but got page %llu instead", (unsigned long long) pbn, - (unsigned long long) get_vdo_block_map_page_pbn(loaded)); + (unsigned long long) vdo_get_block_map_page_pbn(loaded)); } return false; } -/**********************************************************************/ bool vdo_is_tree_zone_active(struct block_map_tree_zone *zone) { return ((zone->active_lookups != 0) || @@ -228,37 +180,56 @@ bool vdo_is_tree_zone_active(struct block_map_tree_zone *zone) is_vio_pool_busy(zone->vio_pool)); } -/** - * Put the vdo in read-only mode and wake any vios waiting for a flush. - * - * @param zone The zone - * @param result The error which is causing read-only mode - **/ static void enter_zone_read_only_mode(struct block_map_tree_zone *zone, int result) { vdo_enter_read_only_mode(zone->map_zone->read_only_notifier, result); - // We are in read-only mode, so we won't ever write any page out. Just - // take all waiters off the queue so the tree zone can be closed. + /* + * We are in read-only mode, so we won't ever write any page out. Just + * take all waiters off the queue so the tree zone can be closed. + */ while (has_waiters(&zone->flush_waiters)) { dequeue_next_waiter(&zone->flush_waiters); } - vdo_check_for_drain_complete(zone->map_zone); + vdo_block_map_check_for_drain_complete(zone->map_zone); +} + +/* + * Check whether the given value is between the lower and upper bounds, + * within a cyclic range of values from 0 to (modulus - 1). The value + * and both bounds must be smaller than the modulus. + * + * @lower: The lowest value to accept + * @value: The value to check + * @upper: The highest value to accept + * @modulus: The size of the cyclic space, no more than 2^15 + * @return whether the value is in range + */ +static bool in_cyclic_range(uint16_t lower, uint16_t value, + uint16_t upper, uint16_t modulus) +{ + if (value < lower) { + value += modulus; + } + if (upper < lower) { + upper += modulus; + } + return (value <= upper); } -/** +/* * Check whether a generation is strictly older than some other generation in * the context of a zone's current generation range. 
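The cyclic-range helper above is easiest to see with concrete numbers. Below is a stand-alone user-space copy of the same function body with a few invented test values, showing how a range that wraps past zero still accepts the wrapped values; it is only a demonstration, not part of the patch.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

static bool in_cyclic_range(uint16_t lower, uint16_t value,
			    uint16_t upper, uint16_t modulus)
{
	if (value < lower)
		value += modulus;
	if (upper < lower)
		upper += modulus;
	return (value <= upper);
}

int main(void)
{
	/* Generations are 8-bit, so the modulus is 1 << 8, as in is_not_older(). */
	printf("%d\n", in_cyclic_range(250, 255, 5, 1 << 8)); /* 1: inside */
	printf("%d\n", in_cyclic_range(250, 2, 5, 1 << 8));   /* 1: wrapped past 0 */
	printf("%d\n", in_cyclic_range(250, 100, 5, 1 << 8)); /* 0: outside */
	return 0;
}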
* - * @param zone The zone in which to do the comparison - * @param a The generation in question - * @param b The generation to compare to + * @zone: The zone in which to do the comparison + * @a: The generation in question + * @b: The generation to compare to * - * @return true if generation a is not strictly older than - * generation b in the context of the zone - **/ + * @return if generation @a is not strictly older than generation @b in the + * context of @zone + */ static bool __must_check is_not_older(struct block_map_tree_zone *zone, uint8_t a, uint8_t b) { @@ -276,13 +247,6 @@ is_not_older(struct block_map_tree_zone *zone, uint8_t a, uint8_t b) return in_cyclic_range(b, a, zone->generation, 1 << 8); } -/** - * Decrement the count for a generation and roll the oldest generation if there - * are no longer any active pages in it. - * - * @param zone The zone - * @param generation The generation to release - **/ static void release_generation(struct block_map_tree_zone *zone, uint8_t generation) { @@ -301,23 +265,15 @@ static void release_generation(struct block_map_tree_zone *zone, } } -/** - * Set the generation of a page and update the dirty page count in the zone. - * - * @param zone The zone which owns the page - * @param page The page - * @param new_generation The generation to set - * @param decrement_old Whether to decrement the count of the page's old - * generation - **/ static void set_generation(struct block_map_tree_zone *zone, - struct tree_page *page, uint8_t new_generation, - bool decrement_old) + struct tree_page *page, uint8_t new_generation) { uint32_t new_count; int result; + bool decrement_old = is_waiting(&page->waiter); uint8_t old_generation = page->generation; + if (decrement_old && (old_generation == new_generation)) { return; } @@ -337,34 +293,22 @@ static void set_generation(struct block_map_tree_zone *zone, } } -/**********************************************************************/ static void write_page(struct tree_page *tree_page, struct vio_pool_entry *entry); -/** - * Write out a dirty page if it is still covered by the most recent flush - * or if it is the flusher. - * - *
Implements waiter_callback - * - * @param waiter The page to write - * @param context The vio_pool_entry with which to do the write - **/ +/* + * Implements waiter_callback + */ static void write_page_callback(struct waiter *waiter, void *context) { write_page(container_of(waiter, struct tree_page, waiter), (struct vio_pool_entry *) context); } -/** - * Acquire a vio for writing a dirty page. - * - * @param waiter The page which needs a vio - * @param zone The zone - **/ static void acquire_vio(struct waiter *waiter, struct block_map_tree_zone *zone) { int result; + waiter->callback = write_page_callback; result = acquire_vio_from_pool(zone->vio_pool, waiter); if (result != VDO_SUCCESS) { @@ -372,17 +316,13 @@ static void acquire_vio(struct waiter *waiter, struct block_map_tree_zone *zone) } } -/** - * Attempt to increment the generation. - * - * @param zone The zone whose generation is to be incremented - * - * @return true if all possible generations were not already - * active - **/ +/* + * @return true if all possible generations were not already active + */ static bool attempt_increment(struct block_map_tree_zone *zone) { uint8_t generation = zone->generation + 1; + if (zone->oldest_generation == generation) { return false; } @@ -391,13 +331,9 @@ static bool attempt_increment(struct block_map_tree_zone *zone) return true; } -/** - * Enqueue a page to either launch a flush or wait for the current flush which - * is already in progress. - * - * @param page The page to enqueue - * @param zone The zone - **/ +/* + * Launches a flush if one is not already in progress. + */ static void enqueue_page(struct tree_page *page, struct block_map_tree_zone *zone) { @@ -415,19 +351,11 @@ static void enqueue_page(struct tree_page *page, } } -/** - * Write pages which were waiting for a flush and have not been redirtied. - * Requeue those pages which were redirtied. - * - *
Implements waiter_callback. - * - * @param waiter The dirty page - * @param context The zone and generation - **/ static void write_page_if_not_dirtied(struct waiter *waiter, void *context) { struct tree_page *page = container_of(waiter, struct tree_page, waiter); struct write_if_not_dirtied_context *write_context = context; + if (page->generation == write_context->generation) { acquire_vio(waiter, write_context->zone); return; @@ -436,25 +364,16 @@ static void write_page_if_not_dirtied(struct waiter *waiter, void *context) enqueue_page(page, write_context->zone); } -/** - * Return a vio to the zone's pool. - * - * @param zone The zone which owns the pool - * @param entry The pool entry to return - **/ static void return_to_pool(struct block_map_tree_zone *zone, struct vio_pool_entry *entry) { return_vio_to_pool(zone->vio_pool, entry); - vdo_check_for_drain_complete(zone->map_zone); + vdo_block_map_check_for_drain_complete(zone->map_zone); } -/** - * Handle the successful write of a tree page. This callback is registered in - * write_initialized_page(). - * - * @param completion The vio doing the write - **/ +/* + * This callback is registered in write_initialized_page(). + */ static void finish_page_write(struct vdo_completion *completion) { bool dirty; @@ -462,9 +381,10 @@ static void finish_page_write(struct vdo_completion *completion) struct vio_pool_entry *entry = completion->parent; struct tree_page *page = entry->parent; struct block_map_tree_zone *zone = entry->context; - release_vdo_recovery_journal_block_reference(zone->map_zone->block_map->journal, + + vdo_release_recovery_journal_block_reference(zone->map_zone->block_map->journal, page->writing_recovery_lock, - ZONE_TYPE_LOGICAL, + VDO_ZONE_TYPE_LOGICAL, zone->map_zone->zone_number); dirty = (page->writing_generation != page->generation); @@ -502,67 +422,74 @@ static void finish_page_write(struct vdo_completion *completion) return_to_pool(zone, entry); } -/** - * Handle an error writing a tree page. This error handler is registered in - * write_page() and write_initialized_page(). - * - * @param completion The vio doing the write - **/ static void handle_write_error(struct vdo_completion *completion) { int result = completion->result; struct vio_pool_entry *entry = completion->parent; struct block_map_tree_zone *zone = entry->context; + + record_metadata_io_error(as_vio(completion)); enter_zone_read_only_mode(zone, result); return_to_pool(zone, entry); } -/** - * Write a page which has been written at least once. This callback is - * registered in (or called directly from) write_page(). - * - * @param completion The vio which will do the write - **/ +static void write_page_endio(struct bio *bio); + static void write_initialized_page(struct vdo_completion *completion) { struct vio_pool_entry *entry = completion->parent; struct block_map_tree_zone *zone = (struct block_map_tree_zone *) entry->context; struct tree_page *tree_page = (struct tree_page *) entry->parent; + struct block_map_page *page = (struct block_map_page *) entry->buffer; + unsigned int operation = REQ_OP_WRITE | REQ_PRIO; /* - * Set the initialized field of the copy of the page we are writing to - * true. We don't want to set it true on the real page in memory until - * after this write succeeds. + * Now that we know the page has been written at least once, mark + * the copy we are writing as initialized. 
*/ + vdo_mark_block_map_page_initialized(page, true); + + + if (zone->flusher == tree_page) { + operation |= REQ_PREFLUSH; + } + + submit_metadata_vio(entry->vio, + vdo_get_block_map_page_pbn(page), + write_page_endio, + handle_write_error, + operation); +} + +static void write_page_endio(struct bio *bio) +{ + struct vio *vio = bio->bi_private; + struct vio_pool_entry *entry = vio->completion.parent; + struct block_map_tree_zone *zone = entry->context; struct block_map_page *page = (struct block_map_page *) entry->buffer; - mark_vdo_block_map_page_initialized(page, true); - launch_write_metadata_vio_with_flush(entry->vio, - get_vdo_block_map_page_pbn(page), - finish_page_write, - handle_write_error, - (zone->flusher == tree_page), - false); + + continue_vio_after_io(vio, + (vdo_is_block_map_page_initialized(page) + ? finish_page_write + : write_initialized_page), + zone->map_zone->thread_id); } -/** - * Write a dirty tree page now that we have a vio with which to write it. - * - * @param tree_page The page to write - * @param entry The vio_pool_entry with which to write - **/ static void write_page(struct tree_page *tree_page, struct vio_pool_entry *entry) { struct block_map_tree_zone *zone = (struct block_map_tree_zone *) entry->context; struct vdo_completion *completion = vio_as_completion(entry->vio); - struct block_map_page *page = as_vdo_block_map_page(tree_page); + struct block_map_page *page = vdo_as_block_map_page(tree_page); if ((zone->flusher != tree_page) && (is_not_older(zone, tree_page->generation, zone->generation))) { - // This page was re-dirtied after the last flush was issued, - // hence we need to do another flush. + /* + * This page was re-dirtied after the last flush was issued, + * hence we need to do another flush. + */ enqueue_page(tree_page, zone); return_to_pool(zone, entry); return; @@ -577,35 +504,44 @@ static void write_page(struct tree_page *tree_page, tree_page->writing_generation = tree_page->generation; tree_page->writing_recovery_lock = tree_page->recovery_lock; - // Clear this now so that we know this page is not on any dirty list. + /* Clear this now so that we know this page is not on any dirty list. */ tree_page->recovery_lock = 0; - if (!mark_vdo_block_map_page_initialized(page, true)) { + /* + * We've already copied the page into the vio which will write it, so + * if it was not yet initialized, the first write will indicate that + * (for torn write protection). It is now safe to mark it as + * initialized in memory since if the write fails, the in memory state + * will become irrelevant. + */ + if (!vdo_mark_block_map_page_initialized(page, true)) { write_initialized_page(completion); return; } - launch_write_metadata_vio(entry->vio, get_vdo_block_map_page_pbn(page), - write_initialized_page, handle_write_error); + submit_metadata_vio(entry->vio, + vdo_get_block_map_page_pbn(page), + write_page_endio, + handle_write_error, + REQ_OP_WRITE | REQ_PRIO); } -/** +/* * Schedule a batch of dirty pages for writing. * - *
Implements vdo_dirty_callback. - * - * @param expired The pages to write - * @param context The zone + * Implements vdo_dirty_callback. **/ static void write_dirty_pages_callback(struct list_head *expired, void *context) { struct block_map_tree_zone *zone = (struct block_map_tree_zone *) context; uint8_t generation = zone->generation; + while (!list_empty(expired)) { int result; struct list_head *entry = expired->next; struct tree_page *page = tree_page_from_list_entry(entry); + list_del_init(entry); result = ASSERT(!is_waiting(&page->waiter), @@ -615,42 +551,41 @@ static void write_dirty_pages_callback(struct list_head *expired, void *context) continue; } - set_generation(zone, page, generation, false); + set_generation(zone, page, generation); if (!page->writing) { enqueue_page(page, zone); } } } -/**********************************************************************/ void vdo_advance_zone_tree_period(struct block_map_tree_zone *zone, sequence_number_t period) { - advance_vdo_dirty_lists_period(zone->dirty_lists, period); + vdo_advance_dirty_lists_period(zone->dirty_lists, period); } -/**********************************************************************/ +/* + * This method must not be called when lookups are active. + **/ void vdo_drain_zone_trees(struct block_map_tree_zone *zone) { ASSERT_LOG_ONLY((zone->active_lookups == 0), "vdo_drain_zone_trees() called with no active lookups"); - if (!is_vdo_state_suspending(&zone->map_zone->state)) { - flush_vdo_dirty_lists(zone->dirty_lists); + if (!vdo_is_state_suspending(&zone->map_zone->state)) { + vdo_flush_dirty_lists(zone->dirty_lists); } } -/** +/* * Release a lock on a page which was being loaded or allocated. - * - * @param data_vio The data_vio releasing the page lock - * @param what What the data_vio was doing (for logging) - **/ + */ static void release_page_lock(struct data_vio *data_vio, char *what) { struct block_map_tree_zone *zone; struct tree_lock *lock_holder; struct tree_lock *lock = &data_vio->tree_lock; + ASSERT_LOG_ONLY(lock->locked, "release of unlocked block map page %s for key %llu in tree %u", what, (unsigned long long) lock->key, @@ -665,38 +600,27 @@ static void release_page_lock(struct data_vio *data_vio, char *what) lock->locked = false; } -/** - * Continue a data_vio now that the lookup is complete. - * - * @param data_vio The data_vio - * @param result The result of the lookup - **/ static void finish_lookup(struct data_vio *data_vio, int result) { struct block_map_tree_zone *zone; struct vdo_completion *completion = data_vio_as_completion(data_vio); + data_vio->tree_lock.height = 0; zone = get_block_map_tree_zone(data_vio); --zone->active_lookups; - set_vdo_completion_result(completion, result); - launch_vdo_completion_callback(completion, + vdo_set_completion_result(completion, result); + vdo_launch_completion_callback(completion, data_vio->tree_lock.callback, data_vio->tree_lock.thread_id); } -/** - * Abort a block map PBN lookup due to an error in the load or allocation on - * which we were waiting. 
- * - * @param waiter The data_vio which was waiting for a page load or allocation - * @param context The error which caused the abort - **/ static void abort_lookup_for_waiter(struct waiter *waiter, void *context) { struct data_vio *data_vio = waiter_as_data_vio(waiter); int result = *((int *) context); + if (is_read_data_vio(data_vio)) { if (result == VDO_NO_SPACE) { result = VDO_SUCCESS; @@ -708,13 +632,6 @@ static void abort_lookup_for_waiter(struct waiter *waiter, void *context) finish_lookup(data_vio, result); } -/** - * Abort a block map PBN lookup due to an error loading or allocating a page. - * - * @param data_vio The data_vio which was loading or allocating a page - * @param result The error code - * @param what What the data_vio was doing (for logging) - **/ static void abort_lookup(struct data_vio *data_vio, int result, char *what) { if (result != VDO_NO_SPACE) { @@ -731,26 +648,11 @@ static void abort_lookup(struct data_vio *data_vio, int result, char *what) finish_lookup(data_vio, result); } -/** - * Abort a block map PBN lookup due to an error loading a page. - * - * @param data_vio The data_vio doing the page load - * @param result The error code - **/ static void abort_load(struct data_vio *data_vio, int result) { abort_lookup(data_vio, result, "load"); } -/** - * Determine if a location represents a valid mapping for a tree page. - * - * @param vdo The vdo - * @param mapping The data_location to check - * @param height The height of the entry in the tree - * - * @return true if the entry represents a invalid page mapping - **/ static bool __must_check is_invalid_tree_entry(const struct vdo *vdo, const struct data_location *mapping, @@ -763,7 +665,7 @@ is_invalid_tree_entry(const struct vdo *vdo, return true; } - // Roots aren't physical data blocks, so we can't check their PBNs. + /* Roots aren't physical data blocks, so we can't check their PBNs. */ if (height == VDO_BLOCK_MAP_TREE_HEIGHT) { return false; } @@ -771,29 +673,21 @@ is_invalid_tree_entry(const struct vdo *vdo, return !vdo_is_physical_data_block(vdo->depot, mapping->pbn); } -/**********************************************************************/ static void load_block_map_page(struct block_map_tree_zone *zone, struct data_vio *data_vio); static void allocate_block_map_page(struct block_map_tree_zone *zone, struct data_vio *data_vio); -/** - * Continue a block map PBN lookup now that a page has been loaded by - * descending one level in the tree. 
- * - * @param data_vio The data_vio doing the lookup - * @param page The page which was just loaded - **/ static void continue_with_loaded_page(struct data_vio *data_vio, struct block_map_page *page) { struct tree_lock *lock = &data_vio->tree_lock; struct block_map_tree_slot slot = lock->tree_slots[lock->height]; struct data_location mapping = - unpack_vdo_block_map_entry(&page->entries[slot.block_map_slot.slot]); - if (is_invalid_tree_entry(get_vdo_from_data_vio(data_vio), &mapping, - lock->height)) { + vdo_unpack_block_map_entry(&page->entries[slot.block_map_slot.slot]); + if (is_invalid_tree_entry(vdo_from_data_vio(data_vio), &mapping, + lock->height)) { uds_log_error_strerror(VDO_BAD_MAPPING, "Invalid block map tree PBN: %llu with state %u for page index %u at height %u", (unsigned long long) mapping.pbn, @@ -805,7 +699,7 @@ static void continue_with_loaded_page(struct data_vio *data_vio, } if (!vdo_is_mapped_location(&mapping)) { - // The page we need is unallocated + /* The page we need is unallocated */ allocate_block_map_page(get_block_map_tree_zone(data_vio), data_vio); return; @@ -817,30 +711,18 @@ static void continue_with_loaded_page(struct data_vio *data_vio, return; } - // We know what page we need to load next + /* We know what page we need to load next */ load_block_map_page(get_block_map_tree_zone(data_vio), data_vio); } -/** - * Continue a block map PBN lookup now that the page load we were waiting on - * has finished. - * - * @param waiter The data_vio waiting for a page to be loaded - * @param context The page which was just loaded - **/ static void continue_load_for_waiter(struct waiter *waiter, void *context) { struct data_vio *data_vio = waiter_as_data_vio(waiter); + data_vio->tree_lock.height--; continue_with_loaded_page(data_vio, (struct block_map_page *) context); } -/** - * Finish loading a page now that it has been read in from disk. This callback - * is registered in load_page(). - * - * @param completion The vio doing the page read - **/ static void finish_block_map_page_load(struct vdo_completion *completion) { physical_block_number_t pbn; @@ -853,6 +735,7 @@ static void finish_block_map_page_load(struct vdo_completion *completion) struct block_map_tree_zone *zone = (struct block_map_tree_zone *) entry->context; struct tree_lock *tree_lock = &data_vio->tree_lock; + tree_lock->height--; pbn = tree_lock->tree_slots[tree_lock->height].block_map_slot.pbn; tree_page = get_tree_page(zone, tree_lock); @@ -860,21 +743,16 @@ static void finish_block_map_page_load(struct vdo_completion *completion) nonce = zone->map_zone->block_map->nonce; if (!vdo_copy_valid_page(entry->buffer, nonce, pbn, page)) { - format_vdo_block_map_page(page, nonce, pbn, false); + vdo_format_block_map_page(page, nonce, pbn, false); } return_vio_to_pool(zone->vio_pool, entry); - // Release our claim to the load and wake any waiters + /* Release our claim to the load and wake any waiters */ release_page_lock(data_vio, "load"); notify_all_waiters(&tree_lock->waiters, continue_load_for_waiter, page); continue_with_loaded_page(data_vio, page); } -/** - * Handle an error loading a tree page. 
- * - * @param completion The vio doing the page read - **/ static void handle_io_error(struct vdo_completion *completion) { int result = completion->result; @@ -882,43 +760,43 @@ static void handle_io_error(struct vdo_completion *completion) struct data_vio *data_vio = entry->parent; struct block_map_tree_zone *zone = (struct block_map_tree_zone *) entry->context; + + record_metadata_io_error(as_vio(completion)); return_vio_to_pool(zone->vio_pool, entry); abort_load(data_vio, result); } -/** - * Read a tree page from disk now that we've gotten a vio with which to do the - * read. This waiter_callback is registered in load_block_map_page(). - * - * @param waiter The data_vio which requires a page load - * @param context The vio pool entry with which to do the read - **/ +static void load_page_endio(struct bio *bio) +{ + struct vio *vio = bio->bi_private; + struct vio_pool_entry *entry = vio->completion.parent; + struct data_vio *data_vio = entry->parent; + + continue_vio_after_io(vio, + finish_block_map_page_load, + data_vio->logical.zone->thread_id); +} + static void load_page(struct waiter *waiter, void *context) { struct vio_pool_entry *entry = context; struct data_vio *data_vio = waiter_as_data_vio(waiter); struct tree_lock *lock = &data_vio->tree_lock; + physical_block_number_t pbn = + lock->tree_slots[lock->height - 1].block_map_slot.pbn; entry->parent = data_vio; - entry->vio->completion.callback_thread_id = - get_vdo_logical_zone_block_map(data_vio->logical.zone)->thread_id; - - launch_read_metadata_vio(entry->vio, - lock->tree_slots[lock->height - 1].block_map_slot.pbn, - finish_block_map_page_load, - handle_io_error); + submit_metadata_vio(entry->vio, + pbn, + load_page_endio, + handle_io_error, + REQ_OP_READ | REQ_PRIO); } -/** - * Attempt to acquire a lock on a page in the block map tree. If the page is - * already locked, queue up to wait for the lock to be released. If the lock is - * acquired, the data_vio's tree_lock.locked field will be set to true. - * - * @param zone The block_map_tree_zone in which the data_vio operates - * @param data_vio The data_vio which desires a page lock - * - * @return VDO_SUCCESS or an error - **/ +/* + * If the page is already locked, queue up to wait for the lock to be released. + * If the lock is acquired, @data_vio->tree_lock.locked will be true. + */ static int attempt_page_lock(struct block_map_tree_zone *zone, struct data_vio *data_vio) { @@ -928,7 +806,8 @@ static int attempt_page_lock(struct block_map_tree_zone *zone, struct tree_lock *lock = &data_vio->tree_lock; height_t height = lock->height; struct block_map_tree_slot tree_slot = lock->tree_slots[height]; - page_key key; + union page_key key; + key.descriptor = (struct page_descriptor) { .root_index = lock->root_index, .height = height, @@ -944,25 +823,24 @@ static int attempt_page_lock(struct block_map_tree_zone *zone, } if (lock_holder == NULL) { - // We got the lock + /* We got the lock */ data_vio->tree_lock.locked = true; return VDO_SUCCESS; } - // Someone else is loading or allocating the page we need + /* Someone else is loading or allocating the page we need */ return enqueue_data_vio(&lock_holder->waiters, data_vio); } -/** - * Load a block map tree page from disk. - * - * @param zone The block_map_tree_zone in which the data_vio operates - * @param data_vio The data_vio which requires a page to be loaded - **/ +/* + * Load a block map tree page from disk, for the next level in the data vio + * tree lock. 
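attempt_page_lock() above builds a packed key and then either becomes the lock holder for that page or queues behind whoever already holds it, so only one load or allocation is ever issued per page. A rough user-space sketch of that "insert if absent, otherwise wait" pattern follows; the linear array and waiter counter stand in for the real int_map and wait queue, and every name here is invented.

#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

struct demo_lock {
	uint64_t key;
	bool locked;
	int waiters;
};

#define MAX_LOCKS 8
static struct demo_lock *holders[MAX_LOCKS];
static size_t holder_count;

/* Return the existing holder for @key, or install @lock as the holder. */
static struct demo_lock *put_if_absent(struct demo_lock *lock, uint64_t key)
{
	size_t i;

	for (i = 0; i < holder_count; i++) {
		if (holders[i]->key == key)
			return holders[i];
	}
	holders[holder_count++] = lock;
	return NULL;
}

static void attempt_lock(struct demo_lock *lock, uint64_t key)
{
	struct demo_lock *holder;

	lock->key = key;
	holder = put_if_absent(lock, key);
	if (holder == NULL) {
		/* We registered first, so we own the load for this page. */
		lock->locked = true;
		printf("key %llu: acquired, issue the page load\n",
		       (unsigned long long) key);
		return;
	}

	/* Someone else is already loading this page; wait on them. */
	holder->waiters++;
	printf("key %llu: queued behind the holder (%d waiting)\n",
	       (unsigned long long) key, holder->waiters);
}

int main(void)
{
	struct demo_lock a = { 0 }, b = { 0 }, c = { 0 };

	attempt_lock(&a, 42);	/* acquires */
	attempt_lock(&b, 42);	/* waits on a */
	attempt_lock(&c, 17);	/* acquires */
	return 0;
}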
+ */ static void load_block_map_page(struct block_map_tree_zone *zone, struct data_vio *data_vio) { int result = attempt_page_lock(zone, data_vio); + if (result != VDO_SUCCESS) { abort_load(data_vio, result); return; @@ -970,6 +848,7 @@ static void load_block_map_page(struct block_map_tree_zone *zone, if (data_vio->tree_lock.locked) { struct waiter *waiter = data_vio_as_waiter(data_vio); + waiter->callback = load_page; result = acquire_vio_from_pool(zone->vio_pool, waiter); if (result != VDO_SUCCESS) { @@ -978,51 +857,33 @@ static void load_block_map_page(struct block_map_tree_zone *zone, } } -/** - * Set the callback of a data_vio after it has allocated a block map page. - * - * @param data_vio The data_vio - **/ static void set_post_allocation_callback(struct data_vio *data_vio) { - set_vdo_completion_callback(data_vio_as_completion(data_vio), + vdo_set_completion_callback(data_vio_as_completion(data_vio), data_vio->tree_lock.callback, data_vio->tree_lock.thread_id); } -/** - * Abort a block map PBN lookup due to an error allocating a page. - * - * @param data_vio The data_vio doing the page allocation - * @param result The error code - **/ static void abort_allocation(struct data_vio *data_vio, int result) { set_post_allocation_callback(data_vio); abort_lookup(data_vio, result, "allocation"); } -/** - * Callback to handle an error while attempting to allocate a page. This - * callback is used to transfer back to the logical zone along the block map - * page allocation path. - * - * @param completion The data_vio doing the allocation - **/ static void allocation_failure(struct vdo_completion *completion) { struct data_vio *data_vio = as_data_vio(completion); - assert_data_vio_in_logical_zone(data_vio); + + if (vdo_get_callback_thread_id() != + data_vio->logical.zone->thread_id) { + launch_data_vio_logical_callback(data_vio, allocation_failure); + return; + } + + completion->error_handler = NULL; abort_allocation(data_vio, completion->result); } -/** - * Continue with page allocations now that a parent page has been allocated. - * - * @param waiter The data_vio which was waiting for a page to be allocated - * @param context The physical block number of the page which was just - * allocated - **/ static void continue_allocation_for_waiter(struct waiter *waiter, void *context) { struct data_vio *data_vio = waiter_as_data_vio(waiter); @@ -1041,13 +902,10 @@ static void continue_allocation_for_waiter(struct waiter *waiter, void *context) allocate_block_map_page(get_block_map_tree_zone(data_vio), data_vio); } -/** - * Finish the page allocation process by recording the allocation in the tree - * and waking any waiters now that the write lock has been released. This - * callback is registered in release_block_map_write_lock(). - * - * @param completion The data_vio doing the allocation - **/ +/* + * Record the allocation in the tree and wake any waiters now that the write + * lock has been released. 
+ */ static void finish_block_map_allocation(struct vdo_completion *completion) { physical_block_number_t pbn; @@ -1059,36 +917,37 @@ static void finish_block_map_allocation(struct vdo_completion *completion) struct block_map_tree_zone *zone = get_block_map_tree_zone(data_vio); struct tree_lock *tree_lock = &data_vio->tree_lock; height_t height = tree_lock->height; + assert_data_vio_in_logical_zone(data_vio); - if (completion->result != VDO_SUCCESS) { - allocation_failure(completion); - return; - } + + completion->error_handler = NULL; tree_page = get_tree_page(zone, tree_lock); pbn = tree_lock->tree_slots[height - 1].block_map_slot.pbn; - // Record the allocation. + /* Record the allocation. */ page = (struct block_map_page *) tree_page->page_buffer; old_lock = tree_page->recovery_lock; - update_vdo_block_map_page(page, data_vio, pbn, + vdo_update_block_map_page(page, data_vio, pbn, VDO_MAPPING_STATE_UNCOMPRESSED, &tree_page->recovery_lock); if (is_waiting(&tree_page->waiter)) { - // This page is waiting to be written out. + /* This page is waiting to be written out. */ if (zone->flusher != tree_page) { - // The outstanding flush won't cover the update we just - // made, so mark the page as needing another flush. - set_generation(zone, tree_page, zone->generation, true); + /* + * The outstanding flush won't cover the update we just + * made, so mark the page as needing another flush. + */ + set_generation(zone, tree_page, zone->generation); } } else { - // Put the page on a dirty list + /* Put the page on a dirty list */ if (old_lock == 0) { INIT_LIST_HEAD(&tree_page->entry); } - add_to_vdo_dirty_lists(zone->dirty_lists, + vdo_add_to_dirty_lists(zone->dirty_lists, &tree_page->entry, old_lock, tree_page->recovery_lock); @@ -1096,15 +955,15 @@ static void finish_block_map_allocation(struct vdo_completion *completion) tree_lock->height--; if (height > 1) { - // Format the interior node we just allocated (in memory). + /* Format the interior node we just allocated (in memory). */ tree_page = get_tree_page(zone, tree_lock); - format_vdo_block_map_page(tree_page->page_buffer, + vdo_format_block_map_page(tree_page->page_buffer, zone->map_zone->block_map->nonce, pbn, false); } - // Release our claim to the allocation and wake any waiters + /* Release our claim to the allocation and wake any waiters */ release_page_lock(data_vio, "allocation"); notify_all_waiters(&tree_lock->waiters, continue_allocation_for_waiter, &pbn); @@ -1116,37 +975,22 @@ static void finish_block_map_allocation(struct vdo_completion *completion) allocate_block_map_page(zone, data_vio); } -/** - * Release the write lock on a newly allocated block map page now that we - * have made its journal entries and reference count updates. This callback - * is registered in set_block_map_page_reference_count(). 
- * - * @param completion The data_vio doing the allocation - **/ static void release_block_map_write_lock(struct vdo_completion *completion) { struct data_vio *data_vio = as_data_vio(completion); - struct allocating_vio *allocating_vio = - data_vio_as_allocating_vio(data_vio); + assert_data_vio_in_allocated_zone(data_vio); - if (completion->result != VDO_SUCCESS) { - launch_data_vio_logical_callback(data_vio, allocation_failure); - return; - } - vio_release_allocation_lock(allocating_vio); - vio_reset_allocation(allocating_vio); - launch_data_vio_logical_callback(data_vio, finish_block_map_allocation); + release_data_vio_allocation_lock(data_vio, true); + launch_data_vio_logical_callback(data_vio, + finish_block_map_allocation); } -/** - * Set the reference count of a newly allocated block map page to - * MAXIMUM_REFERENCES now that we have made a recovery journal entry for it. - * MAXIMUM_REFERENCES is used to prevent deduplication against the block after +/* + * Newly allocated block map pages are set to have MAXIMUM_REFERENCES after + * they are journaled, to prevent deduplication against the block after * we release the write lock on it, but before we write out the page. - * - * @param completion The data_vio doing the allocation - **/ + */ static void set_block_map_page_reference_count(struct vdo_completion *completion) { @@ -1154,82 +998,61 @@ set_block_map_page_reference_count(struct vdo_completion *completion) struct data_vio *data_vio = as_data_vio(completion); struct tree_lock *lock = &data_vio->tree_lock; + assert_data_vio_in_allocated_zone(data_vio); - if (completion->result != VDO_SUCCESS) { - launch_data_vio_logical_callback(data_vio, allocation_failure); - return; - } pbn = lock->tree_slots[lock->height - 1].block_map_slot.pbn; completion->callback = release_block_map_write_lock; - add_vdo_slab_journal_entry(get_vdo_slab_journal(get_vdo_from_data_vio(data_vio)->depot, + vdo_add_slab_journal_entry(vdo_get_slab_journal(completion->vdo->depot, pbn), data_vio); } -/** - * Make a recovery journal entry for a newly allocated block map page. - * This callback is registered in continue_block_map_page_allocation(). - * - * @param completion The data_vio doing the allocation - **/ static void journal_block_map_allocation(struct vdo_completion *completion) { struct data_vio *data_vio = as_data_vio(completion); + assert_data_vio_in_journal_zone(data_vio); - if (completion->result != VDO_SUCCESS) { - launch_data_vio_logical_callback(data_vio, allocation_failure); - return; - } - set_data_vio_allocated_zone_callback(data_vio, set_block_map_page_reference_count); - add_vdo_recovery_journal_entry(get_vdo_from_data_vio(data_vio)->recovery_journal, + set_data_vio_allocated_zone_callback(data_vio, + set_block_map_page_reference_count); + vdo_add_recovery_journal_entry(vdo_from_data_vio(data_vio)->recovery_journal, data_vio); } -/** - * Continue the process of allocating a block map page now that the - * block_allocator has given us a block. This method is supplied as the - * callback to vio_allocate_data_block() by allocate_block_map_page(). 
- * - * @param allocating_vio The data_vio which is doing the allocation - **/ -static void -continue_block_map_page_allocation(struct allocating_vio *allocating_vio) +static void allocate_block(struct vdo_completion *completion) { - struct data_vio *data_vio = allocating_vio_as_data_vio(allocating_vio); + struct data_vio *data_vio = as_data_vio(completion); struct tree_lock *lock = &data_vio->tree_lock; - physical_block_number_t pbn = allocating_vio->allocation; + physical_block_number_t pbn; + + assert_data_vio_in_allocated_zone(data_vio); - if (!data_vio_has_allocation(data_vio)) { - set_data_vio_logical_callback(data_vio, allocation_failure); - continue_data_vio(data_vio, VDO_NO_SPACE); + if (!vdo_allocate_block_in_zone(data_vio)) { return; } + pbn = data_vio->allocation.pbn; lock->tree_slots[lock->height - 1].block_map_slot.pbn = pbn; - set_up_vdo_reference_operation_with_lock(BLOCK_MAP_INCREMENT, + vdo_set_up_reference_operation_with_lock(VDO_JOURNAL_BLOCK_MAP_INCREMENT, pbn, VDO_MAPPING_STATE_UNCOMPRESSED, - allocating_vio->allocation_lock, + data_vio->allocation.lock, &data_vio->operation); - launch_data_vio_journal_callback(data_vio, journal_block_map_allocation); + launch_data_vio_journal_callback(data_vio, + journal_block_map_allocation); } -/** - * Allocate a block map page. - * - * @param zone The zone in which the data_vio is operating - * @param data_vio The data_vio which needs to allocate a page - **/ static void allocate_block_map_page(struct block_map_tree_zone *zone, struct data_vio *data_vio) { int result; if (!is_write_data_vio(data_vio) || is_trim_data_vio(data_vio)) { - // This is a pure read, the read phase of a read-modify-write, - // or a trim, so there's nothing left to do here. + /* + * This is a pure read, the read phase of a read-modify-write, + * or a trim, so there's nothing left to do here. + */ finish_lookup(data_vio, VDO_SUCCESS); return; } @@ -1244,13 +1067,17 @@ static void allocate_block_map_page(struct block_map_tree_zone *zone, return; } - vio_allocate_data_block(data_vio_as_allocating_vio(data_vio), - get_vdo_logical_zone_allocation_selector(data_vio->logical.zone), - VIO_BLOCK_MAP_WRITE_LOCK, - continue_block_map_page_allocation); + data_vio_allocate_data_block(data_vio, + VIO_BLOCK_MAP_WRITE_LOCK, + allocate_block, + allocation_failure); } -/**********************************************************************/ +/* + * Look up the PBN of the block map page containing the mapping for a + * data_vio's LBN. All ancestors in the tree will be allocated or loaded, as + * needed. 
+ */ void vdo_lookup_block_map_pbn(struct data_vio *data_vio) { page_number_t page_index; @@ -1260,8 +1087,9 @@ void vdo_lookup_block_map_pbn(struct data_vio *data_vio) struct block_map_page *page = NULL; struct tree_lock *lock = &data_vio->tree_lock; struct block_map_tree_zone *zone = get_block_map_tree_zone(data_vio); + zone->active_lookups++; - if (is_vdo_state_draining(&zone->map_zone->state)) { + if (vdo_is_state_draining(&zone->map_zone->state)) { finish_lookup(data_vio, VDO_SHUTTING_DOWN); return; } @@ -1279,25 +1107,26 @@ void vdo_lookup_block_map_pbn(struct data_vio *data_vio) for (lock->height = 1; lock->height <= VDO_BLOCK_MAP_TREE_HEIGHT; lock->height++) { physical_block_number_t pbn; + lock->tree_slots[lock->height] = tree_slot; page = (struct block_map_page *) (get_tree_page(zone, lock)->page_buffer); - pbn = get_vdo_block_map_page_pbn(page); + pbn = vdo_get_block_map_page_pbn(page); if (pbn != VDO_ZERO_BLOCK) { lock->tree_slots[lock->height].block_map_slot.pbn = pbn; break; } - // Calculate the index and slot for the next level. + /* Calculate the index and slot for the next level. */ tree_slot.block_map_slot.slot = tree_slot.page_index % VDO_BLOCK_MAP_ENTRIES_PER_PAGE; tree_slot.page_index = tree_slot.page_index / VDO_BLOCK_MAP_ENTRIES_PER_PAGE; } - // The page at this height has been allocated and loaded. + /* The page at this height has been allocated and loaded. */ mapping = - unpack_vdo_block_map_entry(&page->entries[tree_slot.block_map_slot.slot]); - if (is_invalid_tree_entry(get_vdo_from_data_vio(data_vio), &mapping, + vdo_unpack_block_map_entry(&page->entries[tree_slot.block_map_slot.slot]); + if (is_invalid_tree_entry(vdo_from_data_vio(data_vio), &mapping, lock->height)) { uds_log_error_strerror(VDO_BAD_MAPPING, "Invalid block map tree PBN: %llu with state %u for page index %u at height %u", @@ -1310,24 +1139,30 @@ void vdo_lookup_block_map_pbn(struct data_vio *data_vio) } if (!vdo_is_mapped_location(&mapping)) { - // The page we want one level down has not been allocated, so - // allocate it. + /* + * The page we want one level down has not been allocated, so + * allocate it. + */ allocate_block_map_page(zone, data_vio); return; } lock->tree_slots[lock->height - 1].block_map_slot.pbn = mapping.pbn; if (lock->height == 1) { - // This is the ultimate block map page, so we're done + /* This is the ultimate block map page, so we're done */ finish_lookup(data_vio, VDO_SUCCESS); return; } - // We know what page we need to load. + /* We know what page we need to load. */ load_block_map_page(zone, data_vio); } -/**********************************************************************/ +/* + * Find the PBN of a leaf block map page. This method may only be used after + * all allocated tree pages have been loaded, otherwise, it may give the wrong + * answer (0). 
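
The lookup loop above peels one base-VDO_BLOCK_MAP_ENTRIES_PER_PAGE digit off the leaf page index per tree level: the remainder is the slot to follow at that height and the quotient becomes the page index for the next level up. A standalone sketch of just that arithmetic; the tree height of 5 and the 812 entries per page are assumptions for the example, not values quoted from the headers.

::

  #include <stdint.h>
  #include <stdio.h>

  #define TREE_HEIGHT 5          /* assumed stand-in for VDO_BLOCK_MAP_TREE_HEIGHT */
  #define ENTRIES_PER_PAGE 812   /* assumed stand-in for VDO_BLOCK_MAP_ENTRIES_PER_PAGE */

  int main(void)
  {
          uint64_t page_index = 123456789;   /* leaf block map page under one root */
          uint64_t remaining = page_index;
          unsigned int height;

          /* Each level consumes one base-ENTRIES_PER_PAGE digit of the
           * page index: the slot to follow at that height. */
          for (height = 1; height <= TREE_HEIGHT; height++) {
                  unsigned int slot = remaining % ENTRIES_PER_PAGE;

                  remaining /= ENTRIES_PER_PAGE;
                  printf("height %u: slot %u\n", height, slot);
          }
          return 0;
  }
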
+ */ physical_block_number_t vdo_find_block_map_page_pbn(struct block_map *map, page_number_t page_number) { @@ -1338,16 +1173,17 @@ physical_block_number_t vdo_find_block_map_page_pbn(struct block_map *map, root_count_t root_index = page_number % map->root_count; page_number_t page_index = page_number / map->root_count; slot_number_t slot = page_index % VDO_BLOCK_MAP_ENTRIES_PER_PAGE; + page_index /= VDO_BLOCK_MAP_ENTRIES_PER_PAGE; tree_page = - get_vdo_tree_page_by_index(map->forest, root_index, 1, page_index); + vdo_get_tree_page_by_index(map->forest, root_index, 1, page_index); page = (struct block_map_page *) tree_page->page_buffer; - if (!is_vdo_block_map_page_initialized(page)) { + if (!vdo_is_block_map_page_initialized(page)) { return VDO_ZERO_BLOCK; } - mapping = unpack_vdo_block_map_entry(&page->entries[slot]); + mapping = vdo_unpack_block_map_entry(&page->entries[slot]); if (!vdo_is_valid_location(&mapping) || vdo_is_state_compressed(mapping.state)) { return VDO_ZERO_BLOCK; @@ -1355,16 +1191,21 @@ physical_block_number_t vdo_find_block_map_page_pbn(struct block_map *map, return mapping.pbn; } -/**********************************************************************/ +/* + * Write a tree page or indicate that it has been re-dirtied if it is already + * being written. This method is used when correcting errors in the tree during + * read-only rebuild. + */ void vdo_write_tree_page(struct tree_page *page, struct block_map_tree_zone *zone) { bool waiting = is_waiting(&page->waiter); + if (waiting && (zone->flusher == page)) { return; } - set_generation(zone, page, zone->generation, waiting); + set_generation(zone, page, zone->generation); if (waiting || page->writing) { return; } diff --git a/vdo/block-map-tree.h b/vdo/block-map-tree.h new file mode 100644 index 00000000..f9875ffa --- /dev/null +++ b/vdo/block-map-tree.h @@ -0,0 +1,92 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright Red Hat + */ + +#ifndef BLOCK_MAP_TREE_H +#define BLOCK_MAP_TREE_H + +#include + +#include "block-map-format.h" +#include "block-map-page.h" +#include "constants.h" +#include "kernel-types.h" +#include "types.h" +#include "wait-queue.h" + +struct tree_page { + struct waiter waiter; + + /* Dirty list entry */ + struct list_head entry; + + /* + * If dirty, the tree zone flush generation in which it was last + * dirtied. + */ + uint8_t generation; + + /* Whether this page is an interior tree page being written out. */ + bool writing; + + /* + * If writing, the tree zone flush generation of the copy being + * written. + */ + uint8_t writing_generation; + + /* + * Sequence number of the earliest recovery journal block containing + * uncommitted updates to this page + */ + sequence_number_t recovery_lock; + + /* + * The value of recovery_lock when the this page last started writing + */ + sequence_number_t writing_recovery_lock; + + char page_buffer[VDO_BLOCK_SIZE]; +}; + +/* + * Used to indicate that the page holding the location of a tree root has been + * "loaded". 
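
The paired generation fields in struct tree_page above are what let the tree zone notice that a page was dirtied again while a copy of it was being written, which is why vdo_write_tree_page() can simply be called again to mark a re-dirty. A rough toy of that idea; the names and the simplified start/finish functions are invented and omit the real waiter and flush machinery.

::

  #include <stdbool.h>
  #include <stdint.h>
  #include <stdio.h>

  struct toy_tree_page {
          bool writing;
          uint8_t generation;           /* flush generation in which last dirtied */
          uint8_t writing_generation;   /* generation of the copy being written */
  };

  static void start_write(struct toy_tree_page *page)
  {
          page->writing = true;
          page->writing_generation = page->generation;
  }

  /* Returns true if the page was re-dirtied while its previous copy was in
   * flight and therefore must be written again. */
  static bool finish_write(struct toy_tree_page *page)
  {
          page->writing = false;
          return page->generation != page->writing_generation;
  }

  int main(void)
  {
          struct toy_tree_page page = { .generation = 3 };

          start_write(&page);
          page.generation = 4;   /* dirtied again in a newer flush generation */
          printf("needs another write: %s\n", finish_write(&page) ? "yes" : "no");
          return 0;
  }
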
+ */ +extern const physical_block_number_t VDO_INVALID_PBN; + +static inline struct block_map_page * __must_check +vdo_as_block_map_page(struct tree_page *tree_page) +{ + return (struct block_map_page *) tree_page->page_buffer; +} + +bool vdo_copy_valid_page(char *buffer, nonce_t nonce, + physical_block_number_t pbn, + struct block_map_page *page); + +int __must_check vdo_initialize_tree_zone(struct block_map_zone *zone, + struct vdo *vdo, + block_count_t maximum_age); + +void vdo_uninitialize_block_map_tree_zone(struct block_map_tree_zone *tree_zone); + +void vdo_set_tree_zone_initial_period(struct block_map_tree_zone *tree_zone, + sequence_number_t period); + +bool __must_check vdo_is_tree_zone_active(struct block_map_tree_zone *zone); + +void vdo_advance_zone_tree_period(struct block_map_tree_zone *zone, + sequence_number_t period); + +void vdo_drain_zone_trees(struct block_map_tree_zone *zone); + +void vdo_lookup_block_map_pbn(struct data_vio *data_vio); + +physical_block_number_t vdo_find_block_map_page_pbn(struct block_map *map, + page_number_t page_number); + +void vdo_write_tree_page(struct tree_page *page, struct block_map_tree_zone *zone); + +#endif /* BLOCK_MAP_TREE_H */ diff --git a/vdo/blockMap.c b/vdo/block-map.c similarity index 55% rename from vdo/blockMap.c rename to vdo/block-map.c index 60372c9a..6af2faa9 100644 --- a/vdo/blockMap.c +++ b/vdo/block-map.c @@ -1,64 +1,58 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * Copyright Red Hat - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/sulfur/src/c++/vdo/base/blockMap.c#42 $ */ -#include "blockMap.h" +#include "block-map.h" #include "logger.h" -#include "memoryAlloc.h" +#include "memory-alloc.h" #include "permassert.h" -#include "actionManager.h" -#include "adminState.h" -#include "blockMapFormat.h" -#include "blockMapInternals.h" -#include "blockMapPage.h" -#include "blockMapTree.h" +#include "action-manager.h" +#include "admin-state.h" +#include "block-map-format.h" +#include "block-map-page.h" +#include "block-map-tree.h" #include "constants.h" -#include "dataVIO.h" +#include "data-vio.h" #include "forest.h" -#include "numUtils.h" -#include "recoveryJournal.h" -#include "statusCodes.h" +#include "num-utils.h" +#include "recovery-journal.h" +#include "status-codes.h" #include "types.h" -#include "vdoInternal.h" -#include "vdoPageCache.h" +#include "vdo.h" +#include "vdo-page-cache.h" /** - * State associated which each block map page while it is in the VDO page - * cache. - **/ + * DOC: Block map eras + * + * The block map era, or maximum age, is used as follows: + * + * Each block map page, when dirty, records the earliest recovery journal block + * sequence number of the changes reflected in that dirty block. 
Sequence + * numbers are classified into eras: every @maximum_age sequence numbers, we + * switch to a new era. Block map pages are assigned to eras according to the + * sequence number they record. + * + * In the current (newest) era, block map pages are not written unless there is + * cache pressure. In the next oldest era, each time a new journal block is + * written 1/@maximum_age of the pages in this era are issued for write. In all + * older eras, pages are issued for write immediately. + */ + struct block_map_page_context { - /** + /* * The earliest recovery journal block containing uncommitted updates * to the block map page associated with this context. A reference * (lock) is held on that block to prevent it from being reaped. When * this value changes, the reference on the old value must be released * and a reference on the new value must be acquired. - **/ + */ sequence_number_t recovery_lock; }; -/** - * Implements vdo_page_read_function. - **/ +/* Implements vdo_page_read_function */ static int validate_page_on_read(void *buffer, physical_block_number_t pbn, struct block_map_zone *zone, @@ -69,27 +63,27 @@ static int validate_page_on_read(void *buffer, nonce_t nonce = zone->block_map->nonce; enum block_map_page_validity validity = - validate_vdo_block_map_page(page, nonce, pbn); + vdo_validate_block_map_page(page, nonce, pbn); if (validity == VDO_BLOCK_MAP_PAGE_BAD) { return uds_log_error_strerror(VDO_BAD_PAGE, "Expected page %llu but got page %llu instead", (unsigned long long) pbn, - (unsigned long long) get_vdo_block_map_page_pbn(page)); + (unsigned long long) vdo_get_block_map_page_pbn(page)); } if (validity == VDO_BLOCK_MAP_PAGE_INVALID) { - format_vdo_block_map_page(page, nonce, pbn, false); + vdo_format_block_map_page(page, nonce, pbn, false); } context->recovery_lock = 0; return VDO_SUCCESS; } -/** +/* * Handle journal updates and torn write protection. * * Implements vdo_page_write_function. - **/ + */ static bool handle_page_write(void *raw_page, struct block_map_zone *zone, void *page_context) @@ -97,34 +91,25 @@ static bool handle_page_write(void *raw_page, struct block_map_page *page = raw_page; struct block_map_page_context *context = page_context; - if (mark_vdo_block_map_page_initialized(page, true)) { - // Cause the page to be re-written. + if (vdo_mark_block_map_page_initialized(page, true)) { + /* Make the page be re-written for torn write protection. */ return true; } - // Release the page's references on the recovery journal. - release_vdo_recovery_journal_block_reference(zone->block_map->journal, + vdo_release_recovery_journal_block_reference(zone->block_map->journal, context->recovery_lock, - ZONE_TYPE_LOGICAL, + VDO_ZONE_TYPE_LOGICAL, zone->zone_number); context->recovery_lock = 0; return false; } -/** +/* * Initialize the per-zone portions of the block map. 
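
Concretely, the era of a sequence number is just integer division by the era length, and the write-out policy depends on how a dirty page's era compares with the era of the newest journal block. A small standalone illustration; the helper and the sample values are examples, not VDO code.

::

  #include <stdint.h>
  #include <stdio.h>

  static uint64_t era_of(uint64_t sequence_number, uint64_t maximum_age)
  {
          return sequence_number / maximum_age;
  }

  int main(void)
  {
          uint64_t maximum_age = 16380;       /* the block map era length */
          uint64_t newest_journal_block = 50000;
          uint64_t page_dirtied_at = 16000;   /* sequence number recorded by a dirty page */

          uint64_t current_era = era_of(newest_journal_block, maximum_age);
          uint64_t page_era = era_of(page_dirtied_at, maximum_age);

          if (page_era == current_era)
                  printf("newest era: write only under cache pressure\n");
          else if (page_era + 1 == current_era)
                  printf("next oldest era: write ~1/%llu of its pages per new journal block\n",
                         (unsigned long long)maximum_age);
          else
                  printf("older era: issue the write immediately\n");
          return 0;
  }
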
* - * @param map The block map - * @param zone_number The number of the zone to initialize - * @param thread_config The thread config of the VDO - * @param vdo The VDO - * @param read_only_notifier The read-only context for the VDO - * @param cache_size The size of the page cache for the block map - * @param maximum_age The number of journal blocks before a dirtied - * page is considered old and must be written out - * - * @return VDO_SUCCESS or an error - **/ + * @maximum_age: The number of journal blocks before a dirtied page is + * considered old and must be written out + */ static int __must_check initialize_block_map_zone(struct block_map *map, zone_count_t zone_number, @@ -137,6 +122,7 @@ initialize_block_map_zone(struct block_map *map, int result; struct block_map_zone *zone = &map->zones[zone_number]; + zone->zone_number = zone_number; zone->thread_id = vdo_get_logical_zone_thread(thread_config, zone_number); @@ -147,10 +133,10 @@ initialize_block_map_zone(struct block_map *map, return result; } - set_vdo_admin_state_code(&zone->state, + vdo_set_admin_state_code(&zone->state, VDO_ADMIN_STATE_NORMAL_OPERATION); - return make_vdo_page_cache(vdo, + return vdo_make_page_cache(vdo, cache_size / map->zone_count, validate_page_on_read, handle_page_write, @@ -160,89 +146,69 @@ initialize_block_map_zone(struct block_map *map, &zone->page_cache); } -/**********************************************************************/ -struct block_map_zone *vdo_get_block_map_zone(struct block_map *map, - zone_count_t zone_number) -{ - return &map->zones[zone_number]; -} - -/** - * Get the ID of the thread on which a given block map zone operates. - * - *

Implements vdo_zone_thread_getter. - **/ +/* Implements vdo_zone_thread_getter */ static thread_id_t get_block_map_zone_thread_id(void *context, zone_count_t zone_number) { - return vdo_get_block_map_zone(context, zone_number)->thread_id; + struct block_map *map = context; + + return map->zones[zone_number].thread_id; } -/** - * Prepare for an era advance. - * - *

Implements vdo_action_preamble. - **/ +/* Implements vdo_action_preamble */ static void prepare_for_era_advance(void *context, struct vdo_completion *parent) { struct block_map *map = context; + map->current_era_point = map->pending_era_point; - complete_vdo_completion(parent); + vdo_complete_completion(parent); } -/** - * Update the progress of the era in a zone. - * - *

Implements vdo_zone_action. - **/ +/* Implements vdo_zone_action */ static void advance_block_map_zone_era(void *context, zone_count_t zone_number, struct vdo_completion *parent) { - struct block_map_zone *zone = - vdo_get_block_map_zone(context, zone_number); - advance_vdo_page_cache_period(zone->page_cache, - zone->block_map->current_era_point); + struct block_map *map = context; + struct block_map_zone *zone = &map->zones[zone_number]; + + vdo_advance_page_cache_period(zone->page_cache, + map->current_era_point); vdo_advance_zone_tree_period(&zone->tree_zone, - zone->block_map->current_era_point); - finish_vdo_completion(parent, VDO_SUCCESS); + map->current_era_point); + vdo_finish_completion(parent, VDO_SUCCESS); } -/** +/* * Schedule an era advance if necessary. This method should not be called - * directly. Rather, call schedule_vdo_default_action() on the block map's action - * manager. + * directly. Rather, call vdo_schedule_default_action() on the block map's + * action manager. * - *

Implements vdo_action_scheduler. - **/ + * Implements vdo_action_scheduler. + */ static bool schedule_era_advance(void *context) { struct block_map *map = context; + if (map->current_era_point == map->pending_era_point) { return false; } - return schedule_vdo_action(map->action_manager, + return vdo_schedule_action(map->action_manager, prepare_for_era_advance, advance_block_map_zone_era, NULL, NULL); } -/** - * Clean up a block_map_zone. - * - * @param zone The zone to uninitialize - **/ static void uninitialize_block_map_zone(struct block_map_zone *zone) { vdo_uninitialize_block_map_tree_zone(&zone->tree_zone); - free_vdo_page_cache(UDS_FORGET(zone->page_cache)); + vdo_free_page_cache(UDS_FORGET(zone->page_cache)); } -/**********************************************************************/ -void free_vdo_block_map(struct block_map *map) +void vdo_free_block_map(struct block_map *map) { zone_count_t zone; @@ -255,14 +221,16 @@ void free_vdo_block_map(struct block_map *map) } vdo_abandon_block_map_growth(map); - free_vdo_forest(UDS_FORGET(map->forest)); + vdo_free_forest(UDS_FORGET(map->forest)); UDS_FREE(UDS_FORGET(map->action_manager)); UDS_FREE(map); } -/**********************************************************************/ -int decode_vdo_block_map(struct block_map_state_2_0 state, +/* + * @journal may be NULL. + */ +int vdo_decode_block_map(struct block_map_state_2_0 state, block_count_t logical_blocks, const struct thread_config *thread_config, struct vdo *vdo, @@ -301,13 +269,13 @@ int decode_vdo_block_map(struct block_map_state_2_0 state, map->journal = journal; map->nonce = nonce; - result = make_vdo_forest(map, map->entry_count); + result = vdo_make_forest(map, map->entry_count); if (result != VDO_SUCCESS) { - free_vdo_block_map(map); + vdo_free_block_map(map); return result; } - replace_vdo_forest(map); + vdo_replace_forest(map); map->zone_count = thread_config->logical_zone_count; for (zone = 0; zone < map->zone_count; zone++) { @@ -319,21 +287,21 @@ int decode_vdo_block_map(struct block_map_state_2_0 state, cache_size, maximum_age); if (result != VDO_SUCCESS) { - free_vdo_block_map(map); + vdo_free_block_map(map); return result; } } - result = make_vdo_action_manager(map->zone_count, + result = vdo_make_action_manager(map->zone_count, get_block_map_zone_thread_id, - get_vdo_recovery_journal_thread_id(journal), + vdo_get_recovery_journal_thread_id(journal), map, schedule_era_advance, vdo, &map->action_manager); if (result != VDO_SUCCESS) { - free_vdo_block_map(map); + vdo_free_block_map(map); return result; } @@ -341,13 +309,14 @@ int decode_vdo_block_map(struct block_map_state_2_0 state, return VDO_SUCCESS; } -/**********************************************************************/ -struct block_map_state_2_0 record_vdo_block_map(const struct block_map *map) +struct block_map_state_2_0 vdo_record_block_map(const struct block_map *map) { struct block_map_state_2_0 state = { .flat_page_origin = VDO_BLOCK_MAP_FLAT_PAGE_ORIGIN, - // This is the flat page count, which has turned out to always - // be 0. + /* + * This is the flat page count, which has turned out to always + * be 0. 
+ */ .flat_page_count = 0, .root_origin = map->root_origin, .root_count = map->root_count, @@ -356,64 +325,74 @@ struct block_map_state_2_0 record_vdo_block_map(const struct block_map *map) return state; } -/**********************************************************************/ -void initialize_vdo_block_map_from_journal(struct block_map *map, +/* + * The block map needs to know the journals' sequence number to initialize + * the eras. + */ +void vdo_initialize_block_map_from_journal(struct block_map *map, struct recovery_journal *journal) { zone_count_t zone = 0; map->current_era_point = - get_vdo_recovery_journal_current_sequence_number(journal); + vdo_get_recovery_journal_current_sequence_number(journal); map->pending_era_point = map->current_era_point; for (zone = 0; zone < map->zone_count; zone++) { vdo_set_tree_zone_initial_period(&map->zones[zone].tree_zone, map->current_era_point); - set_vdo_page_cache_initial_period(map->zones[zone].page_cache, + vdo_set_page_cache_initial_period(map->zones[zone].page_cache, map->current_era_point); } } -/**********************************************************************/ +/* + * Compute the logical zone for the LBN of a data vio. + */ zone_count_t vdo_compute_logical_zone(struct data_vio *data_vio) { - struct block_map *map = get_block_map(get_vdo_from_data_vio(data_vio)); + struct block_map *map = vdo_from_data_vio(data_vio)->block_map; struct tree_lock *tree_lock = &data_vio->tree_lock; + page_number_t page_number - = vdo_compute_page_number(data_vio->logical.lbn); + = data_vio->logical.lbn / VDO_BLOCK_MAP_ENTRIES_PER_PAGE; tree_lock->tree_slots[0].page_index = page_number; tree_lock->root_index = page_number % map->root_count; return (tree_lock->root_index % map->zone_count); } -/**********************************************************************/ +/* + * Compute the block map slot in which the block map entry for a data_vio + * resides and cache that in the data_vio. + * @thread_id: The thread on which to run the callback + */ void vdo_find_block_map_slot(struct data_vio *data_vio, vdo_action *callback, thread_id_t thread_id) { - struct block_map *map = get_block_map(get_vdo_from_data_vio(data_vio)); + struct block_map *map = vdo_from_data_vio(data_vio)->block_map; struct tree_lock *tree_lock = &data_vio->tree_lock; struct block_map_tree_slot *slot = &tree_lock->tree_slots[0]; + data_vio->last_async_operation = VIO_ASYNC_OP_FIND_BLOCK_MAP_SLOT; + if (data_vio->logical.lbn >= map->entry_count) { finish_data_vio(data_vio, VDO_OUT_OF_RANGE); return; } - slot->block_map_slot.slot = vdo_compute_slot(data_vio->logical.lbn); + slot->block_map_slot.slot + = data_vio->logical.lbn % VDO_BLOCK_MAP_ENTRIES_PER_PAGE; tree_lock->callback = callback; tree_lock->thread_id = thread_id; vdo_lookup_block_map_pbn(data_vio); } -/**********************************************************************/ -block_count_t vdo_get_number_of_block_map_entries(const struct block_map *map) -{ - return map->entry_count; -} - -/**********************************************************************/ -void advance_vdo_block_map_era(struct block_map *map, +/* + * Update the block map era information for a newly finished journal block. + * This method must be called from the journal zone thread. 
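
The two functions above reduce a logical block number to a leaf page, a slot within that page, a root tree, and a logical zone using nothing but division and modulus. A worked standalone example; the 812 entries per page and the other values are assumptions for illustration, not configuration defaults.

::

  #include <stdint.h>
  #include <stdio.h>

  #define ENTRIES_PER_PAGE 812   /* assumed stand-in for VDO_BLOCK_MAP_ENTRIES_PER_PAGE */

  int main(void)
  {
          uint64_t lbn = 10000000;        /* logical block number of a data_vio */
          unsigned int root_count = 16;   /* example number of tree roots */
          unsigned int zone_count = 4;    /* example number of logical zones */

          uint64_t page_number = lbn / ENTRIES_PER_PAGE;   /* leaf block map page */
          unsigned int slot = lbn % ENTRIES_PER_PAGE;      /* entry within that page */
          unsigned int root_index = page_number % root_count;
          unsigned int logical_zone = root_index % zone_count;

          printf("lbn %llu -> page %llu, slot %u, root %u, logical zone %u\n",
                 (unsigned long long)lbn, (unsigned long long)page_number,
                 slot, root_index, logical_zone);
          return 0;
  }
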
+ */ +void vdo_advance_block_map_era(struct block_map *map, sequence_number_t recovery_block_number) { if (map == NULL) { @@ -421,77 +400,72 @@ void advance_vdo_block_map_era(struct block_map *map, } map->pending_era_point = recovery_block_number; - schedule_vdo_default_action(map->action_manager); + vdo_schedule_default_action(map->action_manager); } -/**********************************************************************/ -void vdo_check_for_drain_complete(struct block_map_zone *zone) +void vdo_block_map_check_for_drain_complete(struct block_map_zone *zone) { - if (is_vdo_state_draining(&zone->state) && + if (vdo_is_state_draining(&zone->state) && !vdo_is_tree_zone_active(&zone->tree_zone) && - !is_vdo_page_cache_active(zone->page_cache)) { - finish_vdo_draining_with_result(&zone->state, + !vdo_is_page_cache_active(zone->page_cache)) { + vdo_finish_draining_with_result(&zone->state, (vdo_is_read_only(zone->read_only_notifier) ? VDO_READ_ONLY : VDO_SUCCESS)); } } -/** - * Initiate a drain of the trees and page cache of a block map zone. - * +/* * Implements vdo_admin_initiator - **/ + */ static void initiate_drain(struct admin_state *state) { struct block_map_zone *zone = container_of(state, struct block_map_zone, state); vdo_drain_zone_trees(&zone->tree_zone); - drain_vdo_page_cache(zone->page_cache); - vdo_check_for_drain_complete(zone); + vdo_drain_page_cache(zone->page_cache); + vdo_block_map_check_for_drain_complete(zone); } -/** - * Drain a zone of the block map. - * - *

Implements vdo_zone_action. - **/ +/* + * Implements vdo_zone_action. + */ static void drain_zone(void *context, zone_count_t zone_number, struct vdo_completion *parent) { - struct block_map_zone *zone = vdo_get_block_map_zone(context, zone_number); - start_vdo_draining(&zone->state, - get_current_vdo_manager_operation(zone->block_map->action_manager), + struct block_map *map = context; + struct block_map_zone *zone = &map->zones[zone_number]; + + vdo_start_draining(&zone->state, + vdo_get_current_manager_operation(map->action_manager), parent, initiate_drain); } -/**********************************************************************/ -void drain_vdo_block_map(struct block_map *map, +void vdo_drain_block_map(struct block_map *map, const struct admin_state_code *operation, struct vdo_completion *parent) { - schedule_vdo_operation(map->action_manager, operation, NULL, + vdo_schedule_operation(map->action_manager, operation, NULL, drain_zone, NULL, parent); } -/** - * Resume a zone of the block map. - * - *

Implements vdo_zone_action. - **/ +/* + * Implements vdo_zone_action. + */ static void resume_block_map_zone(void *context, zone_count_t zone_number, struct vdo_completion *parent) { - struct block_map_zone *zone = vdo_get_block_map_zone(context, zone_number); - finish_vdo_completion(parent, resume_vdo_if_quiescent(&zone->state)); + struct block_map *map = context; + struct block_map_zone *zone = &map->zones[zone_number]; + + vdo_finish_completion(parent, vdo_resume_if_quiescent(&zone->state)); } -/**********************************************************************/ -void resume_vdo_block_map(struct block_map *map, struct vdo_completion *parent) +void vdo_resume_block_map(struct block_map *map, struct vdo_completion *parent) { - schedule_vdo_operation(map->action_manager, + vdo_schedule_operation(map->action_manager, VDO_ADMIN_STATE_RESUMING, NULL, resume_block_map_zone, @@ -499,7 +473,9 @@ void resume_vdo_block_map(struct block_map *map, struct vdo_completion *parent) parent); } -/**********************************************************************/ +/* + * Allocate an expanded collection of trees, for a future growth. + */ int vdo_prepare_to_grow_block_map(struct block_map *map, block_count_t new_logical_blocks) { @@ -516,30 +492,24 @@ int vdo_prepare_to_grow_block_map(struct block_map *map, return VDO_SUCCESS; } - return make_vdo_forest(map, new_logical_blocks); + return vdo_make_forest(map, new_logical_blocks); } -/**********************************************************************/ -block_count_t vdo_get_new_entry_count(struct block_map *map) -{ - return map->next_entry_count; -} - -/** - * Grow the block map by replacing the forest with the one which was prepared. - * +/* * Implements vdo_action_preamble - **/ + */ static void grow_forest(void *context, struct vdo_completion *completion) { - replace_vdo_forest(context); - complete_vdo_completion(completion); + vdo_replace_forest(context); + vdo_complete_completion(completion); } -/**********************************************************************/ -void grow_vdo_block_map(struct block_map *map, struct vdo_completion *parent) +/* + * Requires vdo_prepare_to_grow_block_map() to have been previously called. + **/ +void vdo_grow_block_map(struct block_map *map, struct vdo_completion *parent) { - schedule_vdo_operation(map->action_manager, + vdo_schedule_operation(map->action_manager, VDO_ADMIN_STATE_SUSPENDED_OPERATION, grow_forest, NULL, @@ -547,83 +517,66 @@ void grow_vdo_block_map(struct block_map *map, struct vdo_completion *parent) parent); } -/**********************************************************************/ void vdo_abandon_block_map_growth(struct block_map *map) { - abandon_vdo_forest(map); + vdo_abandon_forest(map); } -/** - * Finish processing a block map get or put operation. This function releases - * the page completion and then continues the requester. - * - * @param completion The completion for the page fetch - * @param result The result of the block map operation - **/ +/* + * Release the page completion and then continue the requester. + */ static inline void finish_processing_page(struct vdo_completion *completion, int result) { struct vdo_completion *parent = completion->parent; - release_vdo_page_completion(completion); - continue_vdo_completion(parent, result); + + vdo_release_page_completion(completion); + vdo_continue_completion(parent, result); } -/** - * Handle an error fetching a page from the cache. This error handler is - * registered in setup_mapped_block(). 
- * - * @param completion The page completion which got an error - **/ static void handle_page_error(struct vdo_completion *completion) { finish_processing_page(completion, completion->result); } -/** - * Get the mapping page for a get/put mapped block operation and dispatch to - * the appropriate handler. - * - * @param data_vio The data_vio - * @param modifiable Whether we intend to modify the mapping - * @param action The handler to process the mapping page - **/ +/* + * Fetch the mapping page for a block map update, and call the + * provided handler when fetched. + */ static void -setup_mapped_block(struct data_vio *data_vio, bool modifiable, +fetch_mapping_page(struct data_vio *data_vio, bool modifiable, vdo_action *action) { - struct block_map_zone *zone = - get_vdo_logical_zone_block_map(data_vio->logical.zone); - if (is_vdo_state_draining(&zone->state)) { + struct block_map_zone *zone = data_vio->logical.zone->block_map_zone; + + if (vdo_is_state_draining(&zone->state)) { finish_data_vio(data_vio, VDO_SHUTTING_DOWN); return; } - init_vdo_page_completion(&data_vio->page_completion, + vdo_init_page_completion(&data_vio->page_completion, zone->page_cache, data_vio->tree_lock.tree_slots[0].block_map_slot.pbn, modifiable, data_vio_as_completion(data_vio), action, handle_page_error); - get_vdo_page(&data_vio->page_completion.completion); + vdo_get_page(&data_vio->page_completion.completion); } -/** - * Decode and validate a block map entry and attempt to use it to set the - * mapped location of a data_vio. - * - * @param data_vio The data_vio to update with the map entry - * @param entry The block map entry for the logical block +/* + * Decode and validate a block map entry, and set the mapped location of + * a data_vio. * * @return VDO_SUCCESS or VDO_BAD_MAPPING if the map entry is invalid * or an error code for any other failure - **/ + */ static int __must_check -set_mapped_entry(struct data_vio *data_vio, - const struct block_map_entry *entry) +set_mapped_location(struct data_vio *data_vio, + const struct block_map_entry *entry) { - // Unpack the PBN for logging purposes even if the entry is invalid. - struct data_location mapped = unpack_vdo_block_map_entry(entry); + /* Unpack the PBN for logging purposes even if the entry is invalid. */ + struct data_location mapped = vdo_unpack_block_map_entry(entry); if (vdo_is_valid_location(&mapped)) { int result = set_data_vio_mapped_location(data_vio, mapped.pbn, @@ -639,28 +592,34 @@ set_mapped_entry(struct data_vio *data_vio, } } - // Log the corruption even if we wind up ignoring it for write VIOs, - // converting all cases to VDO_BAD_MAPPING. + /* + * Log the corruption even if we wind up ignoring it for write VIOs, + * converting all cases to VDO_BAD_MAPPING. + */ uds_log_error_strerror(VDO_BAD_MAPPING, "PBN %llu with state %u read from the block map was invalid", (unsigned long long) mapped.pbn, mapped.state); - // A read VIO has no option but to report the bad mapping--reading - // zeros would be hiding known data loss. + /* + * A read VIO has no option but to report the bad mapping--reading + * zeros would be hiding known data loss. + */ if (is_read_data_vio(data_vio)) { return VDO_BAD_MAPPING; } - // A write VIO only reads this mapping to decref the old block. Treat - // this as an unmapped entry rather than fail the write. + /* + * A write VIO only reads this mapping to decref the old block. Treat + * this as an unmapped entry rather than fail the write. 
+ */ clear_data_vio_mapped_location(data_vio); return VDO_SUCCESS; } -/** +/* * This callback is registered in vdo_get_mapped_block(). - **/ + */ static void get_mapping_from_fetched_page(struct vdo_completion *completion) { int result; @@ -674,7 +633,7 @@ static void get_mapping_from_fetched_page(struct vdo_completion *completion) return; } - page = dereference_readable_vdo_page(completion); + page = vdo_dereference_readable_page(completion); result = ASSERT(page != NULL, "page available"); if (result != VDO_SUCCESS) { finish_processing_page(completion, result); @@ -684,59 +643,55 @@ static void get_mapping_from_fetched_page(struct vdo_completion *completion) tree_slot = &data_vio->tree_lock.tree_slots[0]; entry = &page->entries[tree_slot->block_map_slot.slot]; - result = set_mapped_entry(data_vio, entry); + result = set_mapped_location(data_vio, entry); finish_processing_page(completion, result); } -/**********************************************************************/ -void update_vdo_block_map_page(struct block_map_page *page, +void vdo_update_block_map_page(struct block_map_page *page, struct data_vio *data_vio, physical_block_number_t pbn, enum block_mapping_state mapping_state, sequence_number_t *recovery_lock) { - struct block_map_zone *zone = - get_vdo_logical_zone_block_map(data_vio->logical.zone); + struct block_map_zone *zone = data_vio->logical.zone->block_map_zone; struct block_map *block_map = zone->block_map; struct recovery_journal *journal = block_map->journal; sequence_number_t old_locked, new_locked; - // Encode the new mapping. + /* Encode the new mapping. */ struct tree_lock *tree_lock = &data_vio->tree_lock; slot_number_t slot = tree_lock->tree_slots[tree_lock->height].block_map_slot.slot; - page->entries[slot] = pack_vdo_pbn(pbn, mapping_state); + page->entries[slot] = vdo_pack_pbn(pbn, mapping_state); - // Adjust references (locks) on the recovery journal blocks. + /* Adjust references on the recovery journal blocks. */ old_locked = *recovery_lock; new_locked = data_vio->recovery_sequence_number; if ((old_locked == 0) || (old_locked > new_locked)) { - // Acquire a lock on the newly referenced journal block. - acquire_vdo_recovery_journal_block_reference(journal, + vdo_acquire_recovery_journal_block_reference(journal, new_locked, - ZONE_TYPE_LOGICAL, + VDO_ZONE_TYPE_LOGICAL, zone->zone_number); - // If the block originally held a newer lock, release it. if (old_locked > 0) { - release_vdo_recovery_journal_block_reference(journal, + vdo_release_recovery_journal_block_reference(journal, old_locked, - ZONE_TYPE_LOGICAL, + VDO_ZONE_TYPE_LOGICAL, zone->zone_number); } *recovery_lock = new_locked; } - // Release the transferred lock from the data_vio. + /* + * FIXME: explain this more + * Release the transferred lock from the data_vio. + */ vdo_release_journal_per_entry_lock_from_other_zone(journal, new_locked); data_vio->recovery_sequence_number = 0; } -/** - * This callback is registered in vdo_put_mapped_block(). 
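
vdo_update_block_map_page() above keeps each cached block map page holding a reference on the earliest recovery journal block that still covers an uncommitted update to it: the page takes a new reference only when it holds none, or when the new sequence number is earlier than the one it already holds, and then drops the old one. A toy model of that bookkeeping; the reference-count array stands in for the real per-block lock counters and is not VDO code.

::

  #include <stdint.h>
  #include <stdio.h>

  #define JOURNAL_BLOCKS 16

  static unsigned int refs[JOURNAL_BLOCKS];   /* toy per-journal-block references */

  static void update_page_lock(uint64_t *recovery_lock, uint64_t new_locked)
  {
          uint64_t old_locked = *recovery_lock;

          /* Only move the lock when the page holds none, or when the new
           * sequence number is earlier than the one currently held. */
          if (old_locked == 0 || old_locked > new_locked) {
                  refs[new_locked % JOURNAL_BLOCKS]++;
                  if (old_locked > 0)
                          refs[old_locked % JOURNAL_BLOCKS]--;
                  *recovery_lock = new_locked;
          }
  }

  int main(void)
  {
          uint64_t page_lock = 0;

          update_page_lock(&page_lock, 7);   /* first update: lock journal block 7 */
          update_page_lock(&page_lock, 9);   /* later block: keep the earlier lock */
          update_page_lock(&page_lock, 5);   /* earlier block: move the lock back */
          printf("page now locks journal block %llu\n",
                 (unsigned long long)page_lock);
          return 0;
  }
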
- **/ static void put_mapping_in_fetched_page(struct vdo_completion *completion) { struct data_vio *data_vio = as_data_vio(completion->parent); @@ -750,57 +705,63 @@ static void put_mapping_in_fetched_page(struct vdo_completion *completion) return; } - page = dereference_writable_vdo_page(completion); + page = vdo_dereference_writable_page(completion); result = ASSERT(page != NULL, "page available"); if (result != VDO_SUCCESS) { finish_processing_page(completion, result); return; } - context = get_vdo_page_completion_context(completion); + context = vdo_get_page_completion_context(completion); old_lock = context->recovery_lock; - update_vdo_block_map_page(page, + vdo_update_block_map_page(page, data_vio, data_vio->new_mapped.pbn, data_vio->new_mapped.state, &context->recovery_lock); - mark_completed_vdo_page_dirty(completion, old_lock, + vdo_mark_completed_page_dirty(completion, old_lock, context->recovery_lock); finish_processing_page(completion, VDO_SUCCESS); } -/**********************************************************************/ +/* + * Read a stored block mapping into a data_vio. + */ void vdo_get_mapped_block(struct data_vio *data_vio) { if (data_vio->tree_lock.tree_slots[0].block_map_slot.pbn == VDO_ZERO_BLOCK) { - // We know that the block map page for this LBN has not been - // allocated, so the block must be unmapped. + /* + * We know that the block map page for this LBN has not been + * allocated, so the block must be unmapped. + */ clear_data_vio_mapped_location(data_vio); continue_data_vio(data_vio, VDO_SUCCESS); return; } - setup_mapped_block(data_vio, false, get_mapping_from_fetched_page); + fetch_mapping_page(data_vio, false, get_mapping_from_fetched_page); } -/**********************************************************************/ +/* + * Update a stored block mapping to reflect a data_vio's new mapping. + */ void vdo_put_mapped_block(struct data_vio *data_vio) { - setup_mapped_block(data_vio, true, put_mapping_in_fetched_page); + fetch_mapping_page(data_vio, true, put_mapping_in_fetched_page); } -/**********************************************************************/ -struct block_map_statistics get_vdo_block_map_statistics(struct block_map *map) +struct block_map_statistics vdo_get_block_map_statistics(struct block_map *map) { zone_count_t zone = 0; struct block_map_statistics totals; + memset(&totals, 0, sizeof(struct block_map_statistics)); for (zone = 0; zone < map->zone_count; zone++) { struct vdo_page_cache *cache = map->zones[zone].page_cache; struct block_map_statistics stats = - get_vdo_page_cache_statistics(cache); + vdo_get_page_cache_statistics(cache); totals.dirty_pages += stats.dirty_pages; totals.clean_pages += stats.clean_pages; diff --git a/vdo/block-map.h b/vdo/block-map.h new file mode 100644 index 00000000..e421e447 --- /dev/null +++ b/vdo/block-map.h @@ -0,0 +1,139 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright Red Hat + */ + +#ifndef BLOCK_MAP_H +#define BLOCK_MAP_H + +#include "admin-state.h" +#include "block-map-entry.h" +#include "block-map-format.h" +#include "block-map-page.h" +#include "block-map-tree.h" +#include "completion.h" +#include "dirty-lists.h" +#include "header.h" +#include "int-map.h" +#include "statistics.h" +#include "types.h" +#include "vdo-layout.h" +#include "vdo-page-cache.h" +#include "vio-pool.h" + +/* + * The per-zone fields used by the block map tree. 
+ */ +struct block_map_tree_zone { + struct block_map_zone *map_zone; + /* Dirty tree pages, by era*/ + struct dirty_lists *dirty_lists; + vio_count_t active_lookups; + struct int_map *loading_pages; + struct vio_pool *vio_pool; + /* The tree page which has issued or will be issuing a flush */ + struct tree_page *flusher; + struct wait_queue flush_waiters; + /* The generation after the most recent flush */ + uint8_t generation; + uint8_t oldest_generation; + /* The counts of dirty pages in each generation */ + uint32_t dirty_page_counts[256]; +}; + +struct block_map_zone { + zone_count_t zone_number; + thread_id_t thread_id; + struct block_map *block_map; + struct read_only_notifier *read_only_notifier; + struct vdo_page_cache *page_cache; + struct block_map_tree_zone tree_zone; + struct admin_state state; +}; + +struct block_map { + struct action_manager *action_manager; + /* The absolute PBN of the first root of the tree part of the block map */ + physical_block_number_t root_origin; + block_count_t root_count; + + /* The era point we are currently distributing to the zones */ + sequence_number_t current_era_point; + /* The next era point */ + sequence_number_t pending_era_point; + + /* The number of entries in block map */ + block_count_t entry_count; + nonce_t nonce; + struct recovery_journal *journal; + + /* The trees for finding block map pages */ + struct forest *forest; + /* The expanded trees awaiting growth */ + struct forest *next_forest; + /* The number of entries after growth */ + block_count_t next_entry_count; + + zone_count_t zone_count; + struct block_map_zone zones[]; +}; + +int __must_check +vdo_decode_block_map(struct block_map_state_2_0 state, + block_count_t logical_blocks, + const struct thread_config *thread_config, + struct vdo *vdo, + struct read_only_notifier *read_only_notifier, + struct recovery_journal *journal, + nonce_t nonce, + page_count_t cache_size, + block_count_t maximum_age, + struct block_map **map_ptr); + +void vdo_drain_block_map(struct block_map *map, + const struct admin_state_code *operation, + struct vdo_completion *parent); + +void vdo_resume_block_map(struct block_map *map, struct vdo_completion *parent); + +int __must_check +vdo_prepare_to_grow_block_map(struct block_map *map, + block_count_t new_logical_blocks); + +void vdo_grow_block_map(struct block_map *map, struct vdo_completion *parent); + +void vdo_abandon_block_map_growth(struct block_map *map); + +void vdo_free_block_map(struct block_map *map); + +struct block_map_state_2_0 __must_check +vdo_record_block_map(const struct block_map *map); + +void vdo_initialize_block_map_from_journal(struct block_map *map, + struct recovery_journal *journal); + +zone_count_t vdo_compute_logical_zone(struct data_vio *data_vio); + +void vdo_find_block_map_slot(struct data_vio *data_vio, + vdo_action *callback, + thread_id_t thread_id); + +void vdo_advance_block_map_era(struct block_map *map, + sequence_number_t recovery_block_number); + +void vdo_block_map_check_for_drain_complete(struct block_map_zone *zone); + +void vdo_update_block_map_page(struct block_map_page *page, + struct data_vio *data_vio, + physical_block_number_t pbn, + enum block_mapping_state mapping_state, + sequence_number_t *recovery_lock); + +void vdo_get_mapped_block(struct data_vio *data_vio); + +void vdo_put_mapped_block(struct data_vio *data_vio); + +struct block_map_statistics __must_check +vdo_get_block_map_statistics(struct block_map *map); + +#endif /* BLOCK_MAP_H */ diff --git a/vdo/block-mapping-state.h 
b/vdo/block-mapping-state.h new file mode 100644 index 00000000..6880ba4e --- /dev/null +++ b/vdo/block-mapping-state.h @@ -0,0 +1,48 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright Red Hat + */ + +#ifndef BLOCK_MAPPING_STATE_H +#define BLOCK_MAPPING_STATE_H + +#include "type-defs.h" + +/* + * Four bits of each five-byte block map entry contain a mapping state value + * used to distinguish unmapped or trimmed logical blocks (which are treated + * as mapped to the zero block) from entries that have been mapped to a + * physical block, including the zero block. + * + * FIXME: these should maybe be defines. + */ +enum block_mapping_state { + VDO_MAPPING_STATE_UNMAPPED = 0, /* Must be zero to be the default value */ + VDO_MAPPING_STATE_UNCOMPRESSED = 1, /* A normal (uncompressed) block */ + VDO_MAPPING_STATE_COMPRESSED_BASE = 2, /* Compressed in slot 0 */ + VDO_MAPPING_STATE_COMPRESSED_MAX = 15, /* Compressed in slot 13 */ +}; + +enum { + VDO_MAX_COMPRESSION_SLOTS = (VDO_MAPPING_STATE_COMPRESSED_MAX + - VDO_MAPPING_STATE_COMPRESSED_BASE + 1), +}; + +static inline enum block_mapping_state vdo_get_state_for_slot(byte slot_number) +{ + return (slot_number + VDO_MAPPING_STATE_COMPRESSED_BASE); +} + +static inline byte +vdo_get_slot_from_state(enum block_mapping_state mapping_state) +{ + return (mapping_state - VDO_MAPPING_STATE_COMPRESSED_BASE); +} + +static inline bool +vdo_is_state_compressed(const enum block_mapping_state mapping_state) +{ + return (mapping_state > VDO_MAPPING_STATE_UNCOMPRESSED); +} + +#endif /* BLOCK_MAPPING_STATE_H */ diff --git a/vdo/blockAllocator.h b/vdo/blockAllocator.h deleted file mode 100644 index df6ab6ec..00000000 --- a/vdo/blockAllocator.h +++ /dev/null @@ -1,299 +0,0 @@ -/* - * Copyright Red Hat - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/sulfur/src/c++/vdo/base/blockAllocator.h#12 $ - */ - -#ifndef BLOCK_ALLOCATOR_H -#define BLOCK_ALLOCATOR_H - -#include "statistics.h" -#include "types.h" -#include "vioPool.h" -#include "waitQueue.h" - -/** - * Create a block allocator. 
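
A short usage sketch for the block-mapping-state.h helpers added above, restated as a standalone program; the enum values are copied from the header, while the byte type from type-defs.h is not reproduced here.

::

  #include <assert.h>
  #include <stdio.h>

  enum block_mapping_state {
          VDO_MAPPING_STATE_UNMAPPED = 0,
          VDO_MAPPING_STATE_UNCOMPRESSED = 1,
          VDO_MAPPING_STATE_COMPRESSED_BASE = 2,
          VDO_MAPPING_STATE_COMPRESSED_MAX = 15,
  };

  #define MAX_COMPRESSION_SLOTS \
          (VDO_MAPPING_STATE_COMPRESSED_MAX - VDO_MAPPING_STATE_COMPRESSED_BASE + 1)

  int main(void)
  {
          unsigned int slot;

          /* Compression slots 0..13 map onto states 2..15 and back, and
           * every such state reads as compressed. */
          for (slot = 0; slot < MAX_COMPRESSION_SLOTS; slot++) {
                  enum block_mapping_state state =
                          slot + VDO_MAPPING_STATE_COMPRESSED_BASE;

                  assert(state - VDO_MAPPING_STATE_COMPRESSED_BASE == slot);
                  assert(state > VDO_MAPPING_STATE_UNCOMPRESSED);
          }
          printf("%d compression slots, states 2 through 15\n",
                 MAX_COMPRESSION_SLOTS);
          return 0;
  }
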
- * - * @param [in] depot The slab depot for this allocator - * @param [in] zone_number The physical zone number for this allocator - * @param [in] thread_id The thread ID for this allocator's zone - * @param [in] nonce The nonce of the VDO - * @param [in] vio_pool_size The size of the VIO pool - * @param [in] vdo The VDO - * @param [in] read_only_notifier The context for entering read-only mode - * @param [out] allocator_ptr A pointer to hold the allocator - * - * @return A success or error code - **/ -int __must_check -make_vdo_block_allocator(struct slab_depot *depot, - zone_count_t zone_number, - thread_id_t thread_id, - nonce_t nonce, - block_count_t vio_pool_size, - struct vdo *vdo, - struct read_only_notifier *read_only_notifier, - struct block_allocator **allocator_ptr); - -/** - * Destroy a block allocator. - * - * @param allocator The allocator to destroy - **/ -void free_vdo_block_allocator(struct block_allocator *allocator); - -/** - * Queue a slab for allocation or scrubbing. - * - * @param slab The slab to queue - **/ -void queue_vdo_slab(struct vdo_slab *slab); - -/** - * Update the block allocator to reflect an increment or decrement of the free - * block count in a slab. This adjusts the allocated block count and - * reprioritizes the slab when appropriate. - * - * @param slab The slab whose free block count changed - * @param increment True if the free block count went up by one, - * false if it went down by one - **/ -void adjust_vdo_free_block_count(struct vdo_slab *slab, bool increment); - -/** - * Allocate a physical block. - * - * The block allocated will have a provisional reference and the reference - * must be either confirmed with a subsequent increment or vacated with a - * subsequent decrement of the reference count. - * - * @param [in] allocator The block allocator - * @param [out] block_number_ptr A pointer to receive the allocated block - * number - * - * @return UDS_SUCCESS or an error code - **/ -int __must_check allocate_vdo_block(struct block_allocator *allocator, - physical_block_number_t *block_number_ptr); - -/** - * Release an unused provisional reference. - * - * @param allocator The block allocator - * @param pbn The block to dereference - * @param why Why the block was referenced (for logging) - **/ -void release_vdo_block_reference(struct block_allocator *allocator, - physical_block_number_t pbn, - const char *why); - -/** - * Get the number of allocated blocks, which is the total number of - * blocks in all slabs that have a non-zero reference count. - * - * @param allocator The block allocator - * - * @return The number of blocks with a non-zero reference count - **/ -block_count_t __must_check -get_vdo_allocated_blocks(const struct block_allocator *allocator); - -/** - * Get the number of unrecovered slabs. - * - * @param allocator The block allocator - * - * @return The number of slabs that are unrecovered - **/ -block_count_t __must_check -get_vdo_unrecovered_slab_count(const struct block_allocator *allocator); - -/** - * Load the state of an allocator from disk. - * - *

Implements vdo_zone_action. - **/ -void load_vdo_block_allocator(void *context, - zone_count_t zone_number, - struct vdo_completion *parent); - -/** - * Inform a block allocator that its slab journals have been recovered from the - * recovery journal. - * - * @param allocator The allocator to inform - * @param result The result of the recovery operation - **/ -void notify_vdo_slab_journals_are_recovered(struct block_allocator *allocator, - int result); - -/** - * Prepare the block allocator to come online and start allocating blocks. - * - *

Implements vdo_zone_action. - **/ -void prepare_vdo_block_allocator_to_allocate(void *context, - zone_count_t zone_number, - struct vdo_completion *parent); - -/** - * Register a slab with the allocator, ready for use. - * - * @param allocator The allocator to use - * @param slab The slab in question - **/ -void register_vdo_slab_with_allocator(struct block_allocator *allocator, - struct vdo_slab *slab); - -/** - * Register the new slabs belonging to this allocator. - * - *

Implements vdo_zone_action. - **/ -void register_new_vdo_slabs_for_allocator(void *context, - zone_count_t zone_number, - struct vdo_completion *parent); - -/** - * Drain all allocator I/O. Depending upon the type of drain, some or all - * dirty metadata may be written to disk. The type of drain will be determined - * from the state of the allocator's depot. - * - *

Implements vdo_zone_action. - **/ -void drain_vdo_block_allocator(void *context, - zone_count_t zone_number, - struct vdo_completion *parent); - -/** - * Resume a quiescent allocator. - * - *

Implements vdo_zone_action. - **/ -void resume_vdo_block_allocator(void *context, - zone_count_t zone_number, - struct vdo_completion *parent); - -/** - * Request a commit of all dirty tail blocks which are locking a given recovery - * journal block. - * - *

Implements vdo_zone_action. - **/ -void release_vdo_tail_block_locks(void *context, - zone_count_t zone_number, - struct vdo_completion *parent); - -/** - * Get the slab summary zone for an allocator. - * - * @param allocator The allocator - * - * @return The slab_summary_zone for that allocator - **/ -struct slab_summary_zone * __must_check -get_vdo_slab_summary_zone(const struct block_allocator *allocator); - -/** - * Acquire a VIO from a block allocator's VIO pool (asynchronous). - * - * @param allocator The allocator from which to get a VIO - * @param waiter The object requesting the VIO - * - * @return VDO_SUCCESS or an error - **/ -int __must_check -acquire_vdo_block_allocator_vio(struct block_allocator *allocator, - struct waiter *waiter); - -/** - * Return a VIO to a block allocator's VIO pool - * - * @param allocator The block allocator which owns the VIO - * @param entry The VIO being returned - **/ -void return_vdo_block_allocator_vio(struct block_allocator *allocator, - struct vio_pool_entry *entry); - -/** - * Initiate scrubbing all unrecovered slabs. - * - *

Implements vdo_zone_action. - **/ -void scrub_all_unrecovered_vdo_slabs_in_zone(void *context, - zone_count_t zone_number, - struct vdo_completion *parent); - -/** - * Queue a waiter for a clean slab. - * - * @param allocator The allocator to wait on - * @param waiter The waiter - * - * @return VDO_SUCCESS if the waiter was queued, VDO_NO_SPACE if there are no - * slabs to scrub, and some other error otherwise - **/ -int __must_check enqueue_for_clean_vdo_slab(struct block_allocator *allocator, - struct waiter *waiter); - -/** - * Increase the scrubbing priority of a slab. - * - * @param slab The slab - **/ -void increase_vdo_slab_scrubbing_priority(struct vdo_slab *slab); - -/** - * Get the statistics for this allocator. - * - * @param allocator The allocator to query - * - * @return A copy of the current statistics for the allocator - **/ -struct block_allocator_statistics __must_check -get_vdo_block_allocator_statistics(const struct block_allocator *allocator); - -/** - * Get the aggregated slab journal statistics for the slabs in this allocator. - * - * @param allocator The allocator to query - * - * @return A copy of the current statistics for the allocator - **/ -struct slab_journal_statistics __must_check -get_vdo_slab_journal_statistics(const struct block_allocator *allocator); - -/** - * Get the cumulative ref_counts statistics for the slabs in this allocator. - * - * @param allocator The allocator to query - * - * @return A copy of the current statistics for the allocator - **/ -struct ref_counts_statistics __must_check -get_vdo_ref_counts_statistics(const struct block_allocator *allocator); - -/** - * Dump information about a block allocator to the log for debugging. - * - * @param allocator The allocator to dump - **/ -void dump_vdo_block_allocator(const struct block_allocator *allocator); - -#endif // BLOCK_ALLOCATOR_H diff --git a/vdo/blockAllocatorInternals.h b/vdo/blockAllocatorInternals.h deleted file mode 100644 index 390e1ec6..00000000 --- a/vdo/blockAllocatorInternals.h +++ /dev/null @@ -1,127 +0,0 @@ -/* - * Copyright Red Hat - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/sulfur/src/c++/vdo/base/blockAllocatorInternals.h#3 $ - */ - -#ifndef BLOCK_ALLOCATOR_INTERNALS_H -#define BLOCK_ALLOCATOR_INTERNALS_H - -#include "adminState.h" -#include "blockAllocator.h" -#include "priorityTable.h" -#include "slabScrubber.h" -#include "statistics.h" -#include "vioPool.h" - -enum { - /* - * The number of vios in the vio pool is proportional to the throughput - * of the VDO. 
- */ - VIO_POOL_SIZE = 128, -}; - -enum block_allocator_drain_step { - VDO_DRAIN_ALLOCATOR_START = 0, - VDO_DRAIN_ALLOCATOR_STEP_SCRUBBER, - VDO_DRAIN_ALLOCATOR_STEP_SLABS, - VDO_DRAIN_ALLOCATOR_STEP_SUMMARY, - VDO_DRAIN_ALLOCATOR_STEP_FINISHED, -}; - -/** - * A sub-structure for applying actions in parallel to all an allocator's - * slabs. - **/ -struct slab_actor { - /** The number of slabs performing a slab action */ - slab_count_t slab_action_count; - /** The method to call when a slab action has been completed by all - * slabs */ - vdo_action *callback; -}; - -struct block_allocator { - struct vdo_completion completion; - /** The slab depot for this allocator */ - struct slab_depot *depot; - /** The slab summary zone for this allocator */ - struct slab_summary_zone *summary; - /** The notifier for entering read-only mode */ - struct read_only_notifier *read_only_notifier; - /** The nonce of the VDO */ - nonce_t nonce; - /** The physical zone number of this allocator */ - zone_count_t zone_number; - /** The thread ID for this allocator's physical zone */ - thread_id_t thread_id; - /** The number of slabs in this allocator */ - slab_count_t slab_count; - /** The number of the last slab owned by this allocator */ - slab_count_t last_slab; - /** The reduced priority level used to preserve unopened slabs */ - unsigned int unopened_slab_priority; - /** The state of this allocator */ - struct admin_state state; - /** The actor for applying an action to all slabs */ - struct slab_actor slab_actor; - - /** The slab from which blocks are currently being allocated */ - struct vdo_slab *open_slab; - /** A priority queue containing all slabs available for allocation */ - struct priority_table *prioritized_slabs; - /** The slab scrubber */ - struct slab_scrubber *slab_scrubber; - /** What phase of the close operation the allocator is to perform */ - enum block_allocator_drain_step drain_step; - - /* - * These statistics are all mutated only by the physical zone thread, - * but are read by other threads when gathering statistics for the - * entire depot. - */ - /** - * The count of allocated blocks in this zone. Not in - * block_allocator_statistics for historical reasons. - **/ - uint64_t allocated_blocks; - /** Statistics for this block allocator */ - struct block_allocator_statistics statistics; - /** Cumulative statistics for the slab journals in this zone */ - struct slab_journal_statistics slab_journal_statistics; - /** Cumulative statistics for the ref_counts in this zone */ - struct ref_counts_statistics ref_counts_statistics; - - /** - * This is the head of a queue of slab journals which have entries in - * their tail blocks which have not yet started to commit. When the - * recovery journal is under space pressure, slab journals which have - * uncommitted entries holding a lock on the recovery journal head are - * forced to commit their blocks early. This list is kept in order, - * with the tail containing the slab journal holding the most recent - * recovery journal lock. 
- **/ - struct list_head dirty_slab_journals; - - /** The vio pool for reading and writing block allocator metadata */ - struct vio_pool *vio_pool; -}; - - -#endif // BLOCK_ALLOCATOR_INTERNALS_H diff --git a/vdo/blockMap.h b/vdo/blockMap.h deleted file mode 100644 index feb3737d..00000000 --- a/vdo/blockMap.h +++ /dev/null @@ -1,246 +0,0 @@ -/* - * Copyright Red Hat - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/sulfur/src/c++/vdo/base/blockMap.h#15 $ - */ - -#ifndef BLOCK_MAP_H -#define BLOCK_MAP_H - -#include "adminState.h" -#include "blockMapEntry.h" -#include "blockMapFormat.h" -#include "blockMapPage.h" -#include "completion.h" -#include "fixedLayout.h" -#include "statistics.h" -#include "types.h" - -/** - * Make a block map and configure it with the state read from the super block. - * - * @param [in] state The block map state from the super block - * @param [in] logical_blocks The number of logical blocks for the VDO - * @param [in] thread_config The thread configuration of the VDO - * @param [in] vdo The vdo - * @param [in] read_only_notifier The read only mode context - * @param [in] journal The recovery journal (may be NULL) - * @param [in] nonce The nonce to distinguish initialized pages - * @param [in] cache_size The block map cache size, in pages - * @param [in] maximum_age The number of journal blocks before a - * dirtied page - * @param [out] map_ptr The pointer to hold the new block map - * - * @return VDO_SUCCESS or an error code - **/ -int __must_check -decode_vdo_block_map(struct block_map_state_2_0 state, - block_count_t logical_blocks, - const struct thread_config *thread_config, - struct vdo *vdo, - struct read_only_notifier *read_only_notifier, - struct recovery_journal *journal, - nonce_t nonce, - page_count_t cache_size, - block_count_t maximum_age, - struct block_map **map_ptr); - -/** - * Quiesce all block map I/O, possibly writing out all dirty metadata. - * - * @param map The block map to drain - * @param operation The type of drain to perform - * @param parent The completion to notify when the drain is complete - **/ -void drain_vdo_block_map(struct block_map *map, - const struct admin_state_code *operation, - struct vdo_completion *parent); - -/** - * Resume I/O for a quiescent block map. - * - * @param map The block map to resume - * @param parent The completion to notify when the resume is complete - **/ -void resume_vdo_block_map(struct block_map *map, struct vdo_completion *parent); - -/** - * Prepare to grow the block map by allocating an expanded collection of trees. 
- * - * @param map The block map to grow - * @param new_logical_blocks The new logical size of the VDO - * - * @return VDO_SUCCESS or an error - **/ -int __must_check -vdo_prepare_to_grow_block_map(struct block_map *map, - block_count_t new_logical_blocks); - -/** - * Get the logical size to which this block map is prepared to grow. - * - * @param map The block map - * - * @return The new number of entries the block map will be grown to or 0 if - * the block map is not prepared to grow - **/ -block_count_t __must_check vdo_get_new_entry_count(struct block_map *map); - -/** - * Grow a block map on which vdo_prepare_to_grow_block_map() has already been - *called. - * - * @param map The block map to grow - * @param parent The object to notify when the growth is complete - **/ -void grow_vdo_block_map(struct block_map *map, struct vdo_completion *parent); - -/** - * Abandon any preparations which were made to grow this block map. - * - * @param map The map which won't be grown - **/ -void vdo_abandon_block_map_growth(struct block_map *map); - -/** - * Free a block map. - * - * @param map The block map to free - **/ -void free_vdo_block_map(struct block_map *map); - -/** - * Record the state of a block map for encoding in a super block. - * - * @param map The block map to encode - * - * @return The state of the block map - **/ -struct block_map_state_2_0 __must_check -record_vdo_block_map(const struct block_map *map); - -/** - * Obtain any necessary state from the recovery journal that is needed for - * normal block map operation. - * - * @param map The map in question - * @param journal The journal to initialize from - **/ -void initialize_vdo_block_map_from_journal(struct block_map *map, - struct recovery_journal *journal); - -/** - * Get the portion of the block map for a given logical zone. - * - * @param map The map - * @param zone_number The number of the zone - * - * @return The requested block map zone - **/ -struct block_map_zone * __must_check -vdo_get_block_map_zone(struct block_map *map, zone_count_t zone_number); - -/** - * Compute the logical zone on which the entry for a data_vio - * resides - * - * @param data_vio The data_vio - * - * @return The logical zone number for the data_vio - **/ -zone_count_t vdo_compute_logical_zone(struct data_vio *data_vio); - -/** - * Compute the block map slot in which the block map entry for a data_vio - * resides, and cache that number in the data_vio. - * - * @param data_vio The data_vio - * @param callback The function to call once the slot has been found - * @param thread_id The thread on which to run the callback - **/ -void vdo_find_block_map_slot(struct data_vio *data_vio, - vdo_action *callback, - thread_id_t thread_id); - -/** - * Get number of block map entries. - * - * @param map The block map - * - * @return The number of entries stored in the map - **/ -block_count_t __must_check -vdo_get_number_of_block_map_entries(const struct block_map *map); - -/** - * Notify the block map that the recovery journal has finished a new block. - * This method must be called from the journal zone thread. - * - * @param map The block map - * @param recovery_block_number The sequence number of the finished recovery - * journal block - **/ -void advance_vdo_block_map_era(struct block_map *map, - sequence_number_t recovery_block_number); - - -/** - * Update an entry on a block map page. 
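The growth interface above is a two-phase protocol: the expanded forest is allocated up front by the prepare call, and is later either swapped in or abandoned. The sketch below shows how a caller might drive it; it is illustrative only, the 'commit' flag and the completion handling are simplifications, and the real calling context (admin thread, suspension requirements) is not represented. ::

  /* Illustrative only: two-phase block map growth using the calls above. */
  static int example_grow_logical(struct block_map *map,
                                  block_count_t new_logical_blocks,
                                  struct vdo_completion *parent,
                                  bool commit)
  {
          /* Phase 1: allocate the expanded forest without changing anything. */
          int result = vdo_prepare_to_grow_block_map(map, new_logical_blocks);

          if (result != VDO_SUCCESS)
                  return result;

          if (!commit) {
                  /* The caller changed its mind; release the prepared pages. */
                  vdo_abandon_block_map_growth(map);
                  return VDO_SUCCESS;
          }

          /* Phase 2: swap in the prepared forest; 'parent' is notified when done. */
          grow_vdo_block_map(map, parent);
          return VDO_SUCCESS;
  }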
- * - * @param [in] page The page to update - * @param [in] data_vio The data_vio making the update - * @param [in] pbn The new PBN for the entry - * @param [in] mapping_state The new mapping state for the entry - * @param [in,out] recovery_lock A reference to the current recovery sequence - * number lock held by the page. Will be updated - * if the lock changes to protect the new entry - **/ -void update_vdo_block_map_page(struct block_map_page *page, - struct data_vio *data_vio, - physical_block_number_t pbn, - enum block_mapping_state mapping_state, - sequence_number_t *recovery_lock); - -/** - * Get the block number of the physical block containing the data for the - * specified logical block number. All blocks are mapped to physical block - * zero by default, which is conventionally the zero block. - * - * @param data_vio The data_vio of the block to map - **/ -void vdo_get_mapped_block(struct data_vio *data_vio); - -/** - * Associate the logical block number for a block represented by a data_vio - * with the physical block number in its new_mapped field. - * - * @param data_vio The data_vio of the block to map - **/ -void vdo_put_mapped_block(struct data_vio *data_vio); - -/** - * Get the stats for the block map page cache. - * - * @param map The block map containing the cache - * - * @return The block map statistics - **/ -struct block_map_statistics __must_check -get_vdo_block_map_statistics(struct block_map *map); - -#endif // BLOCK_MAP_H diff --git a/vdo/blockMapEntry.h b/vdo/blockMapEntry.h deleted file mode 100644 index 28a07166..00000000 --- a/vdo/blockMapEntry.h +++ /dev/null @@ -1,113 +0,0 @@ -/* - * Copyright Red Hat - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/sulfur/src/c++/vdo/base/blockMapEntry.h#10 $ - */ - -#ifndef BLOCK_MAP_ENTRY_H -#define BLOCK_MAP_ENTRY_H - -#include "blockMappingState.h" -#include "constants.h" -#include "numeric.h" -#include "types.h" - -/** - * The entry for each logical block in the block map is encoded into five - * bytes, which saves space in both the on-disk and in-memory layouts. It - * consists of the 36 low-order bits of a physical_block_number_t - * (addressing 256 terabytes with a 4KB block size) and a 4-bit encoding of a - * block_mapping_state. - **/ -struct block_map_entry { - /** - * Bits 7..4: The four highest bits of the 36-bit physical block - * number - * Bits 3..0: The 4-bit block_mapping_state - **/ -#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ - unsigned mapping_state : 4; - unsigned pbn_high_nibble : 4; -#else - unsigned pbn_high_nibble : 4; - unsigned mapping_state : 4; -#endif - - /** - * 32 low-order bits of the 36-bit PBN, in little-endian byte - * order - */ - __le32 pbn_low_word; -} __packed; - -/** - * Unpack the fields of a block_map_entry, returning them as a data_location. 
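As a worked example of the five-byte layout just described (36-bit PBN plus 4-bit mapping state), the standalone program below packs and unpacks an entry using plain byte arithmetic. It is not the kernel code itself: the real struct uses bitfields and __le32 accessors, and the names here are invented for illustration. ::

  #include <assert.h>
  #include <stdint.h>

  /* Simplified stand-in for the on-disk 5-byte block map entry. */
  struct example_entry {
          uint8_t state_and_high; /* low nibble: state, high nibble: PBN bits 35..32 */
          uint8_t pbn_low[4];     /* PBN bits 31..0, little-endian */
  };

  static struct example_entry example_pack(uint64_t pbn, unsigned int state)
  {
          struct example_entry entry = {
                  .state_and_high = (uint8_t) ((state & 0x0F) |
                                               (((pbn >> 32) & 0x0F) << 4)),
                  .pbn_low = {
                          (uint8_t) (pbn & 0xFF),
                          (uint8_t) ((pbn >> 8) & 0xFF),
                          (uint8_t) ((pbn >> 16) & 0xFF),
                          (uint8_t) ((pbn >> 24) & 0xFF),
                  },
          };
          return entry;
  }

  static uint64_t example_unpack_pbn(const struct example_entry *entry)
  {
          uint64_t low = (uint64_t) entry->pbn_low[0] |
                         ((uint64_t) entry->pbn_low[1] << 8) |
                         ((uint64_t) entry->pbn_low[2] << 16) |
                         ((uint64_t) entry->pbn_low[3] << 24);
          return ((uint64_t) (entry->state_and_high >> 4) << 32) | low;
  }

  int main(void)
  {
          /* A 36-bit PBN; state 3 would be a compressed mapping (slot 1). */
          struct example_entry entry = example_pack(0x123456789ULL, 3);

          assert(example_unpack_pbn(&entry) == 0x123456789ULL);
          assert((entry.state_and_high & 0x0F) == 3);
          return 0;
  }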
- * - * @param entry A pointer to the entry to unpack - * - * @return the location of the data mapped by the block map entry - **/ -static inline struct data_location -unpack_vdo_block_map_entry(const struct block_map_entry *entry) -{ - physical_block_number_t low32 = __le32_to_cpu(entry->pbn_low_word); - physical_block_number_t high4 = entry->pbn_high_nibble; - return (struct data_location) { - .pbn = ((high4 << 32) | low32), - .state = entry->mapping_state, - }; -} - -/**********************************************************************/ -static inline bool vdo_is_mapped_location(const struct data_location *location) -{ - return (location->state != VDO_MAPPING_STATE_UNMAPPED); -} - -/**********************************************************************/ -static inline bool vdo_is_valid_location(const struct data_location *location) -{ - if (location->pbn == VDO_ZERO_BLOCK) { - return !vdo_is_state_compressed(location->state); - } else { - return vdo_is_mapped_location(location); - } -} - -/** - * Pack a physical_block_number_t into a block_map_entry. - * - * @param pbn The physical block number to convert to its - * packed five-byte representation - * @param mapping_state The mapping state of the block - * - * @return the packed representation of the block number and mapping state - * - * @note unrepresentable high bits of the unpacked PBN are silently truncated - **/ -static inline struct block_map_entry -pack_vdo_pbn(physical_block_number_t pbn, enum block_mapping_state mapping_state) -{ - return (struct block_map_entry) { - .mapping_state = (mapping_state & 0x0F), - .pbn_high_nibble = ((pbn >> 32) & 0x0F), - .pbn_low_word = __cpu_to_le32(pbn & UINT_MAX), - }; -} - -#endif // BLOCK_MAP_ENTRY_H diff --git a/vdo/blockMapFormat.h b/vdo/blockMapFormat.h deleted file mode 100644 index d765dd2a..00000000 --- a/vdo/blockMapFormat.h +++ /dev/null @@ -1,132 +0,0 @@ -/* - * Copyright Red Hat - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/sulfur/src/c++/vdo/base/blockMapFormat.h#8 $ - */ - -#ifndef BLOCK_MAP_FORMAT_H -#define BLOCK_MAP_FORMAT_H - -#include "buffer.h" - -#include "constants.h" -#include "header.h" -#include "types.h" - -struct block_map_state_2_0 { - physical_block_number_t flat_page_origin; - block_count_t flat_page_count; - physical_block_number_t root_origin; - block_count_t root_count; -} __packed; - -struct boundary { - page_number_t levels[VDO_BLOCK_MAP_TREE_HEIGHT]; -}; - -extern const struct header VDO_BLOCK_MAP_HEADER_2_0; - -/** - * Compute the number of the block map page on which the entry for a given - * logical block resides. 
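The page and slot helpers that follow split a logical block number with a simple divide and modulo. As a rough worked example, assuming a hypothetical entries-per-page value of 812 (the real constant is derived from the page format and is not restated here): ::

  #include <assert.h>
  #include <stdint.h>

  enum { EXAMPLE_ENTRIES_PER_PAGE = 812 }; /* assumption for illustration only */

  int main(void)
  {
          uint64_t lbn = 100000;
          uint64_t page = lbn / EXAMPLE_ENTRIES_PER_PAGE; /* which block map page */
          uint64_t slot = lbn % EXAMPLE_ENTRIES_PER_PAGE; /* which entry on it */

          assert(page == 123);
          assert(slot == 124);
          return 0;
  }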
- * - * @param lbn The logical block number whose page is desired - * - * @return The number of the block map page containing the entry for - * the given logical block number - **/ -static inline page_number_t __must_check -vdo_compute_page_number(logical_block_number_t lbn) -{ - return (lbn / VDO_BLOCK_MAP_ENTRIES_PER_PAGE); -} - -/** - * Find the block map page slot in which the entry for a given logical - * block resides. - * - * @param lbn The logical block number whose slot - * - * @return The slot containing the entry for the given logical block number - **/ -static inline slot_number_t __must_check -vdo_compute_slot(logical_block_number_t lbn) -{ - return (lbn % VDO_BLOCK_MAP_ENTRIES_PER_PAGE); -} - -/** - * Decode block map component state version 2.0 from a buffer. - * - * @param buffer A buffer positioned at the start of the encoding - * @param state The state structure to receive the decoded values - * - * @return UDS_SUCCESS or an error code - **/ -int __must_check -decode_vdo_block_map_state_2_0(struct buffer *buffer, - struct block_map_state_2_0 *state); - -/** - * Get the size of the encoded state of a block map. - * - * @return The encoded size of the map's state - **/ -size_t __must_check get_vdo_block_map_encoded_size(void); - -/** - * Encode the state of a block map into a buffer. - * - * @param state The block map state to encode - * @param buffer The buffer to encode into - * - * @return UDS_SUCCESS or an error - **/ -int __must_check -encode_vdo_block_map_state_2_0(struct block_map_state_2_0 state, - struct buffer *buffer); - -/** - * Compute the number of pages required for a block map with the specified - * parameters. - * - * @param entries The number of block map entries - * - * @return The number of pages required - **/ -page_count_t compute_vdo_block_map_page_count(block_count_t entries); - -/** - * Compute the number of pages which must be allocated at each level in order - * to grow the forest to a new number of entries. - * - * @param [in] root_count The number of roots - * @param [in] old_sizes The current size of the forest at each level - * @param [in] entries The new number of entries the block map must - * address - * @param [out] new_sizes The new size of the forest at each level - * - * @return The total number of non-leaf pages required - **/ -block_count_t __must_check -vdo_compute_new_forest_pages(root_count_t root_count, - struct boundary *old_sizes, - block_count_t entries, - struct boundary *new_sizes); - -#endif // BLOCK_MAP_FORMAT_H diff --git a/vdo/blockMapInternals.h b/vdo/blockMapInternals.h deleted file mode 100644 index f164ebd6..00000000 --- a/vdo/blockMapInternals.h +++ /dev/null @@ -1,125 +0,0 @@ -/* - * Copyright Red Hat - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. 
- * - * $Id: //eng/vdo-releases/sulfur/src/c++/vdo/base/blockMapInternals.h#4 $ - */ - -#ifndef BLOCK_MAP_INTERNALS_H -#define BLOCK_MAP_INTERNALS_H - -#include "adminState.h" -#include "blockMapEntry.h" -#include "blockMapTree.h" -#include "completion.h" -#include "dirtyLists.h" -#include "header.h" -#include "intMap.h" -#include "types.h" -#include "vdoPageCache.h" -#include "vioPool.h" - -/** - * The per-zone fields used by the block map tree. - **/ -struct block_map_tree_zone { - /** The struct block_map_zone which owns this tree zone */ - struct block_map_zone *map_zone; - /** The lists of dirty tree pages */ - struct dirty_lists *dirty_lists; - /** The number of tree lookups in progress */ - vio_count_t active_lookups; - /** The map of pages currently being loaded */ - struct int_map *loading_pages; - /** The pool of vios for tree I/O */ - struct vio_pool *vio_pool; - /** The tree page which has issued or will be issuing a flush */ - struct tree_page *flusher; - /** The queue of pages waiting for a flush so they can be written out */ - struct wait_queue flush_waiters; - /** The generation after the most recent flush */ - uint8_t generation; - /** The oldest active generation */ - uint8_t oldest_generation; - /** The counts of dirty pages in each generation */ - uint32_t dirty_page_counts[256]; -}; - -/** - * The per-zone fields of the block map. - **/ -struct block_map_zone { - /** The number of the zone this is */ - zone_count_t zone_number; - /** The ID of this zone's logical thread */ - thread_id_t thread_id; - /** The block_map which owns this block_map_zone */ - struct block_map *block_map; - /** The read_only_notifier of the VDO */ - struct read_only_notifier *read_only_notifier; - /** The page cache for this zone */ - struct vdo_page_cache *page_cache; - /** The per-zone portion of the tree for this zone */ - struct block_map_tree_zone tree_zone; - /** The administrative state of the zone */ - struct admin_state state; -}; - -struct block_map { - /** The manager for block map actions */ - struct action_manager *action_manager; - /** - * The absolute PBN of the first root of the tree part of the block map - */ - physical_block_number_t root_origin; - /** The count of root pages of the tree part of the block map */ - block_count_t root_count; - - /** The era point we are currently distributing to the zones */ - sequence_number_t current_era_point; - /** The next era point, not yet distributed to any zone */ - sequence_number_t pending_era_point; - - /** The number of entries in block map */ - block_count_t entry_count; - /** The VDO's nonce, for the pages */ - nonce_t nonce; - /** The recovery journal for this map */ - struct recovery_journal *journal; - - /** The trees for finding block map pages */ - struct forest *forest; - /** The expanded trees awaiting growth */ - struct forest *next_forest; - /** The number of entries after growth */ - block_count_t next_entry_count; - - /** The number of logical zones */ - zone_count_t zone_count; - /** The per zone block map structure */ - struct block_map_zone zones[]; -}; - -/** - * Check whether a zone of the block map has drained, and if so, send a - * notification thereof. 
- * - * @param zone The zone to check - **/ -void vdo_check_for_drain_complete(struct block_map_zone *zone); - -#endif // BLOCK_MAP_INTERNALS_H diff --git a/vdo/blockMapPage.c b/vdo/blockMapPage.c deleted file mode 100644 index 8770048d..00000000 --- a/vdo/blockMapPage.c +++ /dev/null @@ -1,84 +0,0 @@ -/* - * Copyright Red Hat - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/sulfur/src/c++/vdo/base/blockMapPage.c#13 $ - */ - -#include "blockMapPage.h" - -#include "permassert.h" - -#include "constants.h" -#include "statusCodes.h" -#include "types.h" - -enum { - PAGE_HEADER_4_1_SIZE = 8 + 8 + 8 + 1 + 1 + 1 + 1, -}; - -static const struct version_number BLOCK_MAP_4_1 = { - .major_version = 4, - .minor_version = 1, -}; - -/**********************************************************************/ -bool is_current_vdo_block_map_page(const struct block_map_page *page) -{ - return are_same_vdo_version(BLOCK_MAP_4_1, - unpack_vdo_version_number(page->version)); -} - -/**********************************************************************/ -struct block_map_page *format_vdo_block_map_page(void *buffer, - nonce_t nonce, - physical_block_number_t pbn, - bool initialized) -{ - struct block_map_page *page = (struct block_map_page *) buffer; - memset(buffer, 0, VDO_BLOCK_SIZE); - page->version = pack_vdo_version_number(BLOCK_MAP_4_1); - page->header.nonce = __cpu_to_le64(nonce); - page->header.pbn = __cpu_to_le64(pbn); - page->header.initialized = initialized; - return page; -} - -/**********************************************************************/ -enum block_map_page_validity -validate_vdo_block_map_page(struct block_map_page *page, - nonce_t nonce, - physical_block_number_t pbn) -{ - // Make sure the page layout isn't accidentally changed by changing the - // length of the page header. - STATIC_ASSERT_SIZEOF(struct block_map_page_header, - PAGE_HEADER_4_1_SIZE); - - if (!are_same_vdo_version(BLOCK_MAP_4_1, - unpack_vdo_version_number(page->version)) || - !is_vdo_block_map_page_initialized(page) || - (nonce != __le64_to_cpu(page->header.nonce))) { - return VDO_BLOCK_MAP_PAGE_INVALID; - } - - if (pbn != get_vdo_block_map_page_pbn(page)) { - return VDO_BLOCK_MAP_PAGE_BAD; - } - - return VDO_BLOCK_MAP_PAGE_VALID; -} diff --git a/vdo/blockMapPage.h b/vdo/blockMapPage.h deleted file mode 100644 index c379449e..00000000 --- a/vdo/blockMapPage.h +++ /dev/null @@ -1,171 +0,0 @@ -/* - * Copyright Red Hat - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. 
- * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/sulfur/src/c++/vdo/base/blockMapPage.h#11 $ - */ - -#ifndef BLOCK_MAP_PAGE_H -#define BLOCK_MAP_PAGE_H - -#include "numeric.h" - -#include "blockMapEntry.h" -#include "header.h" -#include "types.h" - -/** - * The packed, on-disk representation of a block map page header. - **/ -struct block_map_page_header { - /** - * The 64-bit nonce of the current VDO, in little-endian byte order. - * Used to determine whether or not a page has been formatted. - **/ - __le64 nonce; - - /** The 64-bit PBN of this page, in little-endian byte order */ - __le64 pbn; - - /** Formerly recovery_sequence_number; may be non-zero on disk */ - byte unused_long_word[8]; - - /** - * Whether this page has been initialized on disk (i.e. written twice) - */ - bool initialized; - - /** - * Formerly entry_offset; now unused since it should always be zero - */ - byte unused_byte1; - - /** Formerly interior_tree_page_writing; may be non-zero on disk */ - byte unused_byte2; - - /** - * Formerly generation (for dirty tree pages); may be non-zero on disk - */ - byte unused_byte3; -} __packed; - -/** - * The format of a block map page. - **/ -struct block_map_page { - struct packed_version_number version; - struct block_map_page_header header; - struct block_map_entry entries[]; -} __packed; - -enum block_map_page_validity { - // A block map page is correctly initialized - VDO_BLOCK_MAP_PAGE_VALID, - // A block map page is uninitialized - VDO_BLOCK_MAP_PAGE_INVALID, - // A block map page is intialized, but is the wrong page - VDO_BLOCK_MAP_PAGE_BAD, -}; - -/** - * Check whether a block map page has been initialized. - * - * @param page The page to check - * - * @return true if the page has been initialized - **/ -static inline bool __must_check -is_vdo_block_map_page_initialized(const struct block_map_page *page) -{ - return page->header.initialized; -} - -/** - * Mark whether a block map page has been initialized. - * - * @param page The page to mark - * @param initialized The state to set - * - * @return true if the initialized flag was modified - **/ -static inline bool -mark_vdo_block_map_page_initialized(struct block_map_page *page, - bool initialized) -{ - if (initialized == page->header.initialized) { - return false; - } - - page->header.initialized = initialized; - return true; -} - -/** - * Get the physical block number where a block map page is stored. - * - * @param page The page to query - * - * @return the page's physical block number - **/ -static inline physical_block_number_t __must_check -get_vdo_block_map_page_pbn(const struct block_map_page *page) -{ - return __le64_to_cpu(page->header.pbn); -} - -/** - * Check whether a block map page is of the current version. - * - * @param page The page to check - * - * @return true if the page has the current version - **/ -bool __must_check -is_current_vdo_block_map_page(const struct block_map_page *page); - -/** - * Format a block map page in memory. 
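To make the format/validate pair concrete, the sketch below formats a 4096-byte buffer as a block map page and then checks it the way a newly read page would be checked. It is illustrative only; it reuses the declarations shown in this file, and the buffer, nonce, and PBN values are arbitrary stand-ins supplied by the caller. ::

  /* Illustrative only: format an in-memory page and confirm it validates. */
  static bool example_format_and_check(void *buffer, nonce_t nonce,
                                       physical_block_number_t pbn)
  {
          /* 'buffer' must be at least VDO_BLOCK_SIZE (4096) bytes. */
          struct block_map_page *page =
                  format_vdo_block_map_page(buffer, nonce, pbn, true);

          /* A page formatted with the same nonce and PBN should be valid. */
          return (validate_vdo_block_map_page(page, nonce, pbn) ==
                  VDO_BLOCK_MAP_PAGE_VALID);
  }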
- * - * @param buffer The buffer which holds the page - * @param nonce The VDO nonce - * @param pbn The absolute PBN of the page - * @param initialized Whether the page should be marked as initialized - * - * @return the buffer pointer, as a block map page (for convenience) - **/ -struct block_map_page *format_vdo_block_map_page(void *buffer, - nonce_t nonce, - physical_block_number_t pbn, - bool initialized); - -/** - * Check whether a newly read page is valid, upgrading its in-memory format if - * possible and necessary. If the page is valid, clear fields which are not - * meaningful on disk. - * - * @param page The page to validate - * @param nonce The VDO nonce - * @param pbn The expected absolute PBN of the page - * - * @return The validity of the page - **/ -enum block_map_page_validity __must_check -validate_vdo_block_map_page(struct block_map_page *page, - nonce_t nonce, - physical_block_number_t pbn); - -#endif // BLOCK_MAP_PAGE_H diff --git a/vdo/blockMapRecovery.h b/vdo/blockMapRecovery.h deleted file mode 100644 index ab8486a8..00000000 --- a/vdo/blockMapRecovery.h +++ /dev/null @@ -1,57 +0,0 @@ -/* - * Copyright Red Hat - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/sulfur/src/c++/vdo/base/blockMapRecovery.h#8 $ - */ - -#ifndef BLOCK_MAP_RECOVERY_H -#define BLOCK_MAP_RECOVERY_H - -#include "blockMap.h" -#include "blockMappingState.h" -#include "types.h" - -/** - * An explicitly numbered block mapping. Numbering the mappings allows them to - * be sorted by logical block number during recovery while still preserving - * the relative order of journal entries with the same logical block number. - **/ -struct numbered_block_mapping { - // Block map slot to map - struct block_map_slot block_map_slot; - // The encoded block map entry for the LBN - struct block_map_entry block_map_entry; - // The serial number to use during replay - uint32_t number; -} __packed; - -/** - * Recover the block map (normal rebuild). - * - * @param vdo The vdo - * @param entry_count The number of journal entries - * @param journal_entries An array of journal entries to process - * @param parent The completion to notify when the rebuild - * is complete - **/ -void recover_vdo_block_map(struct vdo *vdo, - block_count_t entry_count, - struct numbered_block_mapping *journal_entries, - struct vdo_completion *parent); - -#endif // BLOCK_MAP_RECOVERY_H diff --git a/vdo/blockMapTree.h b/vdo/blockMapTree.h deleted file mode 100644 index 97f50778..00000000 --- a/vdo/blockMapTree.h +++ /dev/null @@ -1,120 +0,0 @@ -/* - * Copyright Red Hat - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. 
- * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/sulfur/src/c++/vdo/base/blockMapTree.h#12 $ - */ - -#ifndef BLOCK_MAP_TREE_H -#define BLOCK_MAP_TREE_H - -#include "constants.h" -#include "types.h" - -struct tree_page; - -/** - * Intialize a block_map_tree_zone. - * - * @param zone The block_map_zone of the tree zone to intialize - * @param vdo The vdo - * @param maximum_age The number of journal blocks before a dirtied page is - * considered old and may be written out - * - * @return VDO_SUCCESS or an error - **/ -int __must_check vdo_initialize_tree_zone(struct block_map_zone *zone, - struct vdo *vdo, - block_count_t maximum_age); - -/** - * Clean up a block_map_tree_zone. - * - * @param tree_zone The zone to clean up - **/ -void vdo_uninitialize_block_map_tree_zone(struct block_map_tree_zone *tree_zone); - -/** - * Set the initial dirty period for a tree zone. - * - * @param tree_zone The tree zone - * @param period The initial dirty period to set - **/ -void vdo_set_tree_zone_initial_period(struct block_map_tree_zone *tree_zone, - sequence_number_t period); - -/** - * Check whether a tree zone is active (i.e. has any active lookups, - * outstanding I/O, or pending I/O). - * - * @param zone The zone to check - * - * @return true if the zone is active - **/ -bool __must_check vdo_is_tree_zone_active(struct block_map_tree_zone *zone); - -/** - * Advance the dirty period for a tree zone. - * - * @param zone The block_map_tree_zone to advance - * @param period The new dirty period - **/ -void vdo_advance_zone_tree_period(struct block_map_tree_zone *zone, - sequence_number_t period); - -/** - * Drain the zone trees, i.e. ensure that all I/O is quiesced. If required by - * the drain type, all dirty block map trees will be written to disk. This - * method must not be called when lookups are active. - * - * @param zone The block_map_tree_zone to drain - **/ -void vdo_drain_zone_trees(struct block_map_tree_zone *zone); - -/** - * Look up the PBN of the block map page for a data_vio's LBN in the arboreal - * block map. If necessary, the block map page will be allocated. Also, the - * ancestors of the block map page will be allocated or loaded if necessary. - * - * @param data_vio The data_vio requesting the lookup - **/ -void vdo_lookup_block_map_pbn(struct data_vio *data_vio); - -/** - * Find the PBN of a leaf block map page. This method may only be used after - * all allocated tree pages have been loaded, otherwise, it may give the wrong - * answer (0). - * - * @param map The block map containing the forest - * @param page_number The page number of the desired block map page - * - * @return The PBN of the page - **/ -physical_block_number_t vdo_find_block_map_page_pbn(struct block_map *map, - page_number_t page_number); - -/** - * Write a tree page or indicate that it has been re-dirtied if it is already - * being written. This method is used when correcting errors in the tree during - * read-only rebuild. 
- * - * @param page The page to write - * @param zone The tree zone managing the page - **/ -void vdo_write_tree_page(struct tree_page *page, struct block_map_tree_zone *zone); - -#endif // BLOCK_MAP_TREE_H diff --git a/vdo/blockMapTreeInternals.h b/vdo/blockMapTreeInternals.h deleted file mode 100644 index 02afc92e..00000000 --- a/vdo/blockMapTreeInternals.h +++ /dev/null @@ -1,117 +0,0 @@ -/* - * Copyright Red Hat - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/sulfur/src/c++/vdo/base/blockMapTreeInternals.h#4 $ - */ - -#ifndef BLOCK_MAP_TREE_INTERNALS_H -#define BLOCK_MAP_TREE_INTERNALS_H - -#include "blockMapTree.h" - -#include "blockMapFormat.h" -#include "blockMapPage.h" -#include "types.h" - -/** A single page of a block map tree */ -struct tree_page { - /** struct waiter for a VIO to write out this page */ - struct waiter waiter; - - /** Dirty list entry */ - struct list_head entry; - - /** - * If this is a dirty tree page, the tree zone flush generation in which - * it was last dirtied. - */ - uint8_t generation; - - /** Whether this page is an interior tree page being written out. */ - bool writing; - - /** - * If this page is being written, the tree zone flush generation of the - * copy of the page being written. - **/ - uint8_t writing_generation; - - /** - * The earliest journal block containing uncommitted updates to this - * page - */ - sequence_number_t recovery_lock; - - /** - * The value of recovery_lock when the this page last started writing - */ - sequence_number_t writing_recovery_lock; - - /** The buffer to hold the on-disk representation of this page */ - char page_buffer[VDO_BLOCK_SIZE]; -}; - -/** - * An invalid PBN used to indicate that the page holding the location of a - * tree root has been "loaded". - **/ -extern const physical_block_number_t VDO_INVALID_PBN; - -/** - * Extract the block_map_page from a tree_page. - * - * @param tree_page The tree_page - * - * @return The block_map_page of the tree_page - **/ -static inline struct block_map_page * __must_check -as_vdo_block_map_page(struct tree_page *tree_page) -{ - return (struct block_map_page *) tree_page->page_buffer; -} - -/** - * Replace the VIOPool in a tree zone. This method is used by unit tests. - * - * @param zone The zone whose pool is to be replaced - * @param vdo The vdo from which to make VIOs - * @param pool_size The size of the new pool - * - * @return VDO_SUCCESS or an error - **/ -int __must_check -vdo_replace_tree_zone_vio_pool(struct block_map_tree_zone *zone, - struct vdo *vdo, - size_t pool_size); - -/** - * Check whether a buffer contains a valid page. If the page is bad, log an - * error. If the page is valid, copy it to the supplied page. 
- * - * @param buffer The buffer to validate (and copy) - * @param nonce The VDO nonce - * @param pbn The absolute PBN of the page - * @param page The page to copy into if valid - * - * @return true if the page was copied (valid) - **/ -bool vdo_copy_valid_page(char *buffer, nonce_t nonce, - physical_block_number_t pbn, - struct block_map_page *page); - -#endif // BLOCK_MAP_TREE_INTERNALS_H diff --git a/vdo/blockMappingState.h b/vdo/blockMappingState.h deleted file mode 100644 index eb7c57cc..00000000 --- a/vdo/blockMappingState.h +++ /dev/null @@ -1,68 +0,0 @@ -/* - * Copyright Red Hat - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/sulfur/src/c++/vdo/base/blockMappingState.h#10 $ - */ - -#ifndef BLOCK_MAPPING_STATE_H -#define BLOCK_MAPPING_STATE_H - -#include "common.h" - -/** - * Four bits of each five-byte block map entry contain a mapping state value - * used to distinguish unmapped or trimmed logical blocks (which are treated - * as mapped to the zero block) from entries that have been mapped to a - * physical block, including the zero block. - **/ -enum block_mapping_state { - VDO_MAPPING_STATE_UNMAPPED = 0, // Must be zero to be the default value - VDO_MAPPING_STATE_UNCOMPRESSED = 1, // A normal (uncompressed) block - VDO_MAPPING_STATE_COMPRESSED_BASE = 2, // Compressed in slot 0 - VDO_MAPPING_STATE_COMPRESSED_MAX = 15, // Compressed in slot 13 -}; - -/** - * The total number of compressed blocks that can live in a physical block. - **/ -enum { - VDO_MAX_COMPRESSION_SLOTS = (VDO_MAPPING_STATE_COMPRESSED_MAX - - VDO_MAPPING_STATE_COMPRESSED_BASE + 1), -}; - -/**********************************************************************/ -static inline enum block_mapping_state vdo_get_state_for_slot(byte slot_number) -{ - return (slot_number + VDO_MAPPING_STATE_COMPRESSED_BASE); -} - -/**********************************************************************/ -static inline byte -vdo_get_slot_from_state(enum block_mapping_state mapping_state) -{ - return (mapping_state - VDO_MAPPING_STATE_COMPRESSED_BASE); -} - -/**********************************************************************/ -static inline bool -vdo_is_state_compressed(const enum block_mapping_state mapping_state) -{ - return (mapping_state > VDO_MAPPING_STATE_UNCOMPRESSED); -} - -#endif // BLOCK_MAPPING_STATE_H diff --git a/uds/buffer.c b/vdo/buffer.c similarity index 66% rename from uds/buffer.c rename to vdo/buffer.c index f3491108..068574f5 100644 --- a/uds/buffer.c +++ b/vdo/buffer.c @@ -1,44 +1,48 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * Copyright Red Hat - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. 
- * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/uds-releases/krusty/src/uds/buffer.c#15 $ */ #include "buffer.h" -#include "bufferPrivate.h" #include "logger.h" -#include "memoryAlloc.h" +#include "memory-alloc.h" #include "numeric.h" #include "permassert.h" -#include "typeDefs.h" +#include "type-defs.h" -/**********************************************************************/ +/* + * This is an implementation of a rolling buffer for marshalling data to and + * from storage. The put methods add data to the end of the buffer and advance + * the end pointer past the new data. The get methods return data from the + * start of the buffer and advance the start pointer past anything + * returned. Data is not actually removed until the buffer is cleared or + * compacted, so the same data can be read multiple times if desired. + */ + +/* + * Create a buffer which wraps an existing byte array. + * + * @param bytes The bytes to wrap + * @param length The length of the buffer + * @param content_length The length of the current contents of the buffer + * @param buffer_ptr A pointer to hold the buffer + * + * @return UDS_SUCCESS or an error code + */ int wrap_buffer(byte *bytes, size_t length, size_t content_length, struct buffer **buffer_ptr) { - int result = ASSERT((content_length <= length), - "content length, %zu, fits in buffer size, %zu", - length, - content_length); + int result; struct buffer *buffer; + + result = ASSERT((content_length <= length), + "content length, %zu, fits in buffer size, %zu", + length, + content_length); + result = UDS_ALLOCATE(1, struct buffer, "buffer", &buffer); if (result != UDS_SUCCESS) { return result; @@ -54,12 +58,21 @@ int wrap_buffer(byte *bytes, return UDS_SUCCESS; } -/**********************************************************************/ +/* + * Create a new buffer and allocate its memory. + * + * @param length The length of the buffer + * @param buffer_ptr A pointer to hold the buffer + * + * @return UDS_SUCCESS or an error code + */ int make_buffer(size_t size, struct buffer **new_buffer) { + int result; byte *data; struct buffer *buffer; - int result = UDS_ALLOCATE(size, byte, "buffer data", &data); + + result = UDS_ALLOCATE(size, byte, "buffer data", &data); if (result != UDS_SUCCESS) { return result; } @@ -75,7 +88,6 @@ int make_buffer(size_t size, struct buffer **new_buffer) return UDS_SUCCESS; } -/**********************************************************************/ void free_buffer(struct buffer *buffer) { if (buffer == NULL) { @@ -89,80 +101,91 @@ void free_buffer(struct buffer *buffer) UDS_FREE(buffer); } -/**********************************************************************/ size_t buffer_length(struct buffer *buffer) { return buffer->length; } -/**********************************************************************/ +/* Return the amount of data currently in the buffer. */ size_t content_length(struct buffer *buffer) { return buffer->end - buffer->start; } -/**********************************************************************/ +/* Return the amount of data that has already been processed. 
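The rolling-buffer semantics described above (puts append at the end pointer, gets consume from the start pointer) can be seen in a short round trip through this interface. The sketch below uses only functions declared in this file; error handling is collapsed for brevity and the values are arbitrary. ::

  /* Illustrative round trip: marshal two fields, then read them back in order. */
  static int example_buffer_round_trip(void)
  {
          struct buffer *buffer;
          uint32_t flags;
          uint64_t nonce;
          int result = make_buffer(64, &buffer);

          if (result != UDS_SUCCESS)
                  return result;

          /* Puts advance the end pointer... */
          result = put_uint32_le_into_buffer(buffer, 7);
          if (result == UDS_SUCCESS)
                  result = put_uint64_le_into_buffer(buffer, 0xDEADBEEF);

          /* ...and gets advance the start pointer, in the same order. */
          if (result == UDS_SUCCESS)
                  result = get_uint32_le_from_buffer(buffer, &flags);
          if (result == UDS_SUCCESS)
                  result = get_uint64_le_from_buffer(buffer, &nonce);

          free_buffer(buffer);
          return result;
  }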
*/ size_t uncompacted_amount(struct buffer *buffer) { return buffer->start; } -/**********************************************************************/ +/* Return the amount of space available in the buffer. */ size_t available_space(struct buffer *buffer) { return buffer->length - buffer->end; } -/**********************************************************************/ +/* Return the amount of the buffer that is currently utilized. */ size_t buffer_used(struct buffer *buffer) { return buffer->end; } -/**********************************************************************/ +/* + * Ensure that a buffer has a given amount of space available, compacting the + * buffer if necessary. Returns true if the space is available. + */ bool ensure_available_space(struct buffer *buffer, size_t bytes) { if (available_space(buffer) >= bytes) { return true; } + compact_buffer(buffer); return (available_space(buffer) >= bytes); } -/**********************************************************************/ void clear_buffer(struct buffer *buffer) { buffer->start = 0; buffer->end = buffer->length; } -/**********************************************************************/ +/* + * Eliminate buffer contents which have been extracted. This function copies + * any data between the start and end pointers to the beginning of the buffer, + * moves the start pointer to the beginning, and the end pointer to the end of + * the copied data. + */ void compact_buffer(struct buffer *buffer) { size_t bytes_to_move; + if ((buffer->start == 0) || (buffer->end == 0)) { return; } + bytes_to_move = buffer->end - buffer->start; memmove(buffer->data, buffer->data + buffer->start, bytes_to_move); buffer->start = 0; buffer->end = bytes_to_move; } -/**********************************************************************/ +/* Reset the end of buffer to a different position. */ int reset_buffer_end(struct buffer *buffer, size_t end) { if (end > buffer->length) { return UDS_BUFFER_ERROR; } + buffer->end = end; if (buffer->start > buffer->end) { buffer->start = buffer->end; } + return UDS_SUCCESS; } -/**********************************************************************/ +/* Advance the start pointer by the specified number of bytes. */ int skip_forward(struct buffer *buffer, size_t bytes_to_skip) { if (content_length(buffer) < bytes_to_skip) { @@ -173,7 +196,7 @@ int skip_forward(struct buffer *buffer, size_t bytes_to_skip) return UDS_SUCCESS; } -/**********************************************************************/ +/* Rewind the start pointer by the specified number of bytes. */ int rewind_buffer(struct buffer *buffer, size_t bytes_to_rewind) { if (buffer->start < bytes_to_rewind) { @@ -184,14 +207,17 @@ int rewind_buffer(struct buffer *buffer, size_t bytes_to_rewind) return UDS_SUCCESS; } -/**********************************************************************/ +/* + * Check whether the start of the contents of a buffer matches a specified + * array of bytes. + */ bool has_same_bytes(struct buffer *buffer, const byte *data, size_t length) { return ((content_length(buffer) >= length) && (memcmp(buffer->data + buffer->start, data, length) == 0)); } -/**********************************************************************/ +/* Check whether two buffers have the same contents. 
*/ bool equal_buffers(struct buffer *buffer1, struct buffer *buffer2) { return has_same_bytes(buffer1, @@ -199,7 +225,6 @@ bool equal_buffers(struct buffer *buffer1, struct buffer *buffer2) content_length(buffer2)); } -/**********************************************************************/ int get_byte(struct buffer *buffer, byte *byte_ptr) { if (content_length(buffer) < sizeof(byte)) { @@ -210,7 +235,6 @@ int get_byte(struct buffer *buffer, byte *byte_ptr) return UDS_SUCCESS; } -/**********************************************************************/ int put_byte(struct buffer *buffer, byte b) { if (!ensure_available_space(buffer, sizeof(byte))) { @@ -221,7 +245,6 @@ int put_byte(struct buffer *buffer, byte b) return UDS_SUCCESS; } -/**********************************************************************/ int get_bytes_from_buffer(struct buffer *buffer, size_t length, void *destination) { @@ -234,19 +257,30 @@ int get_bytes_from_buffer(struct buffer *buffer, size_t length, return UDS_SUCCESS; } -/**********************************************************************/ +/* + * Get a pointer to the current contents of the buffer. This will be a pointer + * to the actual memory managed by the buffer. It is the caller's + * responsibility to ensure that the buffer is not modified while this pointer + * is in use. + */ byte *get_buffer_contents(struct buffer *buffer) { return buffer->data + buffer->start; } -/**********************************************************************/ +/* + * Copy bytes out of a buffer as per get_bytes_fom_buffer(). Memory will be + * allocated to hold the copy. + */ int copy_bytes(struct buffer *buffer, size_t length, byte **destination_ptr) { + int result; byte *destination; - int result = - UDS_ALLOCATE(length, byte, "copy_bytes() buffer", - &destination); + + result = UDS_ALLOCATE(length, + byte, + "copy_bytes() buffer", + &destination); if (result != UDS_SUCCESS) { return result; } @@ -257,24 +291,30 @@ int copy_bytes(struct buffer *buffer, size_t length, byte **destination_ptr) } else { *destination_ptr = destination; } + return result; } -/**********************************************************************/ int put_bytes(struct buffer *buffer, size_t length, const void *source) { if (!ensure_available_space(buffer, length)) { return UDS_BUFFER_ERROR; } + memcpy(buffer->data + buffer->end, source, length); buffer->end += length; return UDS_SUCCESS; } -/**********************************************************************/ +/* + * Copy the contents of a source buffer into the target buffer. This is + * equivalent to calling get_byte() on the source and put_byte() on the + * target repeatedly. + */ int put_buffer(struct buffer *target, struct buffer *source, size_t length) { int result; + if (content_length(source) < length) { return UDS_BUFFER_ERROR; } @@ -288,35 +328,36 @@ int put_buffer(struct buffer *target, struct buffer *source, size_t length) return UDS_SUCCESS; } -/**********************************************************************/ +/* Put the specified number of zero bytes in the buffer. 
*/ int zero_bytes(struct buffer *buffer, size_t length) { if (!ensure_available_space(buffer, length)) { return UDS_BUFFER_ERROR; } + memset(buffer->data + buffer->end, 0, length); buffer->end += length; return UDS_SUCCESS; } -/**********************************************************************/ int get_boolean(struct buffer *buffer, bool *b) { - byte by; - int result = get_byte(buffer, &by); + int result; + byte value; + + result = get_byte(buffer, &value); if (result == UDS_SUCCESS) { - *b = (by == 1); + *b = (value == 1); } + return result; } -/**********************************************************************/ int put_boolean(struct buffer *buffer, bool b) { return put_byte(buffer, (byte) (b ? 1 : 0)); } -/**********************************************************************/ int get_uint16_le_from_buffer(struct buffer *buffer, uint16_t *ui) { if (content_length(buffer) < sizeof(uint16_t)) { @@ -327,7 +368,6 @@ int get_uint16_le_from_buffer(struct buffer *buffer, uint16_t *ui) return UDS_SUCCESS; } -/**********************************************************************/ int put_uint16_le_into_buffer(struct buffer *buffer, uint16_t ui) { if (!ensure_available_space(buffer, sizeof(uint16_t))) { @@ -338,11 +378,11 @@ int put_uint16_le_into_buffer(struct buffer *buffer, uint16_t ui) return UDS_SUCCESS; } -/**********************************************************************/ int get_uint16_les_from_buffer(struct buffer *buffer, size_t count, uint16_t *ui) { unsigned int i; + if (content_length(buffer) < (sizeof(uint16_t) * count)) { return UDS_BUFFER_ERROR; } @@ -350,15 +390,16 @@ int get_uint16_les_from_buffer(struct buffer *buffer, size_t count, for (i = 0; i < count; i++) { decode_uint16_le(buffer->data, &buffer->start, ui + i); } + return UDS_SUCCESS; } -/**********************************************************************/ int put_uint16_les_into_buffer(struct buffer *buffer, - size_t count, - const uint16_t *ui) + size_t count, + const uint16_t *ui) { unsigned int i; + if (!ensure_available_space(buffer, sizeof(uint16_t) * count)) { return UDS_BUFFER_ERROR; } @@ -366,10 +407,10 @@ int put_uint16_les_into_buffer(struct buffer *buffer, for (i = 0; i < count; i++) { encode_uint16_le(buffer->data, &buffer->end, ui[i]); } + return UDS_SUCCESS; } -/**********************************************************************/ int get_int32_le_from_buffer(struct buffer *buffer, int32_t *i) { if (content_length(buffer) < sizeof(int32_t)) { @@ -380,7 +421,6 @@ int get_int32_le_from_buffer(struct buffer *buffer, int32_t *i) return UDS_SUCCESS; } -/**********************************************************************/ int get_uint32_le_from_buffer(struct buffer *buffer, uint32_t *ui) { if (content_length(buffer) < sizeof(uint32_t)) { @@ -391,7 +431,6 @@ int get_uint32_le_from_buffer(struct buffer *buffer, uint32_t *ui) return UDS_SUCCESS; } -/**********************************************************************/ int put_uint32_le_into_buffer(struct buffer *buffer, uint32_t ui) { if (!ensure_available_space(buffer, sizeof(uint32_t))) { @@ -402,7 +441,6 @@ int put_uint32_le_into_buffer(struct buffer *buffer, uint32_t ui) return UDS_SUCCESS; } -/**********************************************************************/ int put_int64_le_into_buffer(struct buffer *buffer, int64_t i) { if (!ensure_available_space(buffer, sizeof(int64_t))) { @@ -413,7 +451,6 @@ int put_int64_le_into_buffer(struct buffer *buffer, int64_t i) return UDS_SUCCESS; } 
-/**********************************************************************/ int get_uint64_le_from_buffer(struct buffer *buffer, uint64_t *ui) { if (content_length(buffer) < sizeof(uint64_t)) { @@ -424,7 +461,6 @@ int get_uint64_le_from_buffer(struct buffer *buffer, uint64_t *ui) return UDS_SUCCESS; } -/**********************************************************************/ int put_uint64_le_into_buffer(struct buffer *buffer, uint64_t ui) { if (!ensure_available_space(buffer, sizeof(uint64_t))) { @@ -435,11 +471,11 @@ int put_uint64_le_into_buffer(struct buffer *buffer, uint64_t ui) return UDS_SUCCESS; } -/**********************************************************************/ int get_uint64_les_from_buffer(struct buffer *buffer, size_t count, uint64_t *ui) { unsigned int i; + if (content_length(buffer) < (sizeof(uint64_t) * count)) { return UDS_BUFFER_ERROR; } @@ -447,15 +483,16 @@ int get_uint64_les_from_buffer(struct buffer *buffer, size_t count, for (i = 0; i < count; i++) { decode_uint64_le(buffer->data, &buffer->start, ui + i); } + return UDS_SUCCESS; } -/**********************************************************************/ int put_uint64_les_into_buffer(struct buffer *buffer, - size_t count, - const uint64_t *ui) + size_t count, + const uint64_t *ui) { unsigned int i; + if (!ensure_available_space(buffer, sizeof(uint64_t) * count)) { return UDS_BUFFER_ERROR; } @@ -463,5 +500,6 @@ int put_uint64_les_into_buffer(struct buffer *buffer, for (i = 0; i < count; i++) { encode_uint64_le(buffer->data, &buffer->end, ui[i]); } + return UDS_SUCCESS; } diff --git a/vdo/buffer.h b/vdo/buffer.h new file mode 100644 index 00000000..6b51a947 --- /dev/null +++ b/vdo/buffer.h @@ -0,0 +1,89 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright Red Hat + */ + +#ifndef BUFFER_H +#define BUFFER_H + +#include "common.h" + +struct buffer { + size_t start; + size_t end; + size_t length; + byte *data; + bool wrapped; +}; + +int __must_check wrap_buffer(byte *bytes, + size_t length, + size_t content_length, + struct buffer **buffer_ptr); + +int __must_check make_buffer(size_t length, struct buffer **buffer_ptr); +void free_buffer(struct buffer *buffer); + +bool __must_check ensure_available_space(struct buffer *buffer, size_t bytes); + +void clear_buffer(struct buffer *buffer); +void compact_buffer(struct buffer *buffer); +int __must_check skip_forward(struct buffer *buffer, size_t bytes_to_skip); +int __must_check rewind_buffer(struct buffer *buffer, size_t bytes_to_rewind); + +size_t buffer_length(struct buffer *buffer); +size_t content_length(struct buffer *buffer); +size_t available_space(struct buffer *buffer); +size_t uncompacted_amount(struct buffer *buffer); +size_t buffer_used(struct buffer *buffer); + +int __must_check reset_buffer_end(struct buffer *buffer, size_t end); + +bool __must_check +has_same_bytes(struct buffer *buffer, const byte *data, size_t length); +bool equal_buffers(struct buffer *buffer1, struct buffer *buffer2); + +int __must_check get_byte(struct buffer *buffer, byte *byte_ptr); +int __must_check put_byte(struct buffer *buffer, byte b); + +int __must_check +get_bytes_from_buffer(struct buffer *buffer, size_t length, void *destination); +byte *get_buffer_contents(struct buffer *buffer); +int __must_check +copy_bytes(struct buffer *buffer, size_t length, byte **destination_ptr); +int __must_check +put_bytes(struct buffer *buffer, size_t length, const void *source); +int __must_check +put_buffer(struct buffer *target, struct buffer *source, size_t length); + +int 
__must_check zero_bytes(struct buffer *buffer, size_t length); + +int __must_check get_boolean(struct buffer *buffer, bool *b); +int __must_check put_boolean(struct buffer *buffer, bool b); + +int __must_check get_uint16_le_from_buffer(struct buffer *buffer, uint16_t *ui); +int __must_check put_uint16_le_into_buffer(struct buffer *buffer, uint16_t ui); + +int __must_check +get_uint16_les_from_buffer(struct buffer *buffer, size_t count, uint16_t *ui); +int __must_check +put_uint16_les_into_buffer(struct buffer *buffer, + size_t count, + const uint16_t *ui); + +int __must_check get_int32_le_from_buffer(struct buffer *buffer, int32_t *i); +int __must_check get_uint32_le_from_buffer(struct buffer *buffer, uint32_t *ui); +int __must_check put_uint32_le_into_buffer(struct buffer *buffer, uint32_t ui); + +int __must_check get_uint64_le_from_buffer(struct buffer *buffer, uint64_t *ui); +int __must_check put_int64_le_into_buffer(struct buffer *buffer, int64_t i); +int __must_check put_uint64_le_into_buffer(struct buffer *buffer, uint64_t ui); + +int __must_check +get_uint64_les_from_buffer(struct buffer *buffer, size_t count, uint64_t *ui); +int __must_check +put_uint64_les_into_buffer(struct buffer *buffer, + size_t count, + const uint64_t *ui); + +#endif /* BUFFER_H */ diff --git a/vdo/bufferPool.c b/vdo/bufferPool.c deleted file mode 100644 index 3ff00f31..00000000 --- a/vdo/bufferPool.c +++ /dev/null @@ -1,270 +0,0 @@ -/* - * Copyright Red Hat - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/sulfur/src/c++/vdo/kernel/bufferPool.c#12 $ - */ - -#include "bufferPool.h" - -#include -#include - -#include "logger.h" -#include "memoryAlloc.h" -#include "permassert.h" - -#include "statusCodes.h" - -/* - * For list nodes on the free-object list, the data field describes - * the object available for reuse. - * - * For nodes on the "spare" list, the data field is meaningless; - * they're just nodes available for use when we need to add an object - * pointer to the free_object_list. - * - * These are both "free lists", in a sense; don't get confused! 
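The pool being removed below keeps two lists: free objects ready for reuse, and spare list nodes used to hand objects back. The typical caller life cycle, shown against the functions defined later in this file, looks roughly like the following sketch; 'example_work' is a placeholder for whatever the caller does with the pooled object. ::

  /* Illustrative only: allocate a pooled object, use it, and return it. */
  static int example_use_pooled_object(struct buffer_pool *pool)
  {
          void *object;
          int result = alloc_buffer_from_pool(pool, &object);

          if (result != VDO_SUCCESS)
                  return result; /* -ENOMEM when every object is busy */

          /* example_work(object); */

          /* Returning the object moves a spare node back to the free list. */
          free_buffer_to_pool(pool, object);
          return VDO_SUCCESS;
  }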
- */ -struct buffer_element { - struct list_head list; // links in current list - void *data; // element data, if on free list -}; - -struct buffer_pool { - const char *name; // Pool name - spinlock_t lock; // Locks this object - unsigned int size; // Total number of buffers - struct list_head free_object_list; // List of free buffers - struct list_head spare_list_nodes; // Unused list nodes - unsigned int num_busy; // Number of buffers in use - unsigned int max_busy; // Maximum value of the above - buffer_allocate_function *alloc; // Allocate function for buffer data - buffer_free_function *free; // Free function for buffer data - buffer_dump_function *dump; // Dump function for buffer data - struct buffer_element *bhead; // Array of buffer_element - void **objects; -}; - -/*************************************************************************/ -int make_buffer_pool(const char *pool_name, - unsigned int size, - buffer_allocate_function *allocate_function, - buffer_free_function *free_function, - buffer_dump_function *dump_function, - struct buffer_pool **pool_ptr) -{ - struct buffer_pool *pool; - struct buffer_element *bh; - int i; - - int result = UDS_ALLOCATE(1, struct buffer_pool, "buffer pool", &pool); - - if (result != VDO_SUCCESS) { - uds_log_error("buffer pool allocation failure %d", result); - return result; - } - - result = UDS_ALLOCATE(size, struct buffer_element, - "buffer pool elements", &pool->bhead); - if (result != VDO_SUCCESS) { - uds_log_error("buffer element array allocation failure %d", - result); - free_buffer_pool(pool); - return result; - } - - result = UDS_ALLOCATE(size, void *, "object pointers", &pool->objects); - if (result != VDO_SUCCESS) { - uds_log_error("buffer object array allocation failure %d", - result); - free_buffer_pool(pool); - return result; - } - - pool->name = pool_name; - pool->alloc = allocate_function; - pool->free = free_function; - pool->dump = dump_function; - pool->size = size; - spin_lock_init(&pool->lock); - INIT_LIST_HEAD(&pool->free_object_list); - INIT_LIST_HEAD(&pool->spare_list_nodes); - bh = pool->bhead; - - for (i = 0; i < pool->size; i++) { - result = pool->alloc(&bh->data); - if (result != VDO_SUCCESS) { - uds_log_error("verify buffer data allocation failure %d", - result); - free_buffer_pool(pool); - return result; - } - pool->objects[i] = bh->data; - list_add(&bh->list, &pool->free_object_list); - bh++; - } - pool->num_busy = pool->max_busy = 0; - - *pool_ptr = pool; - return VDO_SUCCESS; -} - -/*************************************************************************/ -void free_buffer_pool(struct buffer_pool *pool) -{ - if (pool == NULL) { - return; - } - - ASSERT_LOG_ONLY((pool->num_busy == 0), - "freeing busy buffer pool, num_busy=%d", - pool->num_busy); - if (pool->objects != NULL) { - int i; - - for (i = 0; i < pool->size; i++) { - if (pool->objects[i] != NULL) { - pool->free(UDS_FORGET(pool->objects[i])); - } - } - UDS_FREE(UDS_FORGET(pool->objects)); - } - - UDS_FREE(UDS_FORGET(pool->bhead)); - UDS_FREE(pool); -} - -/*************************************************************************/ -static bool in_free_list(struct buffer_pool *pool, void *data) -{ - struct list_head *node; - struct buffer_element *bh; - - list_for_each(node, &pool->free_object_list) { - bh = list_entry(node, struct buffer_element, list); - if (bh->data == data) { - return true; - } - } - return false; -} - -/*************************************************************************/ -void dump_buffer_pool(struct buffer_pool *pool, bool 
dump_elements) -{ - // In order that syslog can empty its buffer, sleep after 35 elements - // for 4ms (till the second clock tick). These numbers chosen in - // October 2012 running on an lfarm. - enum { ELEMENTS_PER_BATCH = 35 }; - enum { SLEEP_FOR_SYSLOG = 4 }; - - if (pool == NULL) { - return; - } - spin_lock(&pool->lock); - uds_log_info("%s: %u of %u busy (max %u)", - pool->name, pool->num_busy, pool->size, pool->max_busy); - if (dump_elements && (pool->dump != NULL)) { - int dumped = 0; - int i; - - for (i = 0; i < pool->size; i++) { - if (!in_free_list(pool, pool->objects[i])) { - pool->dump(pool->objects[i]); - if (++dumped >= ELEMENTS_PER_BATCH) { - spin_unlock(&pool->lock); - dumped = 0; - msleep(SLEEP_FOR_SYSLOG); - spin_lock(&pool->lock); - } - } - } - } - spin_unlock(&pool->lock); -} - -/*************************************************************************/ -int alloc_buffer_from_pool(struct buffer_pool *pool, void **data_ptr) -{ - struct buffer_element *bh; - if (pool == NULL) { - return UDS_INVALID_ARGUMENT; - } - - spin_lock(&pool->lock); - if (unlikely(list_empty(&pool->free_object_list))) { - spin_unlock(&pool->lock); - uds_log_debug("no free buffers"); - return -ENOMEM; - } - - bh = list_first_entry(&pool->free_object_list, - struct buffer_element, list); - list_move(&bh->list, &pool->spare_list_nodes); - pool->num_busy++; - if (pool->num_busy > pool->max_busy) { - pool->max_busy = pool->num_busy; - } - *data_ptr = bh->data; - spin_unlock(&pool->lock); - return VDO_SUCCESS; -} - -/*************************************************************************/ -static bool free_buffer_to_pool_internal(struct buffer_pool *pool, void *data) -{ - struct buffer_element *bh; - if (unlikely(list_empty(&pool->spare_list_nodes))) { - return false; - } - bh = list_first_entry(&pool->spare_list_nodes, - struct buffer_element, list); - list_move(&bh->list, &pool->free_object_list); - bh->data = data; - pool->num_busy--; - return true; -} - -/*************************************************************************/ -void free_buffer_to_pool(struct buffer_pool *pool, void *data) -{ - bool success; - spin_lock(&pool->lock); - success = free_buffer_to_pool_internal(pool, data); - - spin_unlock(&pool->lock); - if (!success) { - uds_log_debug("trying to add to free list when already full"); - } -} - -/*************************************************************************/ -void free_buffers_to_pool(struct buffer_pool *pool, void **data, int count) -{ - bool success = true; - int i; - spin_lock(&pool->lock); - - for (i = 0; (i < count) && success; i++) { - success = free_buffer_to_pool_internal(pool, data[i]); - } - spin_unlock(&pool->lock); - if (!success) { - uds_log_debug("trying to add to free list when already full"); - } -} diff --git a/vdo/bufferPool.h b/vdo/bufferPool.h deleted file mode 100644 index 47d4169d..00000000 --- a/vdo/bufferPool.h +++ /dev/null @@ -1,173 +0,0 @@ -/* - * Copyright Red Hat - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. 
- * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/sulfur/src/c++/vdo/kernel/bufferPool.h#7 $ - */ -#ifndef BUFFERPOOL_H -#define BUFFERPOOL_H - -/* - * We need bug.h because in 3.10, kernel.h (indirectly) defines - * ARRAY_SIZE as a macro which (indirectly and conditionally) uses - * BUILD_BUG_ON_ZERO, which is defined in bug.h, which is *not* - * included. In earlier versions like 3.2 it Just Worked. - */ -#include -#include -#include - -struct buffer_pool; - -typedef int buffer_allocate_function(void **data_ptr); -typedef void buffer_free_function(void *data); -typedef void buffer_dump_function(void *data); - -/** - * Creates a generic pool of buffer data. The elements in the pool are - * allocated up front and placed on a free list, which manages the - * reuse of the individual buffers in the pool. - * - * @param [in] pool_name Name of the pool - * @param [in] size The number of elements to create for this - * pool - * @param [in] allocate_function The function to call to create the actual - * data for each element - * @param [in] free_function The function to call to free the actual - * data for each element - * @param [in] dump_function The function to call to dump the actual - * data for each element into the log - * @param [out] pool_ptr A pointer to hold the pool that was created - * - * @return a success or error code - */ -int __must_check make_buffer_pool(const char *pool_name, - unsigned int size, - buffer_allocate_function *allocate_function, - buffer_free_function *free_function, - buffer_dump_function *dump_function, - struct buffer_pool **pool_ptr); - -/** - * Free a buffer pool. This will free all the elements of the pool as well. - * - * @param [in] pool The pool to free - **/ -void free_buffer_pool(struct buffer_pool *pool); - -/** - * Dump a buffer pool to the log. - * - * @param [in] pool The buffer pool to allocate from - * @param [in] dump_elements True for complete output, or false for a - * one-line summary - **/ -void dump_buffer_pool(struct buffer_pool *pool, bool dump_elements); - -/** - * Acquires a free buffer from the free list of the pool and - * returns it's associated data. - * - * @param [in] pool The buffer pool to allocate from - * @param [out] data_ptr A pointer to hold the buffer data - * - * @return a success or error code - */ -int __must_check -alloc_buffer_from_pool(struct buffer_pool *pool, void **data_ptr); - -/** - * Returns a buffer to the free list of a pool - * - * @param [in] pool The buffer pool to return the buffer to - * @param [in] data The buffer data to return - */ -void free_buffer_to_pool(struct buffer_pool *pool, void *data); - -/** - * Returns a set of buffers to the free list of a pool - * - * @param [in] pool The buffer pool to return the buffer to - * @param [in] data The buffer data to return - * @param [in] count Number of entries in the data array - */ -void free_buffers_to_pool(struct buffer_pool *pool, void **data, int count); - -/** - * Control structure for freeing (releasing back to the pool) pointers - * in batches. - * - * Since the objects stored in a buffer pool are completely opaque, - * some external data structure is needed to manage a collection of - * them. This is a simple helper for doing that, since we're freeing - * batches of objects in a couple different places. 
Within the pool - * itself there's a pair of linked lists, but getting at them requires - * the locking that we're trying to minimize. - * - * We collect pointers until the array is full or until there are no - * more available, and we call free_buffers_to_pool to release a batch - * all at once. - **/ -struct free_buffer_pointers { - struct buffer_pool *pool; - int index; - void *pointers[30]; // size is arbitrary -}; - -/** - * Initialize the control structure for batching buffer pointers to be - * released to their pool. - * - * @param [out] fbp The (caller-allocated) control structure - * @param [in] pool The buffer pool to return objects to. - **/ -static inline void init_free_buffer_pointers(struct free_buffer_pointers *fbp, - struct buffer_pool *pool) -{ - fbp->index = 0; - fbp->pool = pool; -} - -/** - * Release any buffers left in the collection. - * - * @param [in] fbp The control structure - **/ -static inline void free_buffer_pointers(struct free_buffer_pointers *fbp) -{ - free_buffers_to_pool(fbp->pool, fbp->pointers, fbp->index); - fbp->index = 0; -} - -/** - * Add another buffer pointer to the collection, and if we're full, - * release the whole batch to the pool. - * - * @param [in] fbp The control structure - * @param [in] pointer The buffer pointer to release - **/ -static inline void add_free_buffer_pointer(struct free_buffer_pointers *fbp, - void *pointer) -{ - fbp->pointers[fbp->index] = pointer; - fbp->index++; - if (fbp->index == ARRAY_SIZE(fbp->pointers)) { - free_buffer_pointers(fbp); - } -} - -#endif /* BUFFERPOOL_H */ diff --git a/vdo/buffered-reader.c b/vdo/buffered-reader.c new file mode 100644 index 00000000..fb81d7f9 --- /dev/null +++ b/vdo/buffered-reader.c @@ -0,0 +1,244 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright Red Hat + */ + +#include "buffered-reader.h" + +#include "compiler.h" +#include "io-factory.h" +#include "logger.h" +#include "memory-alloc.h" +#include "numeric.h" + + +/* + * The buffered reader allows efficient I/O for IO regions. The internal + * buffer always reads aligned data from the underlying region. + */ +struct buffered_reader { + /* IO factory owning the block device */ + struct io_factory *factory; + /* The dm_bufio_client to read from */ + struct dm_bufio_client *client; + /* The current dm_buffer */ + struct dm_buffer *buffer; + /* The number of blocks that can be read from */ + sector_t limit; + /* Number of the current block */ + sector_t block_number; + /* Start of the buffer */ + byte *start; + /* End of the data read from the buffer */ + byte *end; +}; + +static void read_ahead(struct buffered_reader *reader, sector_t block_number) +{ + if (block_number < reader->limit) { + enum { MAX_READ_AHEAD = 4 }; + sector_t read_ahead = min((sector_t) MAX_READ_AHEAD, + reader->limit - block_number); + + dm_bufio_prefetch(reader->client, block_number, read_ahead); + } +} + +/* + * Make a new buffered reader. 
+ * + * @param factory The IO factory creating the buffered reader + * @param client The dm_bufio_client to read from + * @param block_limit The number of blocks that may be read + * @param reader_ptr The pointer to hold the newly allocated buffered reader + * + * @return UDS_SUCCESS or error code + */ +int make_buffered_reader(struct io_factory *factory, + struct dm_bufio_client *client, + sector_t block_limit, + struct buffered_reader **reader_ptr) +{ + int result; + struct buffered_reader *reader = NULL; + + result = UDS_ALLOCATE(1, + struct buffered_reader, + "buffered reader", + &reader); + if (result != UDS_SUCCESS) { + return result; + } + + *reader = (struct buffered_reader) { + .factory = factory, + .client = client, + .buffer = NULL, + .limit = block_limit, + .block_number = 0, + .start = NULL, + .end = NULL, + }; + + read_ahead(reader, 0); + get_uds_io_factory(factory); + *reader_ptr = reader; + return UDS_SUCCESS; +} + +void free_buffered_reader(struct buffered_reader *reader) +{ + if (reader == NULL) { + return; + } + + if (reader->buffer != NULL) { + dm_bufio_release(reader->buffer); + } + + dm_bufio_client_destroy(reader->client); + put_uds_io_factory(reader->factory); + UDS_FREE(reader); +} + +static int position_reader(struct buffered_reader *reader, + sector_t block_number, + off_t offset) +{ + if ((reader->end == NULL) || (block_number != reader->block_number)) { + struct dm_buffer *buffer = NULL; + void *data; + + if (block_number >= reader->limit) { + return UDS_OUT_OF_RANGE; + } + + if (reader->buffer != NULL) { + dm_bufio_release(reader->buffer); + reader->buffer = NULL; + } + + data = dm_bufio_read(reader->client, block_number, &buffer); + if (IS_ERR(data)) { + return -PTR_ERR(data); + } + + reader->buffer = buffer; + reader->start = data; + if (block_number == reader->block_number + 1) { + read_ahead(reader, block_number + 1); + } + } + + reader->block_number = block_number; + reader->end = reader->start + offset; + return UDS_SUCCESS; +} + +static size_t bytes_remaining_in_read_buffer(struct buffered_reader *reader) +{ + return (reader->end == NULL ? + 0 : + reader->start + UDS_BLOCK_SIZE - reader->end); +} + +static int reset_reader(struct buffered_reader *reader) +{ + sector_t block_number; + + if (bytes_remaining_in_read_buffer(reader) > 0) { + return UDS_SUCCESS; + } + + block_number = reader->block_number; + if (reader->end != NULL) { + ++block_number; + } + + return position_reader(reader, block_number, 0); +} + +/* + * Retrieve data from a buffered reader, reading from the region when needed. + * + * @param reader The buffered reader + * @param data The buffer to read data into + * @param length The length of the data to read + * + * @return UDS_SUCCESS or an error code + */ +int read_from_buffered_reader(struct buffered_reader *reader, + void *data, + size_t length) +{ + byte *dp = data; + int result = UDS_SUCCESS; + size_t chunk; + + while (length > 0) { + result = reset_reader(reader); + if (result != UDS_SUCCESS) { + break; + } + + chunk = min(length, bytes_remaining_in_read_buffer(reader)); + memcpy(dp, reader->end, chunk); + length -= chunk; + dp += chunk; + reader->end += chunk; + } + + if (((result == UDS_OUT_OF_RANGE) || (result == UDS_END_OF_FILE)) && + (dp - (byte *) data > 0)) { + result = UDS_SHORT_READ; + } + + return result; +} + +/* + * Verify that the data currently in the buffer matches the required value. 
+ * + * @param reader The buffered reader + * @param value The value that must match the buffer contents + * @param length The length of the value that must match + * + * @return UDS_SUCCESS or UDS_CORRUPT_DATA if the value does not match + * + * @note If the value matches, the matching contents are consumed. However, + * if the match fails, any buffer contents are left as is. + */ +int verify_buffered_data(struct buffered_reader *reader, + const void *value, + size_t length) +{ + int result = UDS_SUCCESS; + size_t chunk; + const byte *vp = value; + sector_t start_block_number = reader->block_number; + int start_offset = reader->end - reader->start; + + while (length > 0) { + result = reset_reader(reader); + if (result != UDS_SUCCESS) { + result = UDS_CORRUPT_DATA; + break; + } + + chunk = min(length, bytes_remaining_in_read_buffer(reader)); + if (memcmp(vp, reader->end, chunk) != 0) { + result = UDS_CORRUPT_DATA; + break; + } + + length -= chunk; + vp += chunk; + reader->end += chunk; + } + + if (result != UDS_SUCCESS) { + position_reader(reader, start_block_number, start_offset); + } + + return result; +} diff --git a/vdo/buffered-reader.h b/vdo/buffered-reader.h new file mode 100644 index 00000000..8d2f26d3 --- /dev/null +++ b/vdo/buffered-reader.h @@ -0,0 +1,30 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright Red Hat + */ + +#ifndef BUFFERED_READER_H +#define BUFFERED_READER_H 1 + +#include "common.h" + +struct buffered_reader; +struct dm_bufio_client; +struct io_factory; + +int __must_check make_buffered_reader(struct io_factory *factory, + struct dm_bufio_client *client, + sector_t block_limit, + struct buffered_reader **reader_ptr); + +void free_buffered_reader(struct buffered_reader *reader); + +int __must_check read_from_buffered_reader(struct buffered_reader *reader, + void *data, + size_t length); + +int __must_check verify_buffered_data(struct buffered_reader *reader, + const void *value, + size_t length); + +#endif /* BUFFERED_READER_H */ diff --git a/vdo/buffered-writer.c b/vdo/buffered-writer.c new file mode 100644 index 00000000..1b59e5d3 --- /dev/null +++ b/vdo/buffered-writer.c @@ -0,0 +1,227 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright Red Hat + */ + +#include "buffered-writer.h" + +#include "compiler.h" +#include "errors.h" +#include "io-factory.h" +#include "logger.h" +#include "memory-alloc.h" +#include "numeric.h" + +struct buffered_writer { + /* IO factory owning the block device */ + struct io_factory *factory; + /* The dm_bufio_client to write to */ + struct dm_bufio_client *client; + /* The current dm_buffer */ + struct dm_buffer *buffer; + /* The number of blocks that can be written to */ + sector_t limit; + /* Number of the current block */ + sector_t block_number; + /* Start of the buffer */ + byte *start; + /* End of the data written to the buffer */ + byte *end; + /* Error code */ + int error; +}; + +static INLINE size_t space_used_in_buffer(struct buffered_writer *writer) +{ + return writer->end - writer->start; +} + +static +size_t space_remaining_in_write_buffer(struct buffered_writer *writer) +{ + return UDS_BLOCK_SIZE - space_used_in_buffer(writer); +} + +static int __must_check prepare_next_buffer(struct buffered_writer *writer) +{ + struct dm_buffer *buffer = NULL; + void *data; + + if (writer->block_number >= writer->limit) { + writer->error = UDS_OUT_OF_RANGE; + return UDS_OUT_OF_RANGE; + } + + data = dm_bufio_new(writer->client, writer->block_number, &buffer); + if (IS_ERR(data)) { + writer->error = 
-PTR_ERR(data); + return writer->error; + } + + writer->buffer = buffer; + writer->start = data; + writer->end = data; + return UDS_SUCCESS; +} + +static int flush_previous_buffer(struct buffered_writer *writer) +{ + size_t available; + + if (writer->buffer == NULL) { + return writer->error; + } + + if (writer->error == UDS_SUCCESS) { + available = space_remaining_in_write_buffer(writer); + + if (available > 0) { + memset(writer->end, 0, available); + } + + dm_bufio_mark_buffer_dirty(writer->buffer); + } + + dm_bufio_release(writer->buffer); + writer->buffer = NULL; + writer->start = NULL; + writer->end = NULL; + writer->block_number++; + return writer->error; +} + +/* + * Make a new buffered writer. + * + * @param factory The IO factory creating the buffered writer + * @param client The dm_bufio_client to write to + * @param block_limit The number of blocks that may be written to + * @param writer_ptr The new buffered writer goes here + * + * @return UDS_SUCCESS or an error code + */ +int make_buffered_writer(struct io_factory *factory, + struct dm_bufio_client *client, + sector_t block_limit, + struct buffered_writer **writer_ptr) +{ + int result; + struct buffered_writer *writer; + + result = UDS_ALLOCATE(1, + struct buffered_writer, + "buffered writer", + &writer); + if (result != UDS_SUCCESS) { + return result; + } + + *writer = (struct buffered_writer) { + .factory = factory, + .client = client, + .buffer = NULL, + .limit = block_limit, + .start = NULL, + .end = NULL, + .block_number = 0, + .error = UDS_SUCCESS, + }; + + get_uds_io_factory(factory); + *writer_ptr = writer; + return UDS_SUCCESS; +} + +void free_buffered_writer(struct buffered_writer *writer) +{ + int result; + + if (writer == NULL) { + return; + } + + flush_previous_buffer(writer); + result = -dm_bufio_write_dirty_buffers(writer->client); + if (result != UDS_SUCCESS) { + uds_log_warning_strerror(result, + "%s: failed to sync storage", + __func__); + } + + dm_bufio_client_destroy(writer->client); + put_uds_io_factory(writer->factory); + UDS_FREE(writer); +} + +/* + * Append data to the buffer, writing as needed. If a write error occurs, it + * is recorded and returned on every subsequent write attempt. 
+ */ +int write_to_buffered_writer(struct buffered_writer *writer, + const void *data, + size_t len) +{ + const byte *dp = data; + int result = UDS_SUCCESS; + size_t chunk; + + if (writer->error != UDS_SUCCESS) { + return writer->error; + } + + while ((len > 0) && (result == UDS_SUCCESS)) { + if (writer->buffer == NULL) { + result = prepare_next_buffer(writer); + continue; + } + + chunk = min(len, space_remaining_in_write_buffer(writer)); + memcpy(writer->end, dp, chunk); + len -= chunk; + dp += chunk; + writer->end += chunk; + + if (space_remaining_in_write_buffer(writer) == 0) { + result = flush_buffered_writer(writer); + } + } + + return result; +} + +int write_zeros_to_buffered_writer(struct buffered_writer *writer, size_t len) +{ + int result = UDS_SUCCESS; + size_t chunk; + + if (writer->error != UDS_SUCCESS) { + return writer->error; + } + + while ((len > 0) && (result == UDS_SUCCESS)) { + if (writer->buffer == NULL) { + result = prepare_next_buffer(writer); + continue; + } + + chunk = min(len, space_remaining_in_write_buffer(writer)); + memset(writer->end, 0, chunk); + len -= chunk; + writer->end += chunk; + + if (space_remaining_in_write_buffer(writer) == 0) { + result = flush_buffered_writer(writer); + } + } + + return result; +} + +int flush_buffered_writer(struct buffered_writer *writer) +{ + if (writer->error != UDS_SUCCESS) { + return writer->error; + } + + return flush_previous_buffer(writer); +} diff --git a/vdo/buffered-writer.h b/vdo/buffered-writer.h new file mode 100644 index 00000000..96131458 --- /dev/null +++ b/vdo/buffered-writer.h @@ -0,0 +1,31 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright Red Hat + */ + +#ifndef BUFFERED_WRITER_H +#define BUFFERED_WRITER_H 1 + +#include "common.h" + +struct buffered_writer; +struct dm_bufio_client; +struct io_factory; + +int __must_check make_buffered_writer(struct io_factory *factory, + struct dm_bufio_client *client, + sector_t block_limit, + struct buffered_writer **writer_ptr); + +void free_buffered_writer(struct buffered_writer *buffer); + +int __must_check write_to_buffered_writer(struct buffered_writer *writer, + const void *data, + size_t len); + +int __must_check write_zeros_to_buffered_writer(struct buffered_writer *writer, + size_t len); + +int __must_check flush_buffered_writer(struct buffered_writer *writer); + +#endif /* BUFFERED_WRITER_H */ diff --git a/vdo/chapter-index.c b/vdo/chapter-index.c new file mode 100644 index 00000000..04b3a72e --- /dev/null +++ b/vdo/chapter-index.c @@ -0,0 +1,348 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright Red Hat + */ + +#include "chapter-index.h" + +#include "compiler.h" +#include "errors.h" +#include "hash-utils.h" +#include "logger.h" +#include "memory-alloc.h" +#include "permassert.h" +#include "uds.h" + +int make_open_chapter_index(struct open_chapter_index **chapter_index, + const struct geometry *geometry, + uint64_t volume_nonce) +{ + int result; + size_t memory_size; + struct delta_index_stats stats; + struct open_chapter_index *index; + + result = UDS_ALLOCATE(1, + struct open_chapter_index, + "open chapter index", + &index); + if (result != UDS_SUCCESS) { + return result; + } + + /* + * The delta index will rebalance delta lists when memory gets tight, + * so give the chapter index one extra page. 
+ */ + memory_size = ((geometry->index_pages_per_chapter + 1) * + geometry->bytes_per_page); + index->geometry = geometry; + index->volume_nonce = volume_nonce; + result = initialize_delta_index(&index->delta_index, + 1, + geometry->delta_lists_per_chapter, + geometry->chapter_mean_delta, + geometry->chapter_payload_bits, + memory_size); + if (result != UDS_SUCCESS) { + UDS_FREE(index); + return result; + } + + get_delta_index_stats(&index->delta_index, &stats); + index->memory_allocated = + stats.memory_allocated + sizeof(struct open_chapter_index); + *chapter_index = index; + return UDS_SUCCESS; +} + +void free_open_chapter_index(struct open_chapter_index *chapter_index) +{ + if (chapter_index == NULL) { + return; + } + + uninitialize_delta_index(&chapter_index->delta_index); + UDS_FREE(chapter_index); +} + +/* Re-initialize an open chapter index for a new chapter. */ +void empty_open_chapter_index(struct open_chapter_index *chapter_index, + uint64_t virtual_chapter_number) +{ + empty_delta_index(&chapter_index->delta_index); + chapter_index->virtual_chapter_number = virtual_chapter_number; +} + +static INLINE bool was_entry_found(const struct delta_index_entry *entry, + unsigned int address) +{ + return (!entry->at_end && (entry->key == address)); +} + +/* Associate a chunk name with the record page containing its metadata. */ +int put_open_chapter_index_record(struct open_chapter_index *chapter_index, + const struct uds_chunk_name *name, + unsigned int page_number) +{ + int result; + struct delta_index_entry entry; + unsigned int address; + unsigned int list_number; + const byte *found_name; + bool found; + const struct geometry *geometry = chapter_index->geometry; + unsigned int chapter_number = chapter_index->virtual_chapter_number; + unsigned int record_pages = geometry->record_pages_per_chapter; + + result = ASSERT_WITH_ERROR_CODE(page_number < record_pages, + UDS_INVALID_ARGUMENT, + "Page number within chapter (%u) exceeds the maximum value %u", + page_number, + record_pages); + if (result != UDS_SUCCESS) { + return result; + } + + address = hash_to_chapter_delta_address(name, geometry); + list_number = hash_to_chapter_delta_list(name, geometry); + result = get_delta_index_entry(&chapter_index->delta_index, + list_number, + address, + name->name, + &entry); + if (result != UDS_SUCCESS) { + return result; + } + + found = was_entry_found(&entry, address); + result = ASSERT_WITH_ERROR_CODE(!(found && entry.is_collision), + UDS_BAD_STATE, + "Chunk appears more than once in chapter %llu", + (unsigned long long) chapter_number); + if (result != UDS_SUCCESS) { + return result; + } + + found_name = (found ? name->name : NULL); + return put_delta_index_entry(&entry, address, page_number, found_name); +} + +/* + * Pack a section of an open chapter index into a chapter index page. A + * range of delta lists (starting with a specified list index) is copied + * from the open chapter index into a memory page. The number of lists + * copied onto the page is returned to the caller on success. 
+ * + * @param chapter_index The open chapter index + * @param memory The memory page to use + * @param first_list The first delta list number to be copied + * @param last_page If true, this is the last page of the chapter index + * and all the remaining lists must be packed onto this + * page + * @param num_lists The number of delta lists that were copied + **/ +int pack_open_chapter_index_page(struct open_chapter_index *chapter_index, + byte *memory, + unsigned int first_list, + bool last_page, + unsigned int *num_lists) +{ + int result; + struct delta_index *delta_index = &chapter_index->delta_index; + struct delta_index_stats stats; + uint64_t nonce = chapter_index->volume_nonce; + uint64_t chapter_number = chapter_index->virtual_chapter_number; + const struct geometry *geometry = chapter_index->geometry; + unsigned int list_count = geometry->delta_lists_per_chapter; + unsigned int removals = 0; + struct delta_index_entry entry; + unsigned int next_list; + int list_number; + + for (;;) { + result = pack_delta_index_page(delta_index, + nonce, + memory, + geometry->bytes_per_page, + chapter_number, + first_list, + num_lists); + if (result != UDS_SUCCESS) { + return result; + } + if ((first_list + *num_lists) == list_count) { + /* All lists are packed. */ + break; + } else if (*num_lists == 0) { + /* + * The next delta list does not fit on a page. This + * delta list will be removed. + */ + } else if (last_page) { + /* + * This is the last page and there are lists left + * unpacked, but all of the remaining lists must fit on + * the page. Find a list that contains entries and + * remove the entire list. Try the first list that does + * not fit. If it is empty, we will select the last list + * that already fits and has any entries. + */ + } else { + /* This page is done. */ + break; + } + + if (removals == 0) { + get_delta_index_stats(delta_index, &stats); + uds_log_warning("The chapter index for chapter %llu contains %ld entries with %ld collisions", + (unsigned long long) chapter_number, + stats.record_count, + stats.collision_count); + } + + list_number = *num_lists; + do { + if (list_number < 0) { + return UDS_OVERFLOW; + } + + next_list = first_list + list_number--, + result = start_delta_index_search(delta_index, + next_list, + 0, + &entry); + if (result != UDS_SUCCESS) { + return result; + } + + result = next_delta_index_entry(&entry); + if (result != UDS_SUCCESS) { + return result; + } + } while (entry.at_end); + + do { + result = remove_delta_index_entry(&entry); + if (result != UDS_SUCCESS) { + return result; + } + removals++; + } while (!entry.at_end); + } + + if (removals > 0) { + uds_log_warning("To avoid chapter index page overflow in chapter %llu, %u entries were removed from the chapter index", + (unsigned long long) chapter_number, + removals); + } + + return UDS_SUCCESS; +} + +/* + * Make a new chapter index page, initializing it with the data from the + * given index_page buffer. + */ +int initialize_chapter_index_page(struct delta_index_page *index_page, + const struct geometry *geometry, + byte *page_buffer, + uint64_t volume_nonce) +{ + return initialize_delta_index_page(index_page, + volume_nonce, + geometry->chapter_mean_delta, + geometry->chapter_payload_bits, + page_buffer, + geometry->bytes_per_page); +} + +/* Validate a chapter index page read during rebuild. 
*/ +int validate_chapter_index_page(const struct delta_index_page *index_page, + const struct geometry *geometry) +{ + int result; + const struct delta_index *delta_index = &index_page->delta_index; + unsigned int first = index_page->lowest_list_number; + unsigned int last = index_page->highest_list_number; + unsigned int list_number; + + /* We walk every delta list from start to finish. */ + for (list_number = first; list_number <= last; list_number++) { + struct delta_index_entry entry; + result = start_delta_index_search(delta_index, + list_number - first, + 0, + &entry); + if (result != UDS_SUCCESS) { + return result; + } + + for (;;) { + result = next_delta_index_entry(&entry); + if (result != UDS_SUCCESS) { + /* + * A random bit stream is highly likely + * to arrive here when we go past the + * end of the delta list. + */ + return result; + } + + if (entry.at_end) { + break; + } + + /* + * Also make sure that the record page field contains a + * plausible value. + */ + if (get_delta_entry_value(&entry) >= + geometry->record_pages_per_chapter) { + /* + * Do not log this as an error. It happens in + * normal operation when we are doing a rebuild + * but haven't written the entire volume once. + */ + return UDS_CORRUPT_DATA; + } + } + } + return UDS_SUCCESS; +} + +/* + * Search a chapter index page for a chunk name, returning the record page + * number that may contain the name. + */ +int search_chapter_index_page(struct delta_index_page *index_page, + const struct geometry *geometry, + const struct uds_chunk_name *name, + int *record_page_ptr) +{ + int result; + struct delta_index *delta_index = &index_page->delta_index; + unsigned int address = hash_to_chapter_delta_address(name, geometry); + unsigned int delta_list_number = + hash_to_chapter_delta_list(name, geometry); + unsigned int sub_list_number = + delta_list_number - index_page->lowest_list_number; + struct delta_index_entry entry; + + result = get_delta_index_entry(delta_index, + sub_list_number, + address, + name->name, + &entry); + if (result != UDS_SUCCESS) { + return result; + } + + if (was_entry_found(&entry, address)) { + *record_page_ptr = get_delta_entry_value(&entry); + } else { + *record_page_ptr = NO_CHAPTER_INDEX_ENTRY; + } + + return UDS_SUCCESS; +} diff --git a/vdo/chapter-index.h b/vdo/chapter-index.h new file mode 100644 index 00000000..0d902011 --- /dev/null +++ b/vdo/chapter-index.h @@ -0,0 +1,66 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright Red Hat + */ + +#ifndef CHAPTER_INDEX_H +#define CHAPTER_INDEX_H 1 + +#include "delta-index.h" +#include "geometry.h" + +enum { + /* + * The value returned as the record page number when an entry is not + * found in the chapter index. 
+ */
+	NO_CHAPTER_INDEX_ENTRY = -1
+};
+
+struct open_chapter_index {
+	const struct geometry *geometry;
+	struct delta_index delta_index;
+	uint64_t virtual_chapter_number;
+	uint64_t volume_nonce;
+	size_t memory_allocated;
+};
+
+int __must_check
+make_open_chapter_index(struct open_chapter_index **chapter_index,
+			const struct geometry *geometry,
+			uint64_t volume_nonce);
+
+void free_open_chapter_index(struct open_chapter_index *chapter_index);
+
+void empty_open_chapter_index(struct open_chapter_index *chapter_index,
+			      uint64_t virtual_chapter_number);
+
+int __must_check
+put_open_chapter_index_record(struct open_chapter_index *chapter_index,
+			      const struct uds_chunk_name *name,
+			      unsigned int page_number);
+
+int __must_check
+pack_open_chapter_index_page(struct open_chapter_index *chapter_index,
+			     byte *memory,
+			     unsigned int first_list,
+			     bool last_page,
+			     unsigned int *num_lists);
+
+int __must_check
+initialize_chapter_index_page(struct delta_index_page *index_page,
+			      const struct geometry *geometry,
+			      byte *page_buffer,
+			      uint64_t volume_nonce);
+
+int __must_check
+validate_chapter_index_page(const struct delta_index_page *index_page,
+			    const struct geometry *geometry);
+
+int __must_check
+search_chapter_index_page(struct delta_index_page *index_page,
+			  const struct geometry *geometry,
+			  const struct uds_chunk_name *name,
+			  int *record_page_ptr);
+
+#endif /* CHAPTER_INDEX_H */
diff --git a/vdo/checksum.h b/vdo/checksum.h
deleted file mode 100644
index 3468ac67..00000000
--- a/vdo/checksum.h
+++ /dev/null
@@ -1,59 +0,0 @@
-/*
- * Copyright Red Hat
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version 2
- * of the License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
- * 02110-1301, USA.
- *
- * $Id: //eng/vdo-releases/sulfur/src/c++/vdo/base/checksum.h#8 $
- */
-
-#ifndef CHECKSUM_H
-#define CHECKSUM_H
-
-#include 
-
-/**
- * A CRC-32 checksum
- **/
-typedef uint32_t crc32_checksum_t;
-
-static const crc32_checksum_t VDO_INITIAL_CHECKSUM = 0xffffffff;
-
-enum {
-	/* The size of a CRC-32 checksum */
-	VDO_CHECKSUM_SIZE = sizeof(crc32_checksum_t),
-};
-
-/**
- * A function to update a running CRC-32 checksum.
- *
- * @param crc The current value of the crc
- * @param buffer The data to add to the checksum
- * @param length The length of the data
- *
- * @return The updated value of the checksum
- **/
-static inline crc32_checksum_t vdo_update_crc32(crc32_checksum_t crc,
-						const byte *buffer,
-						size_t length)
-{
-	/*
-	 * The kernel's CRC 32 implementation does not do pre- and post-
-	 * conditioning, so do it ourselves.
- */ - return crc32(crc ^ 0xffffffff, buffer, length) ^ 0xffffffff; -} - -#endif // CHECKSUM_H diff --git a/vdo/common.h b/vdo/common.h new file mode 100644 index 00000000..1e6aceea --- /dev/null +++ b/vdo/common.h @@ -0,0 +1,26 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright Red Hat + */ + +#ifndef COMMON_H +#define COMMON_H + +#include "string-utils.h" +#include "type-defs.h" +#include "uds.h" + +enum { + KILOBYTE = 1024, + MEGABYTE = KILOBYTE * KILOBYTE, + GIGABYTE = KILOBYTE * MEGABYTE +}; + +struct uds_chunk_data; + +struct uds_chunk_record { + struct uds_chunk_name name; + struct uds_chunk_data data; +}; + +#endif /* COMMON_H */ diff --git a/vdo/compiler.h b/vdo/compiler.h new file mode 100644 index 00000000..6b299b80 --- /dev/null +++ b/vdo/compiler.h @@ -0,0 +1,31 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright Red Hat + */ + +#ifndef COMMON_COMPILER_H +#define COMMON_COMPILER_H + +#include +#include + + +#define const_container_of(ptr, type, member) \ + __extension__({ \ + const __typeof__(((type *) 0)->member) *__mptr = (ptr); \ + (const type *) ((const char *) __mptr - \ + offsetof(type, member)); \ + }) + +/* + * The "inline" keyword alone takes effect only when the optimization level + * is high enough. Define INLINE to force the gcc to "always inline". + */ +#define INLINE __attribute__((always_inline)) inline + + + +#define __STRING(x) #x + + +#endif /* COMMON_COMPILER_H */ diff --git a/vdo/completion.c b/vdo/completion.c index 29d9077c..48f31f0a 100644 --- a/vdo/completion.c +++ b/vdo/completion.c @@ -1,42 +1,37 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * Copyright Red Hat - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/sulfur/src/c++/vdo/base/completion.c#21 $ */ #include "completion.h" +#include + #include "logger.h" #include "permassert.h" -#include "statusCodes.h" + +#include "kernel-types.h" +#include "status-codes.h" +#include "thread-config.h" +#include "vio.h" #include "vdo.h" static const char *VDO_COMPLETION_TYPE_NAMES[] = { - // Keep VDO_UNSET_COMPLETION_TYPE at the top. + /* Keep VDO_UNSET_COMPLETION_TYPE at the top. */ "VDO_UNSET_COMPLETION_TYPE", - // Keep this block in sorted order. If you add or remove an - // entry, be sure to update the corresponding list in completion.h. + /* + * Keep this block in sorted order. If you add or remove an + * entry, be sure to update the corresponding list in completion.h. 
+ */ "VDO_ACTION_COMPLETION", "VDO_ADMIN_COMPLETION", "VDO_BLOCK_ALLOCATOR_COMPLETION", "VDO_BLOCK_MAP_RECOVERY_COMPLETION", + "VDO_DATA_VIO_POOL_COMPLETION", + "VDO_DEDUPE_INDEX_COMPLETION", "VDO_EXTENT_COMPLETION", + "VDO_FLUSH_COMPLETION", "VDO_FLUSH_NOTIFICATION_COMPLETION", "VDO_GENERATION_FLUSHED_COMPLETION", "VDO_LOCK_COUNTER_COMPLETION", @@ -53,36 +48,51 @@ static const char *VDO_COMPLETION_TYPE_NAMES[] = { }; -/**********************************************************************/ -void initialize_vdo_completion(struct vdo_completion *completion, +/** + * vdo_initialize_completion() - Initialize a completion to a clean state, + * for reused completions. + * @completion: The completion to initialize. + * @vdo: The VDO instance. + * @type: The type of the completion. + */ +void vdo_initialize_completion(struct vdo_completion *completion, struct vdo *vdo, enum vdo_completion_type type) { memset(completion, 0, sizeof(*completion)); completion->vdo = vdo; completion->type = type; - reset_vdo_completion(completion); + vdo_reset_completion(completion); } -/**********************************************************************/ -void reset_vdo_completion(struct vdo_completion *completion) +/** + * vdo_reset_completion() - Reset a completion to a clean state, while + * keeping the type, vdo and parent information. + * @completion: The completion to reset. + */ +void vdo_reset_completion(struct vdo_completion *completion) { completion->result = VDO_SUCCESS; completion->complete = false; } /** - * Assert that a completion is not complete. - * - * @param completion The completion to check - **/ + * assert_incomplete() - Assert that a completion is not complete. + * @completion: The completion to check. + */ static inline void assert_incomplete(struct vdo_completion *completion) { ASSERT_LOG_ONLY(!completion->complete, "completion is not complete"); } -/**********************************************************************/ -void set_vdo_completion_result(struct vdo_completion *completion, int result) +/** + * vdo_set_completion_result() - Set the result of a completion. + * @completion: The completion whose result is to be set. + * @result: The result to set. + * + * Older errors will not be masked. + */ +void vdo_set_completion_result(struct vdo_completion *completion, int result) { assert_incomplete(completion); if (completion->result == VDO_SUCCESS) { @@ -91,85 +101,110 @@ void set_vdo_completion_result(struct vdo_completion *completion, int result) } /** - * Check whether a completion's callback must be enqueued, or if it can be run - * on the current thread. Side effect: clears the requeue flag if it is set, - * so the caller MUST requeue if this returns true. - * - * @param completion The completion whose callback is to be invoked + * vdo_invoke_completion_callback_with_priority() - Invoke the callback of + * a completion. + * @completion: The completion whose callback is to be invoked. + * @priority: The priority at which to enqueue the completion. * - * @return false if the callback must be run on this thread - * true if the callback must be enqueued - **/ -static inline bool __must_check -requires_enqueue(struct vdo_completion *completion) + * If called on the correct thread (i.e. the one specified in the + * completion's callback_thread_id field), the completion will be run + * immediately. Otherwise, the completion will be enqueued on the + * correct callback thread. 
+ */ +void +vdo_invoke_completion_callback_with_priority(struct vdo_completion *completion, + enum vdo_completion_priority priority) { thread_id_t callback_thread = completion->callback_thread_id; - if (completion->requeue) { - completion->requeue = false; - return true; - } - return (callback_thread != vdo_get_callback_thread_id()); -} - -/**********************************************************************/ -void invoke_vdo_completion_callback(struct vdo_completion *completion) -{ - if (requires_enqueue(completion)) { - enqueue_vdo_completion(completion); + if (completion->requeue || + (callback_thread != vdo_get_callback_thread_id())) { + vdo_enqueue_completion_with_priority(completion, priority); return; } - run_vdo_completion_callback(completion); + vdo_run_completion_callback(completion); } -/**********************************************************************/ -void continue_vdo_completion(struct vdo_completion *completion, int result) +/** + * vdo_continue_completion() - Continue processing a completion. + * @completion: The completion to continue. + * @result: The current result (will not mask older errors). + * + * Continue processing a completion by setting the current result and calling + * vdo_invoke_completion_callback(). + */ +void vdo_continue_completion(struct vdo_completion *completion, int result) { - set_vdo_completion_result(completion, result); - invoke_vdo_completion_callback(completion); + vdo_set_completion_result(completion, result); + vdo_invoke_completion_callback(completion); } -/**********************************************************************/ -void complete_vdo_completion(struct vdo_completion *completion) +/** + * vdo_complete_completion() - Complete a completion. + * + * @completion: The completion to complete. + */ +void vdo_complete_completion(struct vdo_completion *completion) { assert_incomplete(completion); completion->complete = true; if (completion->callback != NULL) { - invoke_vdo_completion_callback(completion); + vdo_invoke_completion_callback(completion); } } -/**********************************************************************/ -void finish_vdo_completion_parent_callback(struct vdo_completion *completion) +/** + * vdo_finish_completion_parent_callback() - A callback to finish the parent + * of a completion. + * @completion: The completion which has finished and whose parent should + * be finished. + */ +void vdo_finish_completion_parent_callback(struct vdo_completion *completion) { - finish_vdo_completion((struct vdo_completion *) completion->parent, + vdo_finish_completion((struct vdo_completion *) completion->parent, completion->result); } -/**********************************************************************/ +/** + * vdo_preserve_completion_error_and_continue() - Error handler. + * @completion: The completion which failed. + * + * Error handler which preserves an error in the parent (if there is + * one), and then resets the failing completion and calls its + * non-error callback. 
+ */ void -preserve_vdo_completion_error_and_continue(struct vdo_completion *completion) +vdo_preserve_completion_error_and_continue(struct vdo_completion *completion) { if (completion->parent != NULL) { - set_vdo_completion_result(completion->parent, completion->result); + vdo_set_completion_result(completion->parent, completion->result); } - reset_vdo_completion(completion); - invoke_vdo_completion_callback(completion); + vdo_reset_completion(completion); + vdo_invoke_completion_callback(completion); } -/**********************************************************************/ -const char * -get_vdo_completion_type_name(enum vdo_completion_type completion_type) +/** + * get_completion_type_name() - Return the name of a completion type. + * @completion_type: The completion type. + * + * Return: a pointer to a static string; if the completion_type is unknown + * this is to a static buffer that may be overwritten. + */ +static const char * +get_completion_type_name(enum vdo_completion_type completion_type) { - // Try to catch failures to update the array when the enum values - // change. - STATIC_ASSERT(COUNT_OF(VDO_COMPLETION_TYPE_NAMES) == + /* + * Try to catch failures to update the array when the enum values + * change. + */ + STATIC_ASSERT(ARRAY_SIZE(VDO_COMPLETION_TYPE_NAMES) == (VDO_MAX_COMPLETION_TYPE - VDO_UNSET_COMPLETION_TYPE)); if (completion_type >= VDO_MAX_COMPLETION_TYPE) { static char numeric[100]; + snprintf(numeric, 99, "%d (%#x)", @@ -181,12 +216,61 @@ get_vdo_completion_type_name(enum vdo_completion_type completion_type) return VDO_COMPLETION_TYPE_NAMES[completion_type]; } -/**********************************************************************/ -int assert_vdo_completion_type(enum vdo_completion_type actual, +/** + * vdo_noop_completion_callback() - A callback which does nothing. + * @completion: The completion being called back. + * + * This callback is intended to be set as an error handler in the + * case where an error should do nothing. + */ +void +vdo_noop_completion_callback(struct vdo_completion *completion __always_unused) +{ +} + +/** + * vdo_assert_completion_type() - Assert that a completion is of the correct + * type. + * @actual: The actual completion type. + * @expected: The expected completion type. + * + * Return: VDO_SUCCESS or VDO_PARAMETER_MISMATCH + */ +int vdo_assert_completion_type(enum vdo_completion_type actual, enum vdo_completion_type expected) { return ASSERT((expected == actual), "completion type is %s instead of %s", - get_vdo_completion_type_name(actual), - get_vdo_completion_type_name(expected)); + get_completion_type_name(actual), + get_completion_type_name(expected)); } + +/** + * vdo_enqueue_completion_with_priority() - Enqueue a completion. + * @completion: The completion to be enqueued. + * @priority: The priority at which the work should be done. + * + * A function to enqueue a vdo_completion to run on the thread + * specified by its callback_thread_id field at the specified + * priority. 
+ */ +void vdo_enqueue_completion_with_priority(struct vdo_completion *completion, + enum vdo_completion_priority priority) +{ + struct vdo *vdo = completion->vdo; + thread_id_t thread_id = completion->callback_thread_id; + + if (ASSERT(thread_id < vdo->thread_config->thread_count, + "thread_id %u (completion type %d) is less than thread count %u", + thread_id, + completion->type, + vdo->thread_config->thread_count) != UDS_SUCCESS) { + BUG(); + } + + completion->requeue = false; + completion->priority = priority; + completion->my_queue = NULL; + enqueue_work_queue(vdo->threads[thread_id].queue, completion); +} + diff --git a/vdo/completion.h b/vdo/completion.h index 8bbef767..0169c75f 100644 --- a/vdo/completion.h +++ b/vdo/completion.h @@ -1,22 +1,6 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ /* * Copyright Red Hat - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/sulfur/src/c++/vdo/base/completion.h#22 $ */ #ifndef COMPLETION_H @@ -24,22 +8,27 @@ #include "permassert.h" -#include "statusCodes.h" +#include "kernel-types.h" +#include "status-codes.h" #include "types.h" - #include "workQueue.h" enum vdo_completion_type { - // Keep VDO_UNSET_COMPLETION_TYPE at the top. - VDO_UNSET_COMPLETION_TYPE = 0, + /* Keep VDO_UNSET_COMPLETION_TYPE at the top. */ + VDO_UNSET_COMPLETION_TYPE, - // Keep this block in sorted order. If you add or remove an entry, be - // sure to update the corresponding list in completion.c. + /* + * Keep this block in sorted order. If you add or remove an entry, be + * sure to update the corresponding list in completion.c. + */ VDO_ACTION_COMPLETION, VDO_ADMIN_COMPLETION, VDO_BLOCK_ALLOCATOR_COMPLETION, VDO_BLOCK_MAP_RECOVERY_COMPLETION, + VDO_DATA_VIO_POOL_COMPLETION, + VDO_DEDUPE_INDEX_COMPLETION, VDO_EXTENT_COMPLETION, + VDO_FLUSH_COMPLETION, VDO_FLUSH_NOTIFICATION_COMPLETION, VDO_GENERATION_FLUSHED_COMPLETION, VDO_LOCK_COUNTER_COMPLETION, @@ -55,62 +44,65 @@ enum vdo_completion_type { VIO_COMPLETION, - // Keep VDO_MAX_COMPLETION_TYPE at the bottom. + /* Keep VDO_MAX_COMPLETION_TYPE at the bottom. */ VDO_MAX_COMPLETION_TYPE } __packed; /** - * An asynchronous VDO operation. - * - * @param completion the completion of the operation - **/ + * typedef vdo_action - An asynchronous VDO operation. + * @completion: The completion of the operation. + */ typedef void vdo_action(struct vdo_completion *completion); struct vdo_completion { - /** The type of completion this is */ + /* The type of completion this is */ enum vdo_completion_type type; - /** + /* * true once the processing of the operation is complete. * This flag should not be used by waiters external to the VDO base as * it is used to gate calling the callback. 
- **/ + */ bool complete; - /** + /* * If true, queue this completion on the next callback invocation, even *if it is already running on the correct thread. - **/ + */ bool requeue; - /** The ID of the thread which should run the next callback */ + /* The ID of the thread which should run the next callback */ thread_id_t callback_thread_id; - /** The result of the operation */ + /* The result of the operation */ int result; - /** The VDO on which this completion operates */ + /* The VDO on which this completion operates */ struct vdo *vdo; - /** The callback which will be called once the operation is complete */ + /* The callback which will be called once the operation is complete */ vdo_action *callback; - /** Callback which, if set, will be called if an error result is set */ + /* Callback which, if set, will be called if an error result is set */ vdo_action *error_handler; - /** The parent object, if any, that spawned this completion */ + /* The parent object, if any, that spawned this completion */ void *parent; - /** The work item for enqueuing this completion */ - struct vdo_work_item work_item; + /* Entry link for lock-free work queue */ + struct funnel_queue_entry work_queue_entry_link; + enum vdo_completion_priority priority; + struct vdo_work_queue *my_queue; + uint64_t enqueue_time; }; /** - * Actually run the callback. This function must be called from the correct - * callback thread. - **/ + * vdo_run_completion_callback() - Actually run the callback. + * + * Context: This function must be called from the correct callback thread. + */ static inline void -run_vdo_completion_callback(struct vdo_completion *completion) +vdo_run_completion_callback(struct vdo_completion *completion) { if ((completion->result != VDO_SUCCESS) && (completion->error_handler != NULL)) { @@ -121,129 +113,67 @@ run_vdo_completion_callback(struct vdo_completion *completion) completion->callback(completion); } -/** - * Set the result of a completion. Older errors will not be masked. - * - * @param completion The completion whose result is to be set - * @param result The result to set - **/ -void set_vdo_completion_result(struct vdo_completion *completion, int result); +void vdo_set_completion_result(struct vdo_completion *completion, int result); -/** - * Initialize a completion to a clean state, for reused completions. - * - * @param completion The completion to initialize - * @param vdo The VDO instance - * @param type The type of the completion - **/ -void initialize_vdo_completion(struct vdo_completion *completion, +void vdo_initialize_completion(struct vdo_completion *completion, struct vdo *vdo, enum vdo_completion_type type); -/** - * Reset a completion to a clean state, while keeping - * the type, vdo and parent information. - * - * @param completion the completion to reset - **/ -void reset_vdo_completion(struct vdo_completion *completion); +void vdo_reset_completion(struct vdo_completion *completion); -/** - * Invoke the callback of a completion. If called on the correct thread (i.e. - * the one specified in the completion's callback_thread_id field), the - * completion will be run immediately. Otherwise, the completion will be - * enqueued on the correct callback thread. - **/ -void invoke_vdo_completion_callback(struct vdo_completion *completion); +void +vdo_invoke_completion_callback_with_priority(struct vdo_completion *completion, + enum vdo_completion_priority priority); /** - * Continue processing a completion by setting the current result and calling - * invoke_vdo_completion_callback(). 
+ * vdo_invoke_completion_callback() - Invoke the callback of a completion. + * @completion: The completion whose callback is to be invoked. * - * @param completion The completion to continue - * @param result The current result (will not mask older errors) - **/ -void continue_vdo_completion(struct vdo_completion *completion, int result); + * If called on the correct thread (i.e. the one specified in the completion's + * callback_thread_id field), the completion will be run immediately. + * Otherwise, the completion will be enqueued on the correct callback thread. + */ +static inline void +vdo_invoke_completion_callback(struct vdo_completion *completion) +{ + vdo_invoke_completion_callback_with_priority(completion, + VDO_WORK_Q_DEFAULT_PRIORITY); +} -/** - * Complete a completion. - * - * @param completion The completion to complete - **/ -void complete_vdo_completion(struct vdo_completion *completion); +void vdo_continue_completion(struct vdo_completion *completion, int result); + +void vdo_complete_completion(struct vdo_completion *completion); /** - * Finish a completion. - * - * @param completion The completion to finish - * @param result The result of the completion (will not mask older errors) - **/ -static inline void finish_vdo_completion(struct vdo_completion *completion, + * vdo_finish_completion() - Finish a completion. + * @completion: The completion to finish. + * @result: The result of the completion (will not mask older errors). + */ +static inline void vdo_finish_completion(struct vdo_completion *completion, int result) { - set_vdo_completion_result(completion, result); - complete_vdo_completion(completion); + vdo_set_completion_result(completion, result); + vdo_complete_completion(completion); } -/** - * A callback to finish the parent of a completion. - * - * @param completion The completion which has finished and whose parent should - * be finished - **/ -void finish_vdo_completion_parent_callback(struct vdo_completion *completion); +void vdo_finish_completion_parent_callback(struct vdo_completion *completion); -/** - * Error handler which preserves an error in the parent (if there is one), - * and then resets the failing completion and calls its non-error callback. - * - * @param completion The completion which failed - **/ void -preserve_vdo_completion_error_and_continue(struct vdo_completion *completion); +vdo_preserve_completion_error_and_continue(struct vdo_completion *completion); -/** - * A callback which does nothing. This callback is intended to be set as an - * error handler in the case where an error should do nothing. - * - * @param completion The completion being called back - **/ -static inline -void noop_vdo_completion_callback(struct vdo_completion *completion __always_unused) -{ -} +void vdo_noop_completion_callback(struct vdo_completion *completion); -/** - * Assert that a completion is of the correct type - * - * @param actual The actual completion type - * @param expected The expected completion type - * - * @return VDO_SUCCESS or VDO_PARAMETER_MISMATCH - **/ -int assert_vdo_completion_type(enum vdo_completion_type actual, +int vdo_assert_completion_type(enum vdo_completion_type actual, enum vdo_completion_type expected); /** - * Return the name of a completion type. - * - * @param completion_type the completion type - * - * @return a pointer to a static string; if the completion_type is unknown - * this is to a static buffer that may be overwritten. 
- **/ -const char * -get_vdo_completion_type_name(enum vdo_completion_type completion_type); - -/** - * Set the callback for a completion. - * - * @param completion The completion - * @param callback The callback to register - * @param thread_id The ID of the thread on which the callback should run - **/ + * vdo_set_completion_callback() - Set the callback for a completion. + * @completion: The completion. + * @callback: The callback to register. + * @thread_id: The ID of the thread on which the callback should run. + */ static inline void -set_vdo_completion_callback(struct vdo_completion *completion, +vdo_set_completion_callback(struct vdo_completion *completion, vdo_action *callback, thread_id_t thread_id) { @@ -252,100 +182,103 @@ set_vdo_completion_callback(struct vdo_completion *completion, } /** - * Set the callback for a completion and invoke it immediately. - * - * @param completion The completion - * @param callback The callback to register - * @param thread_id The ID of the thread on which the callback should run - **/ + * vdo_launch_completion_callback() - Set the callback for a completion and + * invoke it immediately. + * @completion: The completion. + * @callback: The callback to register. + * @thread_id: The ID of the thread on which the callback should run. + */ static inline void -launch_vdo_completion_callback(struct vdo_completion *completion, +vdo_launch_completion_callback(struct vdo_completion *completion, vdo_action *callback, thread_id_t thread_id) { - set_vdo_completion_callback(completion, callback, thread_id); - invoke_vdo_completion_callback(completion); + vdo_set_completion_callback(completion, callback, thread_id); + vdo_invoke_completion_callback(completion); } /** - * Set the callback and parent for a completion. - * - * @param completion The completion - * @param callback The callback to register - * @param thread_id The ID of the thread on which the callback should run - * @param parent The new parent of the completion - **/ + * vdo_set_completion_callback_with_parent() - Set the callback and parent for + * a completion. + * @completion: The completion. + * @callback: The callback to register. + * @thread_id: The ID of the thread on which the callback should run. + * @parent: The new parent of the completion. + */ static inline void -set_vdo_completion_callback_with_parent(struct vdo_completion *completion, +vdo_set_completion_callback_with_parent(struct vdo_completion *completion, vdo_action *callback, thread_id_t thread_id, void *parent) { - set_vdo_completion_callback(completion, callback, thread_id); + vdo_set_completion_callback(completion, callback, thread_id); completion->parent = parent; } /** - * Set the callback and parent for a completion and invoke the callback - * immediately. - * - * @param completion The completion - * @param callback The callback to register - * @param thread_id The ID of the thread on which the callback should run - * @param parent The new parent of the completion - **/ + * vdo_launch_completion_callback_with_parent() - Set the callback and parent + * for a completion and invoke + * the callback immediately. + * @completion: The completion. + * @callback: The callback to register. + * @thread_id: The ID of the thread on which the callback should run. + * @parent: The new parent of the completion. 
+ */ static inline void -launch_vdo_completion_callback_with_parent(struct vdo_completion *completion, +vdo_launch_completion_callback_with_parent(struct vdo_completion *completion, vdo_action *callback, thread_id_t thread_id, void *parent) { - set_vdo_completion_callback_with_parent(completion, callback, + vdo_set_completion_callback_with_parent(completion, callback, thread_id, parent); - invoke_vdo_completion_callback(completion); + vdo_invoke_completion_callback(completion); } /** - * Prepare a completion for launch. Reset it, and then set its callback, error - * handler, callback thread, and parent. + * vdo_prepare_completion() - Prepare a completion for launch. + * @completion: The completion. + * @callback: The callback to register. + * @error_handler: The error handler to register. + * @thread_id: The ID of the thread on which the callback should run. + * @parent: The new parent of the completion. * - * @param completion The completion - * @param callback The callback to register - * @param error_handler The error handler to register - * @param thread_id The ID of the thread on which the callback should run - * @param parent The new parent of the completion - **/ -static inline void prepare_vdo_completion(struct vdo_completion *completion, + * Resets the completion, and then sets its callback, error handler, callback + * thread, and parent. + */ +static inline void vdo_prepare_completion(struct vdo_completion *completion, vdo_action *callback, vdo_action *error_handler, thread_id_t thread_id, void *parent) { - reset_vdo_completion(completion); - set_vdo_completion_callback_with_parent(completion, callback, + vdo_reset_completion(completion); + vdo_set_completion_callback_with_parent(completion, callback, thread_id, parent); completion->error_handler = error_handler; } /** - * Prepare a completion for launch ensuring that it will always be requeued. - * Reset it, and then set its callback, error handler, callback thread, and - * parent. + * vdo_prepare_completion_for_requeue() - Prepare a completion for launch + * ensuring that it will always be + * requeued. + * @completion: The completion. + * @callback: The callback to register. + * @error_handler: The error handler to register. + * @thread_id: The ID of the thread on which the callback should run. + * @parent: The new parent of the completion. * - * @param completion The completion - * @param callback The callback to register - * @param error_handler The error handler to register - * @param thread_id The ID of the thread on which the callback should run - * @param parent The new parent of the completion - **/ + * Resets the completion, and then sets its callback, error handler, callback + * thread, and parent. + */ static inline void -prepare_vdo_completion_for_requeue(struct vdo_completion *completion, +vdo_prepare_completion_for_requeue(struct vdo_completion *completion, vdo_action *callback, vdo_action *error_handler, thread_id_t thread_id, void *parent) { - prepare_vdo_completion(completion, + vdo_prepare_completion(completion, callback, error_handler, thread_id, @@ -354,29 +287,37 @@ prepare_vdo_completion_for_requeue(struct vdo_completion *completion, } /** - * Prepare a completion for launch which will complete its parent when - * finished. - * - * @param completion The completion - * @param parent The parent to complete - **/ + * vdo_prepare_completion_to_finish_parent() - Prepare a completion for launch + * which will complete its parent + * when finished. + * @completion: The completion. 
+ * @parent: The parent to complete. + */ static inline void -prepare_vdo_completion_to_finish_parent(struct vdo_completion *completion, +vdo_prepare_completion_to_finish_parent(struct vdo_completion *completion, struct vdo_completion *parent) { - prepare_vdo_completion(completion, - finish_vdo_completion_parent_callback, - finish_vdo_completion_parent_callback, + vdo_prepare_completion(completion, + vdo_finish_completion_parent_callback, + vdo_finish_completion_parent_callback, parent->callback_thread_id, parent); } +void +vdo_enqueue_completion_with_priority(struct vdo_completion *completion, + enum vdo_completion_priority priority); + /** - * A function to enqueue a vdo_completion to run on the thread specified by its - * callback_thread_id field. - * - * @param completion The completion to be enqueued - **/ -void enqueue_vdo_completion(struct vdo_completion *completion); + * vdo_enqueue_completion() - Enqueue a vdo_completion to run on the thread + * specified by its callback_thread_id field at + * default priority. + * @completion: The completion to be enqueued. + */ +static inline void vdo_enqueue_completion(struct vdo_completion *completion) +{ + vdo_enqueue_completion_with_priority(completion, + VDO_WORK_Q_DEFAULT_PRIORITY); +} -#endif // COMPLETION_H +#endif /* COMPLETION_H */ diff --git a/vdo/compressed-block.c b/vdo/compressed-block.c new file mode 100644 index 00000000..868b0055 --- /dev/null +++ b/vdo/compressed-block.c @@ -0,0 +1,127 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright Red Hat + */ + +#include "compressed-block.h" + +#include "permassert.h" +#include "string-utils.h" + +#include "status-codes.h" + +static const struct version_number COMPRESSED_BLOCK_1_0 = { + .major_version = 1, + .minor_version = 0, +}; + +enum { + COMPRESSED_BLOCK_1_0_SIZE = 4 + 4 + (2 * VDO_MAX_COMPRESSION_SLOTS), +}; + +static uint16_t +get_compressed_fragment_size(const struct compressed_block_header *header, + byte slot) +{ + return __le16_to_cpu(header->sizes[slot]); +} + +/** + * vdo_initialize_compressed_block() - Initialize a compressed block. + * @block: The compressed block to initialize. + * @size: The size of the agent's fragment. + * + * This method initializes the compressed block in the compressed + * write agent. Because the compressor already put the agent's + * compressed fragment at the start of the compressed block's data + * field, it needn't be copied. So all we need do is initialize the + * header and set the size of the agent's fragment. + */ +void vdo_initialize_compressed_block(struct compressed_block *block, + uint16_t size) +{ + /* + * Make sure the block layout isn't accidentally changed by changing + * the length of the block header. + */ + STATIC_ASSERT_SIZEOF(struct compressed_block_header, + COMPRESSED_BLOCK_1_0_SIZE); + + block->header.version = vdo_pack_version_number(COMPRESSED_BLOCK_1_0); + block->header.sizes[0] = __cpu_to_le16(size); +} + +/** + * vdo_get_compressed_block_fragment() - Get a reference to a compressed + * fragment from a compression block. + * @mapping_state [in] The mapping state for the look up. + * @compressed_block [in] The compressed block that was read from disk. + * @fragment_offset [out] The offset of the fragment within a compressed block. + * @fragment_size [out] The size of the fragment. + * + * Return: If a valid compressed fragment is found, VDO_SUCCESS; + * otherwise, VDO_INVALID_FRAGMENT if the fragment is invalid. 
+ */ +int vdo_get_compressed_block_fragment(enum block_mapping_state mapping_state, + struct compressed_block *block, + uint16_t *fragment_offset, + uint16_t *fragment_size) +{ + uint16_t compressed_size; + uint16_t offset = 0; + unsigned int i; + byte slot; + struct version_number version; + + if (!vdo_is_state_compressed(mapping_state)) { + return VDO_INVALID_FRAGMENT; + } + + version = vdo_unpack_version_number(block->header.version); + if (!vdo_are_same_version(version, COMPRESSED_BLOCK_1_0)) { + return VDO_INVALID_FRAGMENT; + } + + slot = vdo_get_slot_from_state(mapping_state); + if (slot >= VDO_MAX_COMPRESSION_SLOTS) { + return VDO_INVALID_FRAGMENT; + } + + compressed_size = get_compressed_fragment_size(&block->header, slot); + for (i = 0; i < slot; i++) { + offset += get_compressed_fragment_size(&block->header, i); + if (offset >= VDO_COMPRESSED_BLOCK_DATA_SIZE) { + return VDO_INVALID_FRAGMENT; + } + } + + if ((offset + compressed_size) > VDO_COMPRESSED_BLOCK_DATA_SIZE) { + return VDO_INVALID_FRAGMENT; + } + + *fragment_offset = offset; + *fragment_size = compressed_size; + return VDO_SUCCESS; +} + +/** + * vdo_put_compressed_block_fragment() - Copy a fragment into the compressed + * block. + * @block: The compressed block. + * @fragment: The number of the fragment. + * @offset: The byte offset of the fragment in the data area. + * @data: A pointer to the compressed data. + * @size: The size of the data. + * + * There is no bounds checking - the data better fit without smashing other + * stuff + */ +void vdo_put_compressed_block_fragment(struct compressed_block *block, + unsigned int fragment, + uint16_t offset, + const char *data, + uint16_t size) +{ + block->header.sizes[fragment] = __cpu_to_le16(size); + memcpy(&block->data[offset], data, size); +} diff --git a/vdo/compressed-block.h b/vdo/compressed-block.h new file mode 100644 index 00000000..291ec1e2 --- /dev/null +++ b/vdo/compressed-block.h @@ -0,0 +1,79 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright Red Hat + */ + +#ifndef COMPRESSED_BLOCK_H +#define COMPRESSED_BLOCK_H + +#include "compiler.h" + +#include "block-mapping-state.h" +#include "constants.h" +#include "header.h" +#include "types.h" + +/* + * The header of a compressed block. + */ +struct compressed_block_header { + /* + * Unsigned 32-bit major and minor versions, + * in little-endian byte order + */ + struct packed_version_number version; + + /* + * List of unsigned 16-bit compressed block sizes, + * in little-endian order + */ + __le16 sizes[VDO_MAX_COMPRESSION_SLOTS]; +} __packed; + +enum { + VDO_COMPRESSED_BLOCK_DATA_SIZE = + VDO_BLOCK_SIZE - sizeof(struct compressed_block_header), + + /* + * A compressed block is only written if we can pack at least two + * fragments into it, so a fragment which fills the entire data portion + * of a compressed block is too big. + */ + VDO_MAX_COMPRESSED_FRAGMENT_SIZE = VDO_COMPRESSED_BLOCK_DATA_SIZE - 1, +}; + +/* + * The compressed block overlay. 
+ */ +struct compressed_block { + struct compressed_block_header header; + char data[VDO_COMPRESSED_BLOCK_DATA_SIZE]; +} __packed; + +int vdo_get_compressed_block_fragment(enum block_mapping_state mapping_state, + struct compressed_block *block, + uint16_t *fragment_offset, + uint16_t *fragment_size); + +void vdo_initialize_compressed_block(struct compressed_block *block, + uint16_t size); + +static inline void +vdo_clear_unused_compression_slots(struct compressed_block *block, + slot_number_t first_unused) +{ + if (first_unused < VDO_MAX_COMPRESSION_SLOTS) { + memset(&block->header.sizes[first_unused], + 0, + ((VDO_MAX_COMPRESSION_SLOTS - first_unused) * + sizeof(__le16))); + } +} + +void vdo_put_compressed_block_fragment(struct compressed_block *block, + unsigned int fragment, + uint16_t offset, + const char *data, + uint16_t size); + +#endif /* COMPRESSED_BLOCK_H */ diff --git a/vdo/compressedBlock.c b/vdo/compressedBlock.c deleted file mode 100644 index 7cb6955e..00000000 --- a/vdo/compressedBlock.c +++ /dev/null @@ -1,114 +0,0 @@ -/* - * Copyright Red Hat - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/sulfur/src/c++/vdo/base/compressedBlock.c#14 $ - */ - -#include "compressedBlock.h" - -#include "memoryAlloc.h" -#include "numeric.h" -#include "permassert.h" - -#include "statusCodes.h" - -static const struct version_number COMPRESSED_BLOCK_1_0 = { - .major_version = 1, - .minor_version = 0, -}; - -enum { - COMPRESSED_BLOCK_1_0_SIZE = 4 + 4 + (2 * VDO_MAX_COMPRESSION_SLOTS), -}; - -/**********************************************************************/ -void reset_vdo_compressed_block_header(struct compressed_block_header *header) -{ - // Make sure the block layout isn't accidentally changed by changing - // the length of the block header. 
- STATIC_ASSERT_SIZEOF(struct compressed_block_header, - COMPRESSED_BLOCK_1_0_SIZE); - - header->version = pack_vdo_version_number(COMPRESSED_BLOCK_1_0); - memset(header->sizes, 0, sizeof(header->sizes)); -} - -/**********************************************************************/ -static uint16_t -get_compressed_fragment_size(const struct compressed_block_header *header, - byte slot) -{ - return __le16_to_cpu(header->sizes[slot]); -} - -/**********************************************************************/ -int get_vdo_compressed_block_fragment(enum block_mapping_state mapping_state, - char *buffer, - block_size_t block_size, - uint16_t *fragment_offset, - uint16_t *fragment_size) -{ - uint16_t compressed_size, offset; - unsigned int i; - byte slot; - struct version_number version; - struct compressed_block_header *header = - (struct compressed_block_header *) buffer; - - if (!vdo_is_state_compressed(mapping_state)) { - return VDO_INVALID_FRAGMENT; - } - - version = unpack_vdo_version_number(header->version); - if (!are_same_vdo_version(version, COMPRESSED_BLOCK_1_0)) { - return VDO_INVALID_FRAGMENT; - } - - slot = vdo_get_slot_from_state(mapping_state); - if (slot >= VDO_MAX_COMPRESSION_SLOTS) { - return VDO_INVALID_FRAGMENT; - } - - compressed_size = get_compressed_fragment_size(header, slot); - offset = sizeof(struct compressed_block_header); - for (i = 0; i < slot; i++) { - offset += get_compressed_fragment_size(header, i); - if (offset >= block_size) { - return VDO_INVALID_FRAGMENT; - } - } - - if ((offset + compressed_size) > block_size) { - return VDO_INVALID_FRAGMENT; - } - - *fragment_offset = offset; - *fragment_size = compressed_size; - return VDO_SUCCESS; -} - -/**********************************************************************/ -void put_vdo_compressed_block_fragment(struct compressed_block *block, - unsigned int fragment, - uint16_t offset, - const char *data, - uint16_t size) -{ - block->header.sizes[fragment] = __cpu_to_le16(size); - memcpy(&block->data[offset], data, size); -} diff --git a/vdo/compressedBlock.h b/vdo/compressedBlock.h deleted file mode 100644 index 50add700..00000000 --- a/vdo/compressedBlock.h +++ /dev/null @@ -1,97 +0,0 @@ -/* - * Copyright Red Hat - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/sulfur/src/c++/vdo/base/compressedBlock.h#10 $ - */ - -#ifndef COMPRESSED_BLOCK_H -#define COMPRESSED_BLOCK_H - -#include "blockMappingState.h" -#include "header.h" - -/** - * The header of a compressed block. 
- **/ -struct compressed_block_header { - /** - * Unsigned 32-bit major and minor versions, - * in little-endian byte order - */ - struct packed_version_number version; - - /** - * List of unsigned 16-bit compressed block sizes, - * in little-endian order - */ - __le16 sizes[VDO_MAX_COMPRESSION_SLOTS]; -} __packed; - -/** - * The compressed block overlay. - **/ -struct compressed_block { - struct compressed_block_header header; - char data[]; -} __packed; - -/** - * Initializes/resets a compressed block header. - * - * @param header the header - * - * When done, the version number is set to the current version, and all - * fragments are empty. - **/ -void reset_vdo_compressed_block_header(struct compressed_block_header *header); - -/** - * Get a reference to a compressed fragment from a compression block. - * - * @param [in] mapping_state the mapping state for the look up - * @param [in] buffer buffer that contains compressed data - * @param [in] block_size size of a data block - * @param [out] fragment_offset the offset of the fragment within a - * compressed block - * @param [out] fragment_size the size of the fragment - * - * @return If a valid compressed fragment is found, VDO_SUCCESS; - * otherwise, VDO_INVALID_FRAGMENT if the fragment is invalid. - **/ -int get_vdo_compressed_block_fragment(enum block_mapping_state mapping_state, - char *buffer, - block_size_t block_size, - uint16_t *fragment_offset, - uint16_t *fragment_size); - -/** - * Copy a fragment into the compressed block. - * - * @param block the compressed block - * @param fragment the number of the fragment - * @param offset the byte offset of the fragment in the data area - * @param data a pointer to the compressed data - * @param size the size of the data - * - * @note no bounds checking -- the data better fit without smashing other stuff - **/ -void put_vdo_compressed_block_fragment(struct compressed_block *block, - unsigned int fragment, uint16_t offset, - const char *data, uint16_t size); - -#endif // COMPRESSED_BLOCK_H diff --git a/vdo/compression-state.c b/vdo/compression-state.c new file mode 100644 index 00000000..ce3ab2f7 --- /dev/null +++ b/vdo/compression-state.c @@ -0,0 +1,281 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright Red Hat + */ + +#include "compression-state.h" + +#include + +#include "data-vio.h" +#include "kernel-types.h" +#include "packer.h" +#include "types.h" +#include "vdo.h" +#include "vio.h" + +static const uint32_t STATUS_MASK = 0xff; +static const uint32_t MAY_NOT_COMPRESS_MASK = 0x80000000; + +/** + * get_vio_compression_state() - Get the compression state of a data_vio. + * @data_vio: The data_vio. + * + * Return: The compression state. + */ +struct vio_compression_state get_vio_compression_state(struct data_vio *data_vio) +{ + uint32_t packed = atomic_read(&data_vio->compression.state); + + smp_rmb(); + return (struct vio_compression_state) { + .status = packed & STATUS_MASK, + .may_not_compress = ((packed & MAY_NOT_COMPRESS_MASK) != 0), + }; +} + +/** + * pack_state() - Convert a vio_compression_state into a uint32_t which may + * be stored atomically. + * @state: The state to convert. + * + * Return: The compression state packed into a uint32_t. + */ +static uint32_t __must_check pack_state(struct vio_compression_state state) +{ + return state.status + | (state.may_not_compress ? MAY_NOT_COMPRESS_MASK : 0); +} + +/** + * set_vio_compression_state() - Set the compression state of a data_vio. + * @data_vio: The data_vio whose compression state is to be set. 
+ * @state: The expected current state of the data_vio.
+ * @new_state: The state to set.
+ *
+ * Return: true if the new state was set, false if the data_vio's
+ * compression state did not match the expected state, and so was
+ * left unchanged.
+ */
+static bool __must_check
+set_vio_compression_state(struct data_vio *data_vio,
+ struct vio_compression_state state,
+ struct vio_compression_state new_state)
+{
+ uint32_t actual;
+ uint32_t expected = pack_state(state);
+ uint32_t replacement = pack_state(new_state);
+
+ /*
+ * Extra barriers because this was originally developed using
+ * a CAS operation that implicitly had them.
+ */
+ smp_mb__before_atomic();
+ actual = atomic_cmpxchg(&data_vio->compression.state,
+ expected, replacement);
+ smp_mb__after_atomic();
+ return (expected == actual);
+}
+
+/**
+ * advance_status() - Advance to the next compression state along the
+ * compression path.
+ * @data_vio: The data_vio to advance.
+ *
+ * Return: The new compression status of the data_vio.
+ */
+static enum vio_compression_status advance_status(struct data_vio *data_vio)
+{
+ for (;;) {
+ struct vio_compression_state state =
+ get_vio_compression_state(data_vio);
+ struct vio_compression_state new_state = state;
+
+ if (state.status == VIO_POST_PACKER) {
+ /* We're already in the last state. */
+ return state.status;
+ }
+
+ if (state.may_not_compress) {
+ /*
+ * Compression has been disallowed for this VIO, so
+ * skip the rest of the path and go to the end.
+ */
+ new_state.status = VIO_POST_PACKER;
+ } else {
+ /* Go to the next state. */
+ new_state.status++;
+ }
+
+ if (set_vio_compression_state(data_vio, state, new_state)) {
+ return new_state.status;
+ }
+
+ /*
+ * Another thread changed the state out from under us so try
+ * again.
+ */
+ }
+}
+
+/**
+ * may_compress_data_vio() - Check whether a data_vio may go to the compressor.
+ * @data_vio: The data_vio to check.
+ *
+ * Return: true if the data_vio may be compressed at this time.
+ */
+bool may_compress_data_vio(struct data_vio *data_vio)
+{
+ if (!data_vio_has_allocation(data_vio) ||
+ data_vio_requires_fua(data_vio) ||
+ !vdo_get_compressing(vdo_from_data_vio(data_vio))) {
+ /*
+ * If this VIO didn't get an allocation, the compressed write
+ * probably won't either, so don't try compressing it. Also, if
+ * compression is off, don't compress.
+ */
+ set_vio_compression_done(data_vio);
+ return false;
+ }
+
+ if (data_vio->hash_lock == NULL) {
+ /*
+ * data_vios without a hash_lock (which should be extremely
+ * rare) aren't able to share the packer's PBN lock, so don't
+ * try to compress them.
+ */
+ set_vio_compression_done(data_vio);
+ return false;
+ }
+
+ /*
+ * If the original bio was a discard, but we got this far because the
+ * discard was a partial one (r/m/w), and it is part of a larger
+ * discard, we cannot compress this vio. We need to make sure the vio
+ * completes ASAP.
+ *
+ * XXX: given the hash lock bailout, is this even possible?
+ */
+ if ((data_vio->user_bio != NULL) &&
+ (bio_op(data_vio->user_bio) == REQ_OP_DISCARD) &&
+ (data_vio->remaining_discard > 0)) {
+ set_vio_compression_done(data_vio);
+ return false;
+ }
+
+ return (advance_status(data_vio) == VIO_COMPRESSING);
+}
+
+/**
+ * may_pack_data_vio() - Check whether a data_vio may go to the packer.
+ * @data_vio: The data_vio to check.
+ *
+ * Return: true if the data_vio may be packed at this time.
+ */ +bool may_pack_data_vio(struct data_vio *data_vio) +{ + if (!vdo_data_is_sufficiently_compressible(data_vio) || + !vdo_get_compressing(vdo_from_data_vio(data_vio)) || + get_vio_compression_state(data_vio).may_not_compress) { + /* + * If the data in this VIO doesn't compress, or compression is + * off, or compression for this VIO has been canceled, don't + * send it to the packer. + */ + set_vio_compression_done(data_vio); + return false; + } + + return true; +} + +/** + * may_vio_block_in_packer() - Check whether a data_vio which has gone + * to the packer may block there. + * @data_vio: The data_vio to check. + * + * Any cancelation after this point and before the data_vio is written + * out requires this data_vio to be picked up by the canceling + * data_vio. + * + * Return: true if the data_vio may block in the packer. + */ +bool may_vio_block_in_packer(struct data_vio *data_vio) +{ + return (advance_status(data_vio) == VIO_PACKING); +} + +/** + * may_write_compressed_data_vio() - Check whether the packer may write out a + * data_vio as part of a compressed block. + * @data_vio: The data_vio to check. + * + * Return: true if the data_vio may be written as part of a + * compressed block at this time. + */ +bool may_write_compressed_data_vio(struct data_vio *data_vio) +{ + advance_status(data_vio); + return !get_vio_compression_state(data_vio).may_not_compress; +} + +/** + * set_vio_compression_done() - Indicate that this data_vio is leaving the + * compression path. + * @data_vio: The data_vio leaving the compression path. + */ +void set_vio_compression_done(struct data_vio *data_vio) +{ + for (;;) { + struct vio_compression_state new_state = { + .status = VIO_POST_PACKER, + .may_not_compress = true, + }; + struct vio_compression_state state = + get_vio_compression_state(data_vio); + + if (state.status == VIO_POST_PACKER) { + /* The VIO is already done. */ + return; + } + + /* If compression was cancelled on this VIO, preserve that fact. */ + if (set_vio_compression_state(data_vio, state, new_state)) { + return; + } + } +} + +/** + * cancel_vio_compression() - Prevent this data_vio from being compressed + * or packed. + * @data_vio: The data_vio to cancel. + * + * Return: true if the data_vio is in the packer and the caller + * was the first caller to cancel it. + */ +bool cancel_vio_compression(struct data_vio *data_vio) +{ + struct vio_compression_state state, new_state; + + for (;;) { + state = get_vio_compression_state(data_vio); + if (state.may_not_compress || + (state.status == VIO_POST_PACKER)) { + /* + * This data_vio is already set up to not block in the + * packer. + */ + break; + } + + new_state.status = state.status; + new_state.may_not_compress = true; + + if (set_vio_compression_state(data_vio, state, new_state)) { + break; + } + } + + return ((state.status == VIO_PACKING) && !state.may_not_compress); +} diff --git a/vdo/compression-state.h b/vdo/compression-state.h new file mode 100644 index 00000000..f33fee6d --- /dev/null +++ b/vdo/compression-state.h @@ -0,0 +1,49 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright Red Hat + */ + +#ifndef COMPRESSION_STATE_H +#define COMPRESSION_STATE_H + +#include "kernel-types.h" +#include "types.h" + +/* + * Where a data_vio is on the compression path; advance_status() depends on the + * order of this enum. 
+ */ +enum vio_compression_status { + /* A VIO which has not yet entered the compression path */ + VIO_PRE_COMPRESSOR, + /* A VIO which is in the compressor */ + VIO_COMPRESSING, + /* A VIO which is blocked in the packer */ + VIO_PACKING, + /* + * A VIO which is no longer on the compression path (and never will be) + */ + VIO_POST_PACKER, +}; + +struct vio_compression_state { + enum vio_compression_status status; + bool may_not_compress; +}; + +struct vio_compression_state __must_check +get_vio_compression_state(struct data_vio *data_vio); + +bool __must_check may_compress_data_vio(struct data_vio *data_vio); + +bool __must_check may_pack_data_vio(struct data_vio *data_vio); + +bool __must_check may_vio_block_in_packer(struct data_vio *data_vio); + +bool __must_check may_write_compressed_data_vio(struct data_vio *data_vio); + +void set_vio_compression_done(struct data_vio *data_vio); + +bool cancel_vio_compression(struct data_vio *data_vio); + +#endif /* COMPRESSION_STATE_H */ diff --git a/vdo/compressionState.c b/vdo/compressionState.c deleted file mode 100644 index d807ce68..00000000 --- a/vdo/compressionState.c +++ /dev/null @@ -1,220 +0,0 @@ -/* - * Copyright Red Hat - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/sulfur/src/c++/vdo/base/compressionState.c#19 $ - */ - -#include "compressionState.h" - - -#include "dataVIO.h" -#include "packer.h" - -static const uint32_t STATUS_MASK = 0xff; -static const uint32_t MAY_NOT_COMPRESS_MASK = 0x80000000; - -/**********************************************************************/ -struct vio_compression_state get_vio_compression_state(struct data_vio *data_vio) -{ - uint32_t packed = atomic_read(&data_vio->compression.state); - smp_rmb(); - return (struct vio_compression_state) { - .status = packed & STATUS_MASK, - .may_not_compress = ((packed & MAY_NOT_COMPRESS_MASK) != 0), - }; -} - -/** - * Convert a vio_compression_state into a uint32_t which may be stored - * atomically. - * - * @param state The state to convert - * - * @return The compression state packed into a uint32_t - **/ -static uint32_t __must_check pack_state(struct vio_compression_state state) -{ - return state.status - | (state.may_not_compress ? MAY_NOT_COMPRESS_MASK : 0); -} - -/** - * Set the compression state of a data_vio. 
- * - * @param data_vio The data_vio whose compression state is to be set - * @param state The expected current state of the data_vio - * @param new_state The state to set - * - * @return true if the new state was set, false if the data_vio's - * compression state did not match the expected state, and so was - * left unchanged - **/ -static bool __must_check -set_vio_compression_state(struct data_vio *data_vio, - struct vio_compression_state state, - struct vio_compression_state new_state) -{ - uint32_t actual; - uint32_t expected = pack_state(state); - uint32_t replacement = pack_state(new_state); - - // Extra barriers because this was original developed using - // a CAS operation that implicitly had them. - smp_mb__before_atomic(); - actual = atomic_cmpxchg(&data_vio->compression.state, - expected, replacement); - smp_mb__after_atomic(); - return (expected == actual); -} - -/** - * Advance to the next compression state along the compression path. - * - * @param data_vio The data_vio to advance - * - * @return The new compression status of the data_vio - **/ -static enum vio_compression_status advance_status(struct data_vio *data_vio) -{ - for (;;) { - struct vio_compression_state state = - get_vio_compression_state(data_vio); - struct vio_compression_state new_state = state; - if (state.status == VIO_POST_PACKER) { - // We're already in the last state. - return state.status; - } - - if (state.may_not_compress) { - // Compression has been dis-allowed for this VIO, so - // skip the rest of the path and go to the end. - new_state.status = VIO_POST_PACKER; - } else { - // Go to the next state. - new_state.status++; - } - - if (set_vio_compression_state(data_vio, state, new_state)) { - return new_state.status; - } - - // Another thread changed the state out from under us so try - // again. - } -} - -/**********************************************************************/ -bool may_compress_data_vio(struct data_vio *data_vio) -{ - if (!data_vio_has_allocation(data_vio) || - vio_requires_flush_after(data_vio_as_vio(data_vio)) || - !get_vdo_compressing(get_vdo_from_data_vio(data_vio))) { - /* - * If this VIO didn't get an allocation, the compressed write - * probably won't either, so don't try compressing it. Also, if - * compression is off, don't compress. - */ - set_vio_compression_done(data_vio); - return false; - } - - if (data_vio->hash_lock == NULL) { - // data_vios without a hash_lock (which should be extremely - // rare) aren't able to share the packer's PBN lock, so don't - // try to compress them. - return false; - } - - return (advance_status(data_vio) == VIO_COMPRESSING); -} - -/**********************************************************************/ -bool may_pack_data_vio(struct data_vio *data_vio) -{ - if (!vdo_data_is_sufficiently_compressible(data_vio) || - !get_vdo_compressing(get_vdo_from_data_vio(data_vio)) || - get_vio_compression_state(data_vio).may_not_compress) { - // If the data in this VIO doesn't compress, or compression is - // off, or compression for this VIO has been canceled, don't - // send it to the packer. 
- set_vio_compression_done(data_vio); - return false; - } - - return true; -} - -/**********************************************************************/ -bool may_vio_block_in_packer(struct data_vio *data_vio) -{ - return (advance_status(data_vio) == VIO_PACKING); -} - -/**********************************************************************/ -bool may_write_compressed_data_vio(struct data_vio *data_vio) -{ - advance_status(data_vio); - return !get_vio_compression_state(data_vio).may_not_compress; -} - -/**********************************************************************/ -void set_vio_compression_done(struct data_vio *data_vio) -{ - for (;;) { - struct vio_compression_state new_state = { - .status = VIO_POST_PACKER, - .may_not_compress = true, - }; - struct vio_compression_state state = - get_vio_compression_state(data_vio); - - if (state.status == VIO_POST_PACKER) { - // The VIO is already done. - return; - } - - // If compression was cancelled on this VIO, preserve that fact. - if (set_vio_compression_state(data_vio, state, new_state)) { - return; - } - } -} - -/**********************************************************************/ -bool cancel_vio_compression(struct data_vio *data_vio) -{ - struct vio_compression_state state, new_state; - for (;;) { - state = get_vio_compression_state(data_vio); - if (state.may_not_compress || - (state.status == VIO_POST_PACKER)) { - // This data_vio is already set up to not block in the - // packer. - break; - } - - new_state.status = state.status; - new_state.may_not_compress = true; - - if (set_vio_compression_state(data_vio, state, new_state)) { - break; - } - } - - return ((state.status == VIO_PACKING) && !state.may_not_compress); -} diff --git a/vdo/compressionState.h b/vdo/compressionState.h deleted file mode 100644 index a3376c56..00000000 --- a/vdo/compressionState.h +++ /dev/null @@ -1,116 +0,0 @@ -/* - * Copyright Red Hat - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/sulfur/src/c++/vdo/base/compressionState.h#11 $ - */ - -#ifndef COMPRESSION_STATE_H -#define COMPRESSION_STATE_H - -#include "types.h" - -/** - * Where a data_vio is on the compression path; advance_status() depends on the - * order of this enum. - **/ -enum vio_compression_status { - /* A VIO which has not yet entered the compression path */ - VIO_PRE_COMPRESSOR = 0, - /* A VIO which is in the compressor */ - VIO_COMPRESSING, - /* A VIO which is blocked in the packer */ - VIO_PACKING, - /* - * A VIO which is no longer on the compression path (and never will be) - */ - VIO_POST_PACKER, -}; - -struct vio_compression_state { - enum vio_compression_status status; - bool may_not_compress; -}; - -/** - * Get the compression state of a data_vio. 
- * - * @param data_vio The data_vio - * - * @return The compression state - **/ -struct vio_compression_state __must_check -get_vio_compression_state(struct data_vio *data_vio); - -/** - * Check whether a data_vio may go to the compressor. - * - * @param data_vio The data_vio to check - * - * @return true if the data_vio may be compressed at this time - **/ -bool __must_check may_compress_data_vio(struct data_vio *data_vio); - -/** - * Check whether a data_vio may go to the packer. - * - * @param data_vio The data_vio to check - * - * @return true if the data_vio may be packed at this time - **/ -bool __must_check may_pack_data_vio(struct data_vio *data_vio); - -/** - * Check whether a data_vio which has gone to the packer may block there. Any - * cancelation after this point and before the data_vio is written out requires - * this data_vio to be picked up by the canceling data_vio. - * - * @param data_vio The data_vio to check - * - * @return true if the data_vio may block in the packer - **/ -bool __must_check may_vio_block_in_packer(struct data_vio *data_vio); - -/** - * Check whether the packer may write out a data_vio as part of a compressed - * block. - * - * @param data_vio The data_vio to check - * - * @return true if the data_vio may be written as part of a - * compressed block at this time - **/ -bool __must_check may_write_compressed_data_vio(struct data_vio *data_vio); - -/** - * Indicate that this data_vio is leaving the compression path. - * - * @param data_vio The data_vio leaving the compression path - **/ -void set_vio_compression_done(struct data_vio *data_vio); - -/** - * Prevent this data_vio from being compressed or packed. - * - * @param data_vio The data_vio to cancel - * - * @return true if the data_vio is in the packer and the caller - * was the first caller to cancel it - **/ -bool cancel_vio_compression(struct data_vio *data_vio); - -#endif /* COMPRESSION_STATE_H */ diff --git a/vdo/config.c b/vdo/config.c new file mode 100644 index 00000000..da5699b8 --- /dev/null +++ b/vdo/config.c @@ -0,0 +1,714 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright Red Hat + */ + +#include "config.h" + +#include "buffer.h" +#include "logger.h" +#include "memory-alloc.h" +#include "string-utils.h" +#include "uds-threads.h" + +static const byte INDEX_CONFIG_MAGIC[] = "ALBIC"; +static const byte INDEX_CONFIG_VERSION_6_02[] = "06.02"; +static const byte INDEX_CONFIG_VERSION_8_02[] = "08.02"; + +enum { + DEFAULT_VOLUME_READ_THREADS = 2, + MAX_VOLUME_READ_THREADS = 16, + INDEX_CONFIG_MAGIC_LENGTH = sizeof(INDEX_CONFIG_MAGIC) - 1, + INDEX_CONFIG_VERSION_LENGTH = sizeof(INDEX_CONFIG_VERSION_6_02) - 1, +}; + +static int __must_check +decode_index_config_06_02(struct buffer *buffer, + struct uds_configuration_8_02 *config) +{ + int result; + + result = get_uint32_le_from_buffer(buffer, + &config->record_pages_per_chapter); + if (result != UDS_SUCCESS) { + return result; + } + + result = get_uint32_le_from_buffer(buffer, + &config->chapters_per_volume); + if (result != UDS_SUCCESS) { + return result; + } + + result = get_uint32_le_from_buffer(buffer, + &config->sparse_chapters_per_volume); + if (result != UDS_SUCCESS) { + return result; + } + + result = get_uint32_le_from_buffer(buffer, &config->cache_chapters); + if (result != UDS_SUCCESS) { + return result; + } + + result = skip_forward(buffer, sizeof(uint32_t)); + if (result != UDS_SUCCESS) { + return result; + } + + result = get_uint32_le_from_buffer(buffer, + &config->volume_index_mean_delta); + if (result != UDS_SUCCESS) 
{ + return result; + } + + result = get_uint32_le_from_buffer(buffer, &config->bytes_per_page); + if (result != UDS_SUCCESS) { + return result; + } + + result = get_uint32_le_from_buffer(buffer, + &config->sparse_sample_rate); + if (result != UDS_SUCCESS) { + return result; + } + + result = get_uint64_le_from_buffer(buffer, &config->nonce); + if (result != UDS_SUCCESS) { + return result; + } + + config->remapped_virtual = 0; + config->remapped_physical = 0; + + result = ASSERT_LOG_ONLY(content_length(buffer) == 0, + "%zu bytes read but not decoded", + content_length(buffer)); + if (result != UDS_SUCCESS) { + return UDS_CORRUPT_DATA; + } + + return UDS_SUCCESS; +} + +static int __must_check +decode_index_config_08_02(struct buffer *buffer, + struct uds_configuration_8_02 *config) +{ + int result; + + result = get_uint32_le_from_buffer(buffer, + &config->record_pages_per_chapter); + if (result != UDS_SUCCESS) { + return result; + } + + result = get_uint32_le_from_buffer(buffer, + &config->chapters_per_volume); + if (result != UDS_SUCCESS) { + return result; + } + + result = get_uint32_le_from_buffer(buffer, + &config->sparse_chapters_per_volume); + if (result != UDS_SUCCESS) { + return result; + } + + result = get_uint32_le_from_buffer(buffer, &config->cache_chapters); + if (result != UDS_SUCCESS) { + return result; + } + + result = skip_forward(buffer, sizeof(uint32_t)); + if (result != UDS_SUCCESS) { + return result; + } + + result = get_uint32_le_from_buffer(buffer, + &config->volume_index_mean_delta); + if (result != UDS_SUCCESS) { + return result; + } + + result = get_uint32_le_from_buffer(buffer, &config->bytes_per_page); + if (result != UDS_SUCCESS) { + return result; + } + + result = get_uint32_le_from_buffer(buffer, + &config->sparse_sample_rate); + if (result != UDS_SUCCESS) { + return result; + } + + result = get_uint64_le_from_buffer(buffer, &config->nonce); + if (result != UDS_SUCCESS) { + return result; + } + + result = get_uint64_le_from_buffer(buffer, &config->remapped_virtual); + if (result != UDS_SUCCESS) { + return result; + } + + result = get_uint64_le_from_buffer(buffer, &config->remapped_physical); + if (result != UDS_SUCCESS) { + return result; + } + + result = ASSERT_LOG_ONLY(content_length(buffer) == 0, + "%zu bytes read but not decoded", + content_length(buffer)); + if (result != UDS_SUCCESS) { + return UDS_CORRUPT_DATA; + } + + return result; +} + +static bool is_version(const byte *version, byte *buffer) +{ + return (memcmp(version, buffer, INDEX_CONFIG_VERSION_LENGTH) == 0); +} + +static int read_version(struct buffered_reader *reader, + struct uds_configuration_8_02 *conf) +{ + byte version_buffer[INDEX_CONFIG_VERSION_LENGTH]; + struct buffer *buffer; + int result; + + result = read_from_buffered_reader(reader, + version_buffer, + INDEX_CONFIG_VERSION_LENGTH); + if (result != UDS_SUCCESS) { + return uds_log_error_strerror(result, + "cannot read index config version"); + } + + if (is_version(INDEX_CONFIG_VERSION_6_02, version_buffer)) { + result = make_buffer(sizeof(struct uds_configuration_6_02), + &buffer); + if (result != UDS_SUCCESS) { + return result; + } + + result = read_from_buffered_reader(reader, + get_buffer_contents(buffer), + buffer_length(buffer)); + if (result != UDS_SUCCESS) { + free_buffer(UDS_FORGET(buffer)); + return uds_log_error_strerror(result, + "cannot read config data"); + } + + clear_buffer(buffer); + result = decode_index_config_06_02(buffer, conf); + free_buffer(UDS_FORGET(buffer)); + } else if (is_version(INDEX_CONFIG_VERSION_8_02, 
version_buffer)) {
+ result = make_buffer(sizeof(struct uds_configuration_8_02),
+ &buffer);
+ if (result != UDS_SUCCESS) {
+ return result;
+ }
+
+ result = read_from_buffered_reader(reader,
+ get_buffer_contents(buffer),
+ buffer_length(buffer));
+ if (result != UDS_SUCCESS) {
+ free_buffer(UDS_FORGET(buffer));
+ return uds_log_error_strerror(result,
+ "cannot read config data");
+ }
+
+ clear_buffer(buffer);
+ result = decode_index_config_08_02(buffer, conf);
+ free_buffer(UDS_FORGET(buffer));
+ } else {
+ uds_log_error_strerror(result,
+ "unsupported configuration version: '%.*s'",
+ INDEX_CONFIG_VERSION_LENGTH,
+ version_buffer);
+ result = UDS_CORRUPT_DATA;
+ }
+
+ return result;
+}
+
+static bool are_matching_configurations(struct uds_configuration_8_02 *saved,
+ struct configuration *user)
+{
+ struct geometry *geometry = user->geometry;
+ bool result = true;
+
+ if (saved->record_pages_per_chapter !=
+ geometry->record_pages_per_chapter) {
+ uds_log_error("Record pages per chapter (%u) does not match (%u)",
+ saved->record_pages_per_chapter,
+ geometry->record_pages_per_chapter);
+ result = false;
+ }
+
+ if (saved->chapters_per_volume != geometry->chapters_per_volume) {
+ uds_log_error("Chapter count (%u) does not match (%u)",
+ saved->chapters_per_volume,
+ geometry->chapters_per_volume);
+ result = false;
+ }
+
+ if (saved->sparse_chapters_per_volume !=
+ geometry->sparse_chapters_per_volume) {
+ uds_log_error("Sparse chapter count (%u) does not match (%u)",
+ saved->sparse_chapters_per_volume,
+ geometry->sparse_chapters_per_volume);
+ result = false;
+ }
+
+ if (saved->cache_chapters != user->cache_chapters) {
+ uds_log_error("Cache size (%u) does not match (%u)",
+ saved->cache_chapters,
+ user->cache_chapters);
+ result = false;
+ }
+
+ if (saved->volume_index_mean_delta != user->volume_index_mean_delta) {
+ uds_log_error("Volume index mean delta (%u) does not match (%u)",
+ saved->volume_index_mean_delta,
+ user->volume_index_mean_delta);
+ result = false;
+ }
+
+ if (saved->bytes_per_page != geometry->bytes_per_page) {
+ uds_log_error("Bytes per page value (%u) does not match (%zu)",
+ saved->bytes_per_page,
+ geometry->bytes_per_page);
+ result = false;
+ }
+
+ if (saved->sparse_sample_rate != user->sparse_sample_rate) {
+ uds_log_error("Sparse sample rate (%u) does not match (%u)",
+ saved->sparse_sample_rate,
+ user->sparse_sample_rate);
+ result = false;
+ }
+
+ if (saved->nonce != user->nonce) {
+ uds_log_error("Nonce (%llu) does not match (%llu)",
+ (unsigned long long) saved->nonce,
+ (unsigned long long) user->nonce);
+ result = false;
+ }
+
+ return result;
+}
+
+/* Read the configuration and validate it against the provided one.
*/ +int validate_config_contents(struct buffered_reader *reader, + struct configuration *config) +{ + int result; + struct uds_configuration_8_02 saved; + + result = verify_buffered_data(reader, + INDEX_CONFIG_MAGIC, + INDEX_CONFIG_MAGIC_LENGTH); + if (result != UDS_SUCCESS) { + return result; + } + + result = read_version(reader, &saved); + if (result != UDS_SUCCESS) { + uds_log_error_strerror(result, "Failed to read index config"); + return result; + } + + if (!are_matching_configurations(&saved, config)) { + uds_log_warning("Supplied configuration does not match save"); + return UDS_NO_INDEX; + } + + config->geometry->remapped_virtual = saved.remapped_virtual; + config->geometry->remapped_physical = saved.remapped_physical; + return UDS_SUCCESS; +} + +static int __must_check +encode_index_config_06_02(struct buffer *buffer, struct configuration *config) +{ + int result; + struct geometry *geometry = config->geometry; + + result = put_uint32_le_into_buffer(buffer, + geometry->record_pages_per_chapter); + if (result != UDS_SUCCESS) { + return result; + } + + result = put_uint32_le_into_buffer(buffer, + geometry->chapters_per_volume); + if (result != UDS_SUCCESS) { + return result; + } + + result = put_uint32_le_into_buffer(buffer, + geometry->sparse_chapters_per_volume); + if (result != UDS_SUCCESS) { + return result; + } + + result = put_uint32_le_into_buffer(buffer, config->cache_chapters); + if (result != UDS_SUCCESS) { + return result; + } + + result = zero_bytes(buffer, sizeof(uint32_t)); + if (result != UDS_SUCCESS) { + return result; + } + + result = put_uint32_le_into_buffer(buffer, + config->volume_index_mean_delta); + if (result != UDS_SUCCESS) { + return result; + } + + result = put_uint32_le_into_buffer(buffer, geometry->bytes_per_page); + if (result != UDS_SUCCESS) { + return result; + } + + result = put_uint32_le_into_buffer(buffer, config->sparse_sample_rate); + if (result != UDS_SUCCESS) { + return result; + } + + result = put_uint64_le_into_buffer(buffer, config->nonce); + if (result != UDS_SUCCESS) { + return result; + } + + return ASSERT_LOG_ONLY((available_space(buffer) == 0), + "%zu bytes encoded, of %zu expected", + content_length(buffer), + buffer_length(buffer)); +} + +static int __must_check +encode_index_config_08_02(struct buffer *buffer, struct configuration *config) +{ + int result; + struct geometry *geometry = config->geometry; + + result = put_uint32_le_into_buffer(buffer, + geometry->record_pages_per_chapter); + if (result != UDS_SUCCESS) { + return result; + } + + result = put_uint32_le_into_buffer(buffer, + geometry->chapters_per_volume); + if (result != UDS_SUCCESS) { + return result; + } + + result = put_uint32_le_into_buffer(buffer, + geometry->sparse_chapters_per_volume); + if (result != UDS_SUCCESS) { + return result; + } + + result = put_uint32_le_into_buffer(buffer, config->cache_chapters); + if (result != UDS_SUCCESS) { + return result; + } + + result = zero_bytes(buffer, sizeof(uint32_t)); + if (result != UDS_SUCCESS) { + return result; + } + + result = put_uint32_le_into_buffer(buffer, + config->volume_index_mean_delta); + if (result != UDS_SUCCESS) { + return result; + } + + result = put_uint32_le_into_buffer(buffer, geometry->bytes_per_page); + if (result != UDS_SUCCESS) { + return result; + } + + result = put_uint32_le_into_buffer(buffer, config->sparse_sample_rate); + if (result != UDS_SUCCESS) { + return result; + } + + result = put_uint64_le_into_buffer(buffer, config->nonce); + if (result != UDS_SUCCESS) { + return result; + } + + result 
= put_uint64_le_into_buffer(buffer, geometry->remapped_virtual); + if (result != UDS_SUCCESS) { + return result; + } + + result = put_uint64_le_into_buffer(buffer, + geometry->remapped_physical); + if (result != UDS_SUCCESS) { + return result; + } + + return ASSERT_LOG_ONLY((available_space(buffer) == 0), + "%zu bytes encoded, of %zu expected", + content_length(buffer), + buffer_length(buffer)); +} + +/* + * Write the configuration to stable storage. If the superblock + * version is < 4, write the 6.02 version; otherwise write the 8.02 + * version, indicating the configuration is for an index that has been + * reduced by one chapter. + */ +int write_config_contents(struct buffered_writer *writer, + struct configuration *config, + uint32_t version) +{ + int result; + struct buffer *buffer; + + result = write_to_buffered_writer(writer, + INDEX_CONFIG_MAGIC, + INDEX_CONFIG_MAGIC_LENGTH); + if (result != UDS_SUCCESS) { + return result; + } + + /* + * If version is < 4, the index has not been reduced by a + * chapter so it must be written out as version 6.02 so that + * it is still compatible with older versions of UDS. + */ + if (version < 4) { + result = write_to_buffered_writer(writer, + INDEX_CONFIG_VERSION_6_02, + INDEX_CONFIG_VERSION_LENGTH); + if (result != UDS_SUCCESS) { + return result; + } + + result = make_buffer(sizeof(struct uds_configuration_6_02), + &buffer); + if (result != UDS_SUCCESS) { + return result; + } + + result = encode_index_config_06_02(buffer, config); + if (result != UDS_SUCCESS) { + free_buffer(UDS_FORGET(buffer)); + return result; + } + } else { + result = write_to_buffered_writer(writer, + INDEX_CONFIG_VERSION_8_02, + INDEX_CONFIG_VERSION_LENGTH); + if (result != UDS_SUCCESS) { + return result; + } + + result = make_buffer(sizeof(struct uds_configuration_8_02), + &buffer); + if (result != UDS_SUCCESS) { + return result; + } + + result = encode_index_config_08_02(buffer, config); + if (result != UDS_SUCCESS) { + free_buffer(UDS_FORGET(buffer)); + return result; + } + } + + result = write_to_buffered_writer(writer, + get_buffer_contents(buffer), + content_length(buffer)); + free_buffer(UDS_FORGET(buffer)); + return result; +} + +/* Compute configuration parameters that depend on memory size. 
*/ +static int compute_memory_sizes(uds_memory_config_size_t mem_gb, + bool sparse, + unsigned int *chapters_per_volume, + unsigned int *record_pages_per_chapter, + unsigned int *sparse_chapters_per_volume) +{ + unsigned int reduced_chapters = 0; + unsigned int base_chapters; + + if (mem_gb == UDS_MEMORY_CONFIG_256MB) { + base_chapters = DEFAULT_CHAPTERS_PER_VOLUME; + *record_pages_per_chapter = SMALL_RECORD_PAGES_PER_CHAPTER; + } else if (mem_gb == UDS_MEMORY_CONFIG_512MB) { + base_chapters = DEFAULT_CHAPTERS_PER_VOLUME; + *record_pages_per_chapter = 2 * SMALL_RECORD_PAGES_PER_CHAPTER; + } else if (mem_gb == UDS_MEMORY_CONFIG_768MB) { + base_chapters = DEFAULT_CHAPTERS_PER_VOLUME; + *record_pages_per_chapter = 3 * SMALL_RECORD_PAGES_PER_CHAPTER; + } else if ((mem_gb >= 1) && (mem_gb <= UDS_MEMORY_CONFIG_MAX)) { + base_chapters = mem_gb * DEFAULT_CHAPTERS_PER_VOLUME; + *record_pages_per_chapter = DEFAULT_RECORD_PAGES_PER_CHAPTER; + } else if (mem_gb == UDS_MEMORY_CONFIG_REDUCED_256MB) { + reduced_chapters = 1; + base_chapters = DEFAULT_CHAPTERS_PER_VOLUME; + *record_pages_per_chapter = SMALL_RECORD_PAGES_PER_CHAPTER; + } else if (mem_gb == UDS_MEMORY_CONFIG_REDUCED_512MB) { + reduced_chapters = 1; + base_chapters = DEFAULT_CHAPTERS_PER_VOLUME; + *record_pages_per_chapter = 2 * SMALL_RECORD_PAGES_PER_CHAPTER; + } else if (mem_gb == UDS_MEMORY_CONFIG_REDUCED_768MB) { + reduced_chapters = 1; + base_chapters = DEFAULT_CHAPTERS_PER_VOLUME; + *record_pages_per_chapter = 3 * SMALL_RECORD_PAGES_PER_CHAPTER; + } else if ((mem_gb >= 1 + UDS_MEMORY_CONFIG_REDUCED) && + (mem_gb <= UDS_MEMORY_CONFIG_REDUCED_MAX)) { + reduced_chapters = 1; + base_chapters = ((mem_gb - UDS_MEMORY_CONFIG_REDUCED) * + DEFAULT_CHAPTERS_PER_VOLUME); + *record_pages_per_chapter = DEFAULT_RECORD_PAGES_PER_CHAPTER; + } else { + uds_log_error("received invalid memory size"); + return -EINVAL; + } + + if (sparse) { + /* Make 95% of chapters sparse, allowing 10x more records. */ + *sparse_chapters_per_volume = (19 * base_chapters) / 2; + base_chapters *= 10; + } else { + *sparse_chapters_per_volume = 0; + } + + *chapters_per_volume = base_chapters - reduced_chapters; + return UDS_SUCCESS; +} + +static unsigned int __must_check normalize_zone_count(unsigned int requested) +{ + unsigned int zone_count = requested; + + if (zone_count == 0) { + zone_count = uds_get_num_cores() / 2; + } + + if (zone_count < 1) { + zone_count = 1; + } + + if (zone_count > MAX_ZONES) { + zone_count = MAX_ZONES; + } + + uds_log_info("Using %u indexing zone%s for concurrency.", + zone_count, + zone_count == 1 ? 
"" : "s"); + return zone_count; +} + +static unsigned int __must_check normalize_read_threads(unsigned int requested) +{ + unsigned int read_threads = requested; + + if (read_threads < 1) { + read_threads = DEFAULT_VOLUME_READ_THREADS; + } + + if (read_threads > MAX_VOLUME_READ_THREADS) { + read_threads = MAX_VOLUME_READ_THREADS; + } + + return read_threads; +} + +int make_configuration(const struct uds_parameters *params, + struct configuration **config_ptr) +{ + struct configuration *config; + unsigned int chapters_per_volume = 0; + unsigned int record_pages_per_chapter = 0; + unsigned int sparse_chapters_per_volume = 0; + int result; + + result = compute_memory_sizes(params->memory_size, + params->sparse, + &chapters_per_volume, + &record_pages_per_chapter, + &sparse_chapters_per_volume); + if (result != UDS_SUCCESS) { + return result; + } + + result = UDS_ALLOCATE(1, struct configuration, __func__, &config); + if (result != UDS_SUCCESS) { + return result; + } + + result = make_geometry(DEFAULT_BYTES_PER_PAGE, + record_pages_per_chapter, + chapters_per_volume, + sparse_chapters_per_volume, + 0, + 0, + &config->geometry); + if (result != UDS_SUCCESS) { + free_configuration(config); + return result; + } + + config->zone_count = normalize_zone_count(params->zone_count); + config->read_threads = normalize_read_threads(params->read_threads); + + config->cache_chapters = DEFAULT_CACHE_CHAPTERS; + config->volume_index_mean_delta = + DEFAULT_VOLUME_INDEX_MEAN_DELTA; + config->sparse_sample_rate = + (params->sparse ? DEFAULT_SPARSE_SAMPLE_RATE : 0); + config->nonce = params->nonce; + config->name = params->name; + config->offset = params->offset; + config->size = params->size; + + *config_ptr = config; + return UDS_SUCCESS; +} + +void free_configuration(struct configuration *config) +{ + if (config != NULL) { + free_geometry(config->geometry); + UDS_FREE(config); + } +} + +void log_uds_configuration(struct configuration *config) +{ + uds_log_debug("Configuration:"); + uds_log_debug(" Record pages per chapter: %10u", + config->geometry->record_pages_per_chapter); + uds_log_debug(" Chapters per volume: %10u", + config->geometry->chapters_per_volume); + uds_log_debug(" Sparse chapters per volume: %10u", + config->geometry->sparse_chapters_per_volume); + uds_log_debug(" Cache size (chapters): %10u", + config->cache_chapters); + uds_log_debug(" Volume index mean delta: %10u", + config->volume_index_mean_delta); + uds_log_debug(" Bytes per page: %10zu", + config->geometry->bytes_per_page); + uds_log_debug(" Sparse sample rate: %10u", + config->sparse_sample_rate); + uds_log_debug(" Nonce: %llu", + (unsigned long long) config->nonce); +} diff --git a/vdo/config.h b/vdo/config.h new file mode 100644 index 00000000..6af8d990 --- /dev/null +++ b/vdo/config.h @@ -0,0 +1,120 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright Red Hat + */ + +#ifndef CONFIG_H +#define CONFIG_H + +#include "buffered-reader.h" +#include "buffered-writer.h" +#include "geometry.h" +#include "uds.h" + +enum { + DEFAULT_VOLUME_INDEX_MEAN_DELTA = 4096, + DEFAULT_CACHE_CHAPTERS = 7, + DEFAULT_SPARSE_SAMPLE_RATE = 32, + MAX_ZONES = 16, +}; + +/* A set of configuration parameters for the indexer. 
*/ +struct configuration { + /* String describing the storage device */ + const char *name; + + /* The maximum allowable size of the index */ + size_t size; + + /* The offset where the index should start */ + off_t offset; + + /* Parameters for the volume */ + + /* The volume layout */ + struct geometry *geometry; + + /* Index owner's nonce */ + uint64_t nonce; + + /* The number of threads used to process index requests */ + unsigned int zone_count; + + /* The number of threads used to read volume pages */ + unsigned int read_threads; + + /* Size of the page cache and sparse chapter index cache in chapters */ + unsigned int cache_chapters; + + /* Parameters for the volume index */ + + /* The mean delta for the volume index */ + unsigned int volume_index_mean_delta; + + /* Sampling rate for sparse indexing */ + unsigned int sparse_sample_rate; +}; + +/* On-disk structure of data for a version 8.02 index. */ +struct uds_configuration_8_02 { + /* Smaller (16), Small (64) or large (256) indices */ + unsigned int record_pages_per_chapter; + /* Total number of chapters per volume */ + unsigned int chapters_per_volume; + /* Number of sparse chapters per volume */ + unsigned int sparse_chapters_per_volume; + /* Size of the page cache, in chapters */ + unsigned int cache_chapters; + /* Unused field */ + unsigned int unused; + /* The volume index mean delta to use */ + unsigned int volume_index_mean_delta; + /* Size of a page, used for both record pages and index pages */ + unsigned int bytes_per_page; + /* Sampling rate for sparse indexing */ + unsigned int sparse_sample_rate; + /* Index owner's nonce */ + uint64_t nonce; + /* Virtual chapter remapped from physical chapter 0 */ + uint64_t remapped_virtual; + /* New physical chapter which remapped chapter was moved to */ + uint64_t remapped_physical; +}; + +/* On-disk structure of data for a version 6.02 index. 
*/ +struct uds_configuration_6_02 { + /* Smaller (16), Small (64) or large (256) indices */ + unsigned int record_pages_per_chapter; + /* Total number of chapters per volume */ + unsigned int chapters_per_volume; + /* Number of sparse chapters per volume */ + unsigned int sparse_chapters_per_volume; + /* Size of the page cache, in chapters */ + unsigned int cache_chapters; + /* Unused field */ + unsigned int unused; + /* The volume index mean delta to use */ + unsigned int volume_index_mean_delta; + /* Size of a page, used for both record pages and index pages */ + unsigned int bytes_per_page; + /* Sampling rate for sparse indexing */ + unsigned int sparse_sample_rate; + /* Index owner's nonce */ + uint64_t nonce; +}; + +int __must_check make_configuration(const struct uds_parameters *params, + struct configuration **config_ptr); + +void free_configuration(struct configuration *config); + +int __must_check validate_config_contents(struct buffered_reader *reader, + struct configuration *config); + +int __must_check write_config_contents(struct buffered_writer *writer, + struct configuration *config, + uint32_t version); + +void log_uds_configuration(struct configuration *config); + +#endif /* CONFIG_H */ diff --git a/vdo/constants.c b/vdo/constants.c index b30f6e3f..8bdfb782 100644 --- a/vdo/constants.c +++ b/vdo/constants.c @@ -1,31 +1,15 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * Copyright Red Hat - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/sulfur/src/c++/vdo/base/constants.c#3 $ */ #include "types.h" -/** The maximum logical space is 4 petabytes, which is 1 terablock. */ +/* The maximum logical space is 4 petabytes, which is 1 terablock. */ const block_count_t MAXIMUM_VDO_LOGICAL_BLOCKS = 1024ULL * 1024 * 1024 * 1024; -/** The maximum physical space is 256 terabytes, which is 64 gigablocks. */ +/* The maximum physical space is 256 terabytes, which is 64 gigablocks. */ const block_count_t MAXIMUM_VDO_PHYSICAL_BLOCKS = 1024ULL * 1024 * 1024 * 64; -// unit test minimum +/* unit test minimum */ const block_count_t MINIMUM_VDO_SLAB_JOURNAL_BLOCKS = 2; diff --git a/vdo/constants.h b/vdo/constants.h index 1d69251f..e8975d12 100644 --- a/vdo/constants.h +++ b/vdo/constants.h @@ -1,22 +1,6 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ /* * Copyright Red Hat - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. 
- * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/sulfur/src/c++/vdo/base/constants.h#5 $ */ #ifndef CONSTANTS_H @@ -27,6 +11,12 @@ #include "types.h" enum { + /** + * The maximum number of contiguous PBNs which will go to a single + * bio submission queue, assuming there is more than one queue. + **/ + VDO_BIO_ROTATION_INTERVAL_LIMIT = 1024, + /** The number of entries on a block map page */ VDO_BLOCK_MAP_ENTRIES_PER_PAGE = 812, @@ -40,6 +30,14 @@ enum { **/ VDO_BLOCK_MAP_TREE_HEIGHT = 5, + /** The default number of bio submission queues. */ + DEFAULT_VDO_BIO_SUBMIT_QUEUE_COUNT = 4, + + /** + * The number of contiguous PBNs to be submitted to a single bio queue. + **/ + DEFAULT_VDO_BIO_SUBMIT_QUEUE_ROTATE_INTERVAL = 64, + /** The number of trees in the arboreal block map */ DEFAULT_VDO_BLOCK_MAP_TREE_ROOT_COUNT = 60, @@ -75,6 +73,11 @@ enum { **/ MAXIMUM_SIMULTANEOUS_VDO_BLOCK_MAP_RESTORATION_READS = 1024, + /** + * The maximum number of total threads in a VDO thread configuration. + **/ + MAXIMUM_VDO_THREADS = 100, + /** The maximum number of VIOs in the system at once */ MAXIMUM_VDO_USER_VIOS = 2048, @@ -95,8 +98,6 @@ enum { /** The size of a sector that will not be torn */ VDO_SECTOR_SIZE = 512, - /** The number of characters needed for a vio operation description */ - VDO_VIO_OPERATION_DESCRIPTION_MAX_LENGTH = 25, /** The physical block number reserved for storing the zero block */ VDO_ZERO_BLOCK = 0, }; @@ -107,7 +108,7 @@ extern const block_count_t MAXIMUM_VDO_LOGICAL_BLOCKS; /** The maximum physical space is 256 terabytes, which is 64 gigablocks. */ extern const block_count_t MAXIMUM_VDO_PHYSICAL_BLOCKS; -// unit test minimum +/** unit test minimum */ extern const block_count_t MINIMUM_VDO_SLAB_JOURNAL_BLOCKS; -#endif // CONSTANTS_H +#endif /* CONSTANTS_H */ diff --git a/uds/cpu.h b/vdo/cpu.h similarity index 60% rename from uds/cpu.h rename to vdo/cpu.h index 6acad943..e4b39c2b 100644 --- a/uds/cpu.h +++ b/vdo/cpu.h @@ -1,29 +1,13 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ /* * Copyright Red Hat - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/uds-releases/krusty/src/uds/cpu.h#8 $ */ #ifndef CPU_H #define CPU_H #include "compiler.h" -#include "typeDefs.h" +#include "type-defs.h" /** * The number of bytes in a CPU cache line. In the future, we'll probably need @@ -32,7 +16,7 @@ * (Must be a \#define since enums are not proper compile-time constants.) **/ #ifdef __PPC__ -// N.B.: Some PPC processors have smaller cache lines. +/* N.B.: Some PPC processors have smaller cache lines. 
*/ #define CACHE_LINE_BYTES 128 #elif defined(__s390x__) #define CACHE_LINE_BYTES 256 @@ -52,11 +36,19 @@ **/ static INLINE void prefetch_address(const void *address, bool for_write) { - // for_write won't won't be a constant if we are compiled with - // optimization turned off, in which case prefetching really doesn't - // matter. + /* + * for_write won't be a constant if we are compiled with optimization + * turned off, in which case prefetching really doesn't matter. + * clang can't figure out that if for_write is a constant, it can be + * passed as the second, mandatorily constant argument to prefetch(), + * at least currently on llvm 12. + */ if (__builtin_constant_p(for_write)) { - __builtin_prefetch(address, for_write); + if (for_write) { + __builtin_prefetch(address, true); + } else { + __builtin_prefetch(address, false); + } } } @@ -72,8 +64,10 @@ static INLINE void prefetch_address(const void *address, bool for_write) static INLINE void prefetch_range(const void *start, unsigned int size, bool for_write) { - // Count the number of cache lines to fetch, allowing for the address - // range to span an extra cache line boundary due to address alignment. + /* + * Count the number of cache lines to fetch, allowing for the address + * range to span an extra cache line boundary due to address alignment. + */ const char *address = (const char *) start; unsigned int offset = ((uintptr_t) address % CACHE_LINE_BYTES); unsigned int cache_lines = (1 + ((size + offset) / CACHE_LINE_BYTES)); diff --git a/vdo/data-vio-pool.c b/vdo/data-vio-pool.c new file mode 100644 index 00000000..27c7d714 --- /dev/null +++ b/vdo/data-vio-pool.c @@ -0,0 +1,828 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright Red Hat + * + */ + +#include "data-vio-pool.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "logger.h" +#include "memory-alloc.h" +#include "permassert.h" + +#include "data-vio.h" +#include "dump.h" +#include "vdo.h" +#include "types.h" + +/** + * DOC: + * + * The data_vio_pool maintains the pool of data_vios which a vdo uses to + * service incoming bios. For correctness, and in order to avoid potentially + * expensive or blocking memory allocations during normal operation, the number + * of concurrently active data_vios is capped. Furthermore, in order to avoid + * starvation of reads and writes, at most 75% of the data_vios may be used for + * discards. The data_vio_pool is responsible for enforcing these + * limits. Threads submitting bios for which a data_vio or discard permit are + * not available will block until the necessary resources are available. The + * pool is also responsible for distributing resources to blocked threads and + * waking them. Finally, the pool attempts to batch the work of recycling + * data_vios by performing the work of actually assigning resources to blocked + * threads or placing data_vios back into the pool on a single cpu at a time. + * + * The pool contains two "limiters", one for tracking data_vios and one for + * tracking discard permits. The limiters also provide safe cross-thread access + * to pool statistics without the need to take the pool's lock. When a thread + * submits a bio to a vdo device, it will first attempt to get a discard permit + * if it is a discard, and then to get a data_vio. If the necessary resources + * are available, the incoming bio will be assigned to the acquired data_vio, + * and it will be launched. 
However, if either of these are unavailable, the + * arrival time of the bio is recorded in the bio's bi_private field, the bio + * and its submitter are both queued on the appropriate limiter and the + * submitting thread will then put itself to sleep. (note that this mechanism + * will break if jiffies are only 32 bits.) + * + * Whenever a data_vio has completed processing for the bio it was servicing, + * release_data_vio() will be called on it. This function will add the data_vio + * to a funnel queue, and then check the state of the pool. If the pool is not + * currently processing released data_vios, the pool's completion will be + * enqueued on a cpu queue. This obviates the need for the releasing threads to + * hold the pool's lock, and also batches release work while avoiding + * starvation of the cpu threads. + * + * Whenever the pool's completion is run on a cpu thread, it calls + * process_release_callback() which processes a batch of returned data_vios + * (currently at most 32) from the pool's funnel queue. For each data_vio, it + * first checks whether that data_vio was processing a discard. If so, and + * there is a blocked bio waiting for a discard permit, that permit is + * notionally transfered to the eldest discard waiter, and that waiter is moved + * to the end of the list of discard bios waiting for a data_vio. If there are + * no discard waiters, the discard permit is returned to the pool. Next, the + * data_vio is assigned to the oldest blocked bio which either has a discard + * permit, or doesn't need one and relaunched. If neither of these exist, the + * data_vio is returned to the pool. Finally, if any waiting bios were + * launched, the threads which blocked trying to submit them are awakened. + */ + +enum { + DATA_VIO_RELEASE_BATCH_SIZE = 128, +}; + +static const unsigned int VDO_SECTORS_PER_BLOCK_MASK = + VDO_SECTORS_PER_BLOCK - 1; + +struct limiter; +typedef void assigner(struct limiter *limiter); + +/* + * Bookkeeping structure for a single type of resource. + */ +struct limiter { + /* The data_vio_pool to which this limiter belongs */ + struct data_vio_pool *pool; + /* The maximum number of data_vios available */ + vio_count_t limit; + /* The number of resources in use */ + vio_count_t busy; + /* The maximum number of resources ever simultaneously in use */ + vio_count_t max_busy; + /* The number of resources to release */ + vio_count_t release_count; + /* The number of waiters to wake */ + vio_count_t wake_count; + /* + * The list of waiting bios which are known to + * process_release_callback() + */ + struct bio_list waiters; + /* + * The list of waiting bios which are not yet known to + * process_release_callback() + */ + struct bio_list new_waiters; + /* The list of waiters which have their permits */ + struct bio_list *permitted_waiters; + /* The function for assigning a resource to a waiter */ + assigner *assigner; + /* The queue of blocked threads */ + wait_queue_head_t blocked_threads; + /* The arrival time of the eldest waiter */ + uint64_t arrival; +}; + +/* + * A data_vio_pool is a collection of preallocated data_vios which may be + * acquired from any thread, and are released in batches. + */ +struct data_vio_pool { + /* Completion for scheduling releases */ + struct vdo_completion completion; + /* The administrative state of the pool */ + struct admin_state state; + /* Lock protecting the pool */ + spinlock_t lock; + /* The main limiter controlling the total data_vios in the pool. 
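A minimal sketch of how the pool described above might be constructed so that discards are capped at roughly 75% of the data_vios; the wrapper function is hypothetical, and the limit actually chosen by vdo's setup code (not part of this hunk) may be a different value:

static int example_create_pool(struct vdo *vdo, struct data_vio_pool **pool_ptr)
{
	vio_count_t pool_size = MAXIMUM_VDO_USER_VIOS;	/* 2048, from constants.h */

	/* Reserve at most three quarters of the data_vios for discards. */
	return make_data_vio_pool(vdo, pool_size, (pool_size * 3) / 4, pool_ptr);
}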
*/ + struct limiter limiter; + /* The limiter controlling data_vios for discard */ + struct limiter discard_limiter; + /* + * The list of bios which have discard permits but still need a + * data_vio + */ + struct bio_list permitted_discards; + /* The list of available data_vios */ + struct list_head available; + /* The queue of data_vios waiting to be returned to the pool */ + struct funnel_queue *queue; + /* Whether the pool is processing, or scheduled to process releases */ + atomic_t processing; + /* The data vios in the pool */ + struct data_vio data_vios[]; +}; + +/** + * as_data_vio_pool() - Convert a vdo_completion to a data_vio_pool. + * @completion: The completion to convert. + * + * Return: The completion as a data_vio_pool. + */ +static inline struct data_vio_pool * __must_check +as_data_vio_pool(struct vdo_completion *completion) +{ + vdo_assert_completion_type(completion->type, + VDO_DATA_VIO_POOL_COMPLETION); + return container_of(completion, + struct data_vio_pool, + completion); +} + +static inline uint64_t get_arrival_time(struct bio *bio) +{ + return (uint64_t) bio->bi_private; +} + +/** + * check_for_drain_complete_locked() - Check whether a data_vio_pool + * has no outstanding data_vios or + * waiters while holding the + * pool's lock. + * @pool: The pool to check. + * + * Return: true if the pool has no busy data_vios or waiters. + */ +static bool check_for_drain_complete_locked(struct data_vio_pool *pool) +{ + if (pool->limiter.busy > 0) { + return false; + } + + ASSERT_LOG_ONLY((pool->discard_limiter.busy == 0), + "no outstanding discard permits"); + + return (bio_list_empty(&pool->limiter.new_waiters) + && bio_list_empty(&pool->discard_limiter.new_waiters)); +} + +/* + * Reset a data_vio which has just been acquired from the pool. + */ +static void reset_data_vio(struct data_vio *data_vio, struct vdo *vdo) +{ + struct vio *vio = data_vio_as_vio(data_vio); + /* + * FIXME: We save the bio out of the vio so that we don't forget it. + * Maybe we should just not zero that field somehow. + */ + struct bio *bio = vio->bio; + + /* + * Zero out the fields which don't need to be preserved (i.e. which + * are not pointers to separately allocated objects). + */ + memset(data_vio, 0, offsetof(struct data_vio, dedupe_context)); + memset(&data_vio->compression, 0, offsetof(struct compression_state, + block)); + memset(&data_vio->dedupe_context.pending_list, 0, + sizeof(struct list_head)); + + initialize_vio(vio, + bio, + 1, + VIO_TYPE_DATA, + VIO_PRIORITY_DATA, + vdo); +} + +static void launch_bio(struct vdo *vdo, + struct data_vio *data_vio, + struct bio *bio) +{ + enum data_vio_operation operation = DATA_VIO_WRITE; + logical_block_number_t lbn; + reset_data_vio(data_vio, vdo); + data_vio->user_bio = bio; + data_vio->offset = to_bytes(bio->bi_iter.bi_sector + & VDO_SECTORS_PER_BLOCK_MASK); + data_vio->is_partial = ((bio->bi_iter.bi_size < VDO_BLOCK_SIZE) || + (data_vio->offset != 0)); + + /* + * Discards behave very differently than other requests when coming in + * from device-mapper. We have to be able to handle any size discards + * and various sector offsets within a block. + */ + if (bio_op(bio) == REQ_OP_DISCARD) { + data_vio->remaining_discard = bio->bi_iter.bi_size; + if (data_vio->is_partial) { + vdo_count_bios(&vdo->stats.bios_in_partial, bio); + operation = DATA_VIO_READ_MODIFY_WRITE; + } + } else if (data_vio->is_partial) { + vdo_count_bios(&vdo->stats.bios_in_partial, bio); + operation = ((bio_data_dir(bio) == READ) + ? 
DATA_VIO_READ : DATA_VIO_READ_MODIFY_WRITE); + } else if (bio_data_dir(bio) == READ) { + operation = DATA_VIO_READ; + } else { + /* + * Copy the bio data to a char array so that we can continue to + * use the data after we acknowledge the bio. + */ + vdo_bio_copy_data_in(bio, data_vio->data_block); + data_vio->is_zero_block = is_zero_block(data_vio->data_block); + } + + if (data_vio->user_bio->bi_opf & REQ_FUA) { + operation |= DATA_VIO_FUA; + } + + lbn = ((bio->bi_iter.bi_sector - vdo->starting_sector_offset) + / VDO_SECTORS_PER_BLOCK); + launch_data_vio(data_vio, lbn, operation); +} + +static void assign_data_vio(struct limiter *limiter, struct data_vio *data_vio) +{ + struct bio *bio = bio_list_pop(limiter->permitted_waiters); + + launch_bio(limiter->pool->completion.vdo, data_vio, bio); + limiter->wake_count++; + + bio = bio_list_peek(limiter->permitted_waiters); + limiter->arrival = ((bio == NULL) ? UINT64_MAX : get_arrival_time(bio)); +} + +static void assign_discard_permit(struct limiter *limiter) +{ + struct bio *bio = bio_list_pop(&limiter->waiters); + + if (limiter->arrival == UINT64_MAX) { + limiter->arrival = get_arrival_time(bio); + } + + bio_list_add(limiter->permitted_waiters, bio); +} + +static void get_waiters(struct limiter *limiter) +{ + bio_list_merge(&limiter->waiters, &limiter->new_waiters); + bio_list_init(&limiter->new_waiters); +} + +static inline +struct data_vio *get_available_data_vio(struct data_vio_pool *pool) +{ + struct data_vio *data_vio = list_first_entry(&pool->available, + struct data_vio, + pool_entry); + list_del_init(&data_vio->pool_entry); + return data_vio; +} + +static void assign_data_vio_to_waiter(struct limiter *limiter) +{ + assign_data_vio(limiter, get_available_data_vio(limiter->pool)); +} + +static void update_limiter(struct limiter *limiter) +{ + struct bio_list *waiters = &limiter->waiters; + vio_count_t available = limiter->limit - limiter->busy; + + ASSERT_LOG_ONLY((limiter->release_count <= limiter->busy), + "Release count %u is not more than busy count %u", + limiter->release_count, + limiter->busy); + + get_waiters(limiter); + for (; (limiter->release_count > 0) && !bio_list_empty(waiters); + limiter->release_count--) { + limiter->assigner(limiter); + } + + if (limiter->release_count > 0) { + WRITE_ONCE(limiter->busy, + limiter->busy - limiter->release_count); + limiter->release_count = 0; + return; + } + + for (; (available > 0) && !bio_list_empty(waiters); available--) { + limiter->assigner(limiter); + } + + WRITE_ONCE(limiter->busy, limiter->limit - available); + if (limiter->max_busy < limiter->busy) { + WRITE_ONCE(limiter->max_busy, limiter->busy); + } +} + +/** + * schedule_releases() - Ensure that release processing is scheduled. + * @pool: The data_vio_pool which has resources to release. + * + * If this call switches the state to processing, enqueue. Otherwise, some + * other thread has already done so. + */ +static void schedule_releases(struct data_vio_pool *pool) +{ + /* Pairs with the barrier in process_release_callback(). 
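Referring back to launch_bio() above, a worked example of its sector arithmetic, assuming 4096-byte blocks (8 sectors per block) and a starting_sector_offset of 0; the incoming values are invented for illustration:

/*
 *   bio->bi_iter.bi_sector = 1027
 *   offset     = to_bytes(1027 & 7) = 3 * 512 = 1536 bytes into the block
 *   is_partial = true                 (offset != 0)
 *   lbn        = (1027 - 0) / 8      = 128
 *
 * A bio starting at sector 1024 with bi_size = 4096 would instead map to
 * lbn 128 with offset 0 and is_partial = false, taking the full-block path.
 */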
*/ + smp_mb__before_atomic(); + if (atomic_cmpxchg(&pool->processing, false, true)) { + return; + } + + pool->completion.requeue = true; + vdo_invoke_completion_callback_with_priority(&pool->completion, + CPU_Q_COMPLETE_VIO_PRIORITY); +} + +static void reuse_or_release_resources(struct data_vio_pool *pool, + struct data_vio *data_vio, + struct list_head *returned) +{ + if (data_vio->remaining_discard > 0) { + if (bio_list_empty(&pool->discard_limiter.waiters)) { + /* Return the data_vio's discard permit. */ + pool->discard_limiter.release_count++; + } else { + assign_discard_permit(&pool->discard_limiter); + } + } + + if (pool->limiter.arrival < pool->discard_limiter.arrival) { + assign_data_vio(&pool->limiter, data_vio); + } else if (pool->discard_limiter.arrival < UINT64_MAX) { + assign_data_vio(&pool->discard_limiter, data_vio); + } else { + list_add(&data_vio->pool_entry, returned); + pool->limiter.release_count++; + } +} + +/** + * process_release_callback() - Process a batch of data_vio releases. + * @completion: The pool with data_vios to release. + */ +static void process_release_callback(struct vdo_completion *completion) +{ + struct data_vio_pool *pool = as_data_vio_pool(completion); + bool reschedule; + bool drained; + vio_count_t processed; + vio_count_t to_wake; + vio_count_t discards_to_wake; + LIST_HEAD(returned); + + spin_lock(&pool->lock); + get_waiters(&pool->discard_limiter); + get_waiters(&pool->limiter); + spin_unlock(&pool->lock); + + if (pool->limiter.arrival == UINT64_MAX) { + struct bio *bio = bio_list_peek(&pool->limiter.waiters); + + if (bio != NULL) { + pool->limiter.arrival = get_arrival_time(bio); + } + } + + for (processed = 0; + processed < DATA_VIO_RELEASE_BATCH_SIZE; + processed++) { + struct data_vio *data_vio; + struct funnel_queue_entry *entry + = funnel_queue_poll(pool->queue); + + if (entry == NULL) { + break; + } + + data_vio = data_vio_from_funnel_queue_entry(entry); + acknowledge_data_vio(data_vio); + reuse_or_release_resources(pool, data_vio, &returned); + } + + spin_lock(&pool->lock); + /* + * There is a race where waiters could be added while we are in the + * unlocked section above. Those waiters could not see the resources we + * are now about to release, so we assign those resources now as we + * have no guarantee of being rescheduled. This is handled in + * update_limiter(). + */ + update_limiter(&pool->discard_limiter); + list_splice(&returned, &pool->available); + update_limiter(&pool->limiter); + to_wake = pool->limiter.wake_count; + pool->limiter.wake_count = 0; + discards_to_wake = pool->discard_limiter.wake_count; + pool->discard_limiter.wake_count = 0; + + atomic_set(&pool->processing, false); + /* Pairs with the barrier in schedule_releases(). 
*/ + smp_mb(); + + reschedule = !is_funnel_queue_empty(pool->queue); + drained = (!reschedule + && vdo_is_state_draining(&pool->state) + && check_for_drain_complete_locked(pool)); + spin_unlock(&pool->lock); + + if (to_wake > 0) { + wake_up_nr(&pool->limiter.blocked_threads, to_wake); + } + + if (discards_to_wake > 0) { + wake_up_nr(&pool->discard_limiter.blocked_threads, + discards_to_wake); + } + + if (reschedule) { + schedule_releases(pool); + } else if (drained) { + vdo_finish_draining(&pool->state); + } +} + +static void initialize_limiter(struct limiter *limiter, + struct data_vio_pool *pool, + assigner *assigner, + vio_count_t limit) +{ + limiter->pool = pool; + limiter->assigner = assigner; + limiter->limit = limit; + limiter->arrival = UINT64_MAX; + init_waitqueue_head(&limiter->blocked_threads); +} + +/** + * make_data_vio_pool() - Initialize a data_vio pool. + * @vdo: The vdo to which the pool will belong. + * @pool_size: The number of data_vios in the pool. + * @discard_limit: The maximum number of data_vios which may be used for + * discards. + * @pool: A pointer to hold the newly allocated pool. + */ +int make_data_vio_pool(struct vdo *vdo, + vio_count_t pool_size, + vio_count_t discard_limit, + struct data_vio_pool **pool_ptr) +{ + int result; + struct data_vio_pool *pool; + vio_count_t i; + + result = UDS_ALLOCATE_EXTENDED(struct data_vio_pool, + pool_size, + struct data_vio, + __func__, + &pool); + if (result != UDS_SUCCESS) { + return result; + } + + ASSERT_LOG_ONLY((discard_limit <= pool_size), + "discard limit does not exceed pool size"); + initialize_limiter(&pool->discard_limiter, + pool, + assign_discard_permit, + discard_limit); + pool->discard_limiter.permitted_waiters = &pool->permitted_discards; + initialize_limiter(&pool->limiter, + pool, + assign_data_vio_to_waiter, + pool_size); + pool->limiter.permitted_waiters = &pool->limiter.waiters; + INIT_LIST_HEAD(&pool->available); + spin_lock_init(&pool->lock); + vdo_set_admin_state_code(&pool->state, + VDO_ADMIN_STATE_NORMAL_OPERATION); + vdo_initialize_completion(&pool->completion, + vdo, + VDO_DATA_VIO_POOL_COMPLETION); + vdo_prepare_completion(&pool->completion, + process_release_callback, + process_release_callback, + vdo->thread_config->cpu_thread, + NULL); + + result = make_funnel_queue(&pool->queue); + if (result != UDS_SUCCESS) { + free_data_vio_pool(UDS_FORGET(pool)); + return result; + } + + for (i = 0; i < pool_size; i++) { + struct data_vio *data_vio = &pool->data_vios[i]; + + result = initialize_data_vio(data_vio); + if (result != VDO_SUCCESS) { + free_data_vio_pool(pool); + return result; + } + + list_add(&data_vio->pool_entry, &pool->available); + } + + *pool_ptr = pool; + return VDO_SUCCESS; +} + +/** + * free_data_vio_pool() - Free a data_vio_pool and the data_vios in it. + * @pool: The pool to free (may be NULL). + * + * All data_vios must be returned to the pool before calling this function. + */ +void free_data_vio_pool(struct data_vio_pool *pool) +{ + if (pool == NULL) { + return; + } + + /* + * Pairs with the barrier in process_release_callback(). Possibly not + * needed since it caters to an enqueue vs. free race. 
+ */ + smp_mb(); + BUG_ON(atomic_read(&pool->processing)); + + spin_lock(&pool->lock); + ASSERT_LOG_ONLY((pool->limiter.busy == 0), + "data_vio pool must not have %u busy entries when being freed", + pool->limiter.busy); + ASSERT_LOG_ONLY((bio_list_empty(&pool->limiter.waiters) + && bio_list_empty(&pool->limiter.new_waiters)), + "data_vio pool must not have threads waiting to read or write when being freed"); + ASSERT_LOG_ONLY((bio_list_empty(&pool->discard_limiter.waiters) + && bio_list_empty(&pool->discard_limiter.new_waiters)), + "data_vio pool must not have threads waiting to discard when being freed"); + spin_unlock(&pool->lock); + + while (!list_empty(&pool->available)) { + struct data_vio *data_vio = list_first_entry(&pool->available, + struct data_vio, + pool_entry); + + list_del_init(pool->available.next); + destroy_data_vio(data_vio); + } + + free_funnel_queue(UDS_FORGET(pool->queue)); + UDS_FREE(pool); +} + +static bool acquire_permit(struct limiter *limiter, struct bio *bio) +{ + if (limiter->busy >= limiter->limit) { + DEFINE_WAIT(wait); + + bio_list_add(&limiter->new_waiters, bio); + prepare_to_wait_exclusive(&limiter->blocked_threads, + &wait, + TASK_UNINTERRUPTIBLE); + spin_unlock(&limiter->pool->lock); + io_schedule(); + finish_wait(&limiter->blocked_threads, &wait); + return false; + } + + WRITE_ONCE(limiter->busy, limiter->busy + 1); + if (limiter->max_busy < limiter->busy) { + WRITE_ONCE(limiter->max_busy, limiter->busy); + } + + return true; +} + +/** + * vdo_launch_bio() - Acquire a data_vio from the pool, assign the bio to it, + * and send it on its way. + * @pool: The pool from which to acquire a data_vio. + * @bio: The bio to launch. + * + * This will block if data_vios or discard permits are not available. + */ +void vdo_launch_bio(struct data_vio_pool *pool, struct bio *bio) +{ + struct data_vio *data_vio; + + ASSERT_LOG_ONLY(!vdo_is_state_quiescent(&pool->state), + "data_vio_pool not quiescent on acquire"); + + bio->bi_private = (void *) jiffies; + spin_lock(&pool->lock); + if ((bio_op(bio) == REQ_OP_DISCARD) && + !acquire_permit(&pool->discard_limiter, bio)) { + return; + } + + if (!acquire_permit(&pool->limiter, bio)) { + return; + } + + data_vio = get_available_data_vio(pool); + spin_unlock(&pool->lock); + launch_bio(pool->completion.vdo, data_vio, bio); +} + +/** + * release_data_vio() - Return a data_vio to the pool. + * @data_vio: The data_vio to return. + */ +void release_data_vio(struct data_vio *data_vio) +{ + struct data_vio_pool *pool = + vdo_from_data_vio(data_vio)->data_vio_pool; + + funnel_queue_put(pool->queue, + &(data_vio_as_completion(data_vio)->work_queue_entry_link)); + schedule_releases(pool); +} + +/** + * initiate_drain() - Initiate a drain. + * + * Implements vdo_admin_initiator. + */ +static void initiate_drain(struct admin_state *state) +{ + bool drained; + struct data_vio_pool *pool = container_of(state, + struct data_vio_pool, + state); + + spin_lock(&pool->lock); + drained = check_for_drain_complete_locked(pool); + spin_unlock(&pool->lock); + + if (drained) { + vdo_finish_draining(state); + } +} + +/** + * drain_data_vio_pool() - Wait asynchronously for all data_vios to be + * returned to the pool. + * @pool: The data_vio_pool to drain. + * @completion: The completion to notify when the pool has drained. 
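A minimal sketch of the submitter side of vdo_launch_bio(); the wrapper is hypothetical (the real device-mapper map path is not part of this hunk), but the data_vio_pool field of struct vdo is the same one release_data_vio() dereferences:

static void example_submit(struct vdo *vdo, struct bio *bio)
{
	/* May sleep in acquire_permit() until a data_vio (and, for a
	 * discard, a permit) is free, so call only from process context. */
	vdo_launch_bio(vdo->data_vio_pool, bio);
}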
+ */ +void drain_data_vio_pool(struct data_vio_pool *pool, + struct vdo_completion *completion) +{ + assert_on_vdo_cpu_thread(completion->vdo, __func__); + vdo_start_draining(&pool->state, + VDO_ADMIN_STATE_SUSPENDING, + completion, + initiate_drain); +} + +/** + * resume_data_vio_pool() - Resume a data_vio pool. + * @pool: The pool to resume. + * @completion: The completion to notify when the pool has resumed. + */ +void resume_data_vio_pool(struct data_vio_pool *pool, + struct vdo_completion *completion) +{ + assert_on_vdo_cpu_thread(completion->vdo, __func__); + vdo_finish_completion(completion, + vdo_resume_if_quiescent(&pool->state)); +} + +static void dump_limiter(const char *name, struct limiter *limiter) +{ + uds_log_info("%s: %u of %u busy (max %u), %s", + name, + limiter->busy, + limiter->limit, + limiter->max_busy, + ((bio_list_empty(&limiter->waiters) + && bio_list_empty(&limiter->new_waiters)) + ? "no waiters" + : "has waiters")); +} + +/** + * dump_data_vio_pool() - Dump a data_vio pool to the log. + * @pool: The pool to dump. + * @dump_vios: Whether to dump the details of each busy data_vio as well. + */ +void dump_data_vio_pool(struct data_vio_pool *pool, bool dump_vios) +{ + /* + * In order that syslog can empty its buffer, sleep after 35 elements + * for 4ms (till the second clock tick). These numbers were picked + * based on experiments with lab machines. + */ + enum { ELEMENTS_PER_BATCH = 35 }; + enum { SLEEP_FOR_SYSLOG = 4000 }; + + if (pool == NULL) { + return; + } + + spin_lock(&pool->lock); + dump_limiter("data_vios", &pool->limiter); + dump_limiter("discard permits", &pool->discard_limiter); + if (dump_vios) { + int i; + int dumped = 0; + + for (i = 0; i < pool->limiter.limit; i++) { + struct data_vio *data_vio = &pool->data_vios[i]; + + if (!list_empty(&data_vio->pool_entry)) { + continue; + } + + dump_data_vio(data_vio); + if (++dumped >= ELEMENTS_PER_BATCH) { + spin_unlock(&pool->lock); + dumped = 0; + fsleep(SLEEP_FOR_SYSLOG); + spin_lock(&pool->lock); + } + } + } + + spin_unlock(&pool->lock); +} + +vio_count_t get_data_vio_pool_active_discards(struct data_vio_pool *pool) +{ + return READ_ONCE(pool->discard_limiter.busy); +} + +vio_count_t get_data_vio_pool_discard_limit(struct data_vio_pool *pool) +{ + return READ_ONCE(pool->discard_limiter.limit); +} + +vio_count_t get_data_vio_pool_maximum_discards(struct data_vio_pool *pool) +{ + return READ_ONCE(pool->discard_limiter.max_busy); +} + +int set_data_vio_pool_discard_limit(struct data_vio_pool *pool, + vio_count_t limit) +{ + if (get_data_vio_pool_request_limit(pool) < limit) { + // The discard limit may not be higher than the data_vio limit. 
+ return -EINVAL; + } + + spin_lock(&pool->lock); + pool->discard_limiter.limit = limit; + spin_unlock(&pool->lock); + + return VDO_SUCCESS; +} + +vio_count_t get_data_vio_pool_active_requests(struct data_vio_pool *pool) +{ + return READ_ONCE(pool->limiter.busy); +} + +vio_count_t get_data_vio_pool_request_limit(struct data_vio_pool *pool) +{ + return READ_ONCE(pool->limiter.limit); +} + +vio_count_t get_data_vio_pool_maximum_requests(struct data_vio_pool *pool) +{ + return READ_ONCE(pool->limiter.max_busy); +} diff --git a/vdo/data-vio-pool.h b/vdo/data-vio-pool.h new file mode 100644 index 00000000..fbd047f3 --- /dev/null +++ b/vdo/data-vio-pool.h @@ -0,0 +1,43 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright Red Hat + * + */ + +#ifndef DATA_VIO_POOL_H +#define DATA_VIO_POOL_H + +#include + +#include "kernel-types.h" +#include "types.h" + +int make_data_vio_pool(struct vdo *vdo, + vio_count_t pool_size, + vio_count_t discard_limit, + struct data_vio_pool **pool_ptr); + +void free_data_vio_pool(struct data_vio_pool *pool); + +void vdo_launch_bio(struct data_vio_pool *pool, struct bio *bio); + +void release_data_vio(struct data_vio *data_vio); + +void drain_data_vio_pool(struct data_vio_pool *pool, + struct vdo_completion *completion); + +void resume_data_vio_pool(struct data_vio_pool *pool, + struct vdo_completion *completion); + +void dump_data_vio_pool(struct data_vio_pool *pool, bool dump_vios); + +vio_count_t get_data_vio_pool_active_discards(struct data_vio_pool *pool); +vio_count_t get_data_vio_pool_discard_limit(struct data_vio_pool *pool); +vio_count_t get_data_vio_pool_maximum_discards(struct data_vio_pool *pool); +int __must_check set_data_vio_pool_discard_limit(struct data_vio_pool *pool, + vio_count_t limit); +vio_count_t get_data_vio_pool_active_requests(struct data_vio_pool *pool); +vio_count_t get_data_vio_pool_request_limit(struct data_vio_pool *pool); +vio_count_t get_data_vio_pool_maximum_requests(struct data_vio_pool *pool); + +#endif // DATA_VIO_POOL_H diff --git a/vdo/data-vio.c b/vdo/data-vio.c new file mode 100644 index 00000000..3f56254f --- /dev/null +++ b/vdo/data-vio.c @@ -0,0 +1,681 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright Red Hat + */ + +#include "data-vio.h" + +#include + +#include "memory-alloc.h" +#include "permassert.h" + +#include "allocation-selector.h" +#include "bio.h" +#include "block-allocator.h" +#include "block-map.h" +#include "compressed-block.h" +#include "compression-state.h" +#include "dump.h" +#include "int-map.h" +#include "logical-zone.h" +#include "packer.h" +#include "status-codes.h" +#include "vdo.h" +#include "vdo-component.h" +#include "vdo-component-states.h" +#include "vio-read.h" +#include "vio-write.h" + +static const char *ASYNC_OPERATION_NAMES[] = { + "launch", + "acknowledge_write", + "acquire_hash_lock", + "attempt_logical_block_lock", + "lock_duplicate_pbn", + "check_for_duplication", + "cleanup", + "compress_data_vio", + "find_block_map_slot", + "get_mapped_block/for_read", + "get_mapped_block/for_dedupe", + "get_mapped_block/for_write", + "hash_data_vio", + "journal_decrement_for_dedupe", + "journal_decrement_for_write", + "journal_increment_for_compression", + "journal_increment_for_dedupe", + "journal_increment_for_write", + "journal_mapping_for_compression", + "journal_mapping_for_dedupe", + "journal_mapping_for_write", + "journal_unmapping_for_dedupe", + "journal_unmapping_for_write", + "vdo_attempt_packing", + "put_mapped_block/for_write", + "put_mapped_block/for_dedupe", + 
"read_data_vio", + "update_dedupe_index", + "verify_duplication", + "write_data_vio", +}; + +void destroy_data_vio(struct data_vio *data_vio) +{ + if (data_vio == NULL) { + return; + } + + vdo_free_bio(UDS_FORGET(data_vio_as_vio(data_vio)->bio)); + UDS_FREE(UDS_FORGET(data_vio->compression.block)); + UDS_FREE(UDS_FORGET(data_vio->data_block)); + UDS_FREE(UDS_FORGET(data_vio->scratch_block)); +} + +/** + * allocate_data_vio_components() - Allocate the components of a data_vio. + * @data_vio: The data_vio being constructed. + * + * Return: VDO_SUCCESS or an error. + */ +static int __must_check allocate_data_vio_components(struct data_vio *data_vio) +{ + struct vio *vio; + int result; + + STATIC_ASSERT(VDO_BLOCK_SIZE <= PAGE_SIZE); + result = uds_allocate_memory(VDO_BLOCK_SIZE, 0, "vio data", + &data_vio->data_block); + if (result != VDO_SUCCESS) { + return uds_log_error_strerror(result, + "data_vio data allocation failure"); + } + + vio = data_vio_as_vio(data_vio); + result = vdo_create_bio(&vio->bio); + if (result != VDO_SUCCESS) { + return uds_log_error_strerror(result, + "data_vio data bio allocation failure"); + } + + result = uds_allocate_memory(VDO_BLOCK_SIZE, + 0, + "compressed block", + &data_vio->compression.block); + if (result != VDO_SUCCESS) { + return uds_log_error_strerror(result, + "data_vio compressed block allocation failure"); + } + + result = uds_allocate_memory(VDO_BLOCK_SIZE, 0, "vio scratch", + &data_vio->scratch_block); + if (result != VDO_SUCCESS) { + return uds_log_error_strerror(result, + "data_vio scratch allocation failure"); + } + + return VDO_SUCCESS; +} + +int initialize_data_vio(struct data_vio *data_vio) +{ + int result = allocate_data_vio_components(data_vio); + + if (result != VDO_SUCCESS) { + destroy_data_vio(data_vio); + } + + return result; +} + +/** + * initialize_lbn_lock() - Initialize the LBN lock of a data_vio. + * @data_vio: The data_vio to initialize. + * @lbn: The lbn on which the data_vio will operate. + * + * In addition to recording the LBN on which the data_vio will operate, it + * will also find the logical zone associated with the LBN. + */ +static void initialize_lbn_lock(struct data_vio *data_vio, + logical_block_number_t lbn) +{ + struct vdo *vdo = vdo_from_data_vio(data_vio); + zone_count_t zone_number; + struct lbn_lock *lock = &data_vio->logical; + + lock->lbn = lbn; + lock->locked = false; + initialize_wait_queue(&lock->waiters); + zone_number = vdo_compute_logical_zone(data_vio); + lock->zone = &vdo->logical_zones->zones[zone_number]; +} + +void attempt_logical_block_lock(struct vdo_completion *completion); + +/** + * launch_data_vio() - (Re)initialize a data_vio to have a new logical + * block number, keeping the same parent and other + * state and send it on its way. + * @data_vio: The data_vio to initialize. + * @lbn: The logical block number of the data_vio. + * @operation: The operation this data_vio will perform. + */ +void launch_data_vio(struct data_vio *data_vio, + logical_block_number_t lbn, + enum data_vio_operation operation) +{ + struct vio *vio = data_vio_as_vio(data_vio); + struct vdo_completion *completion = vio_as_completion(vio); + + /* + * Clearing the tree lock must happen before initializing the LBN lock, + * which also adds information to the tree lock. 
+ */ + memset(&data_vio->tree_lock, 0, sizeof(data_vio->tree_lock)); + initialize_lbn_lock(data_vio, lbn); + INIT_LIST_HEAD(&data_vio->hash_lock_entry); + INIT_LIST_HEAD(&data_vio->write_entry); + + memset(&data_vio->allocation, 0, sizeof(data_vio->allocation)); + + data_vio->is_duplicate = false; + + memset(&data_vio->chunk_name, 0, sizeof(data_vio->chunk_name)); + memset(&data_vio->duplicate, 0, sizeof(data_vio->duplicate)); + + data_vio->io_operation = operation; + data_vio->mapped.state = VDO_MAPPING_STATE_UNCOMPRESSED; + if (data_vio->is_partial || (data_vio->remaining_discard == 0)) { + /* This is either a write or a partial block discard */ + data_vio->new_mapped.state = VDO_MAPPING_STATE_UNCOMPRESSED; + } else { + /* This is a full block discard */ + data_vio->new_mapped.state = VDO_MAPPING_STATE_UNMAPPED; + } + + vdo_reset_completion(completion); + set_data_vio_logical_callback(data_vio, attempt_logical_block_lock); + vdo_invoke_completion_callback_with_priority(completion, + VDO_DEFAULT_Q_MAP_BIO_PRIORITY); +} + +static void update_data_vio_error_stats(struct data_vio *data_vio) +{ + static const char *operations[] = { + [DATA_VIO_UNSPECIFIED_OPERATION] = "empty", + [DATA_VIO_READ] = "read", + [DATA_VIO_WRITE] = "write", + [DATA_VIO_READ_MODIFY_WRITE] = "read-modify-write", + [DATA_VIO_READ | DATA_VIO_FUA] = "read+fua", + [DATA_VIO_WRITE | DATA_VIO_FUA] = "write+fua", + [DATA_VIO_READ_MODIFY_WRITE | DATA_VIO_FUA] = + "read-modify-write+fua", + }; + + update_vio_error_stats(data_vio_as_vio(data_vio), + "Completing %s vio for LBN %llu with error after %s", + operations[data_vio->io_operation], + (unsigned long long) data_vio->logical.lbn, + get_data_vio_operation_name(data_vio)); +} + +/** + * complete_data_vio() - Complete the processing of a data_vio. + * @completion: The completion of the vio to complete. + */ +void complete_data_vio(struct vdo_completion *completion) +{ + struct data_vio *data_vio = as_data_vio(completion); + + completion->error_handler = NULL; + if (completion->result != VDO_SUCCESS) { + update_data_vio_error_stats(data_vio); + } + + data_vio->last_async_operation = VIO_ASYNC_OP_CLEANUP; + if (is_read_data_vio(data_vio)) { + cleanup_read_data_vio(data_vio); + } else { + cleanup_write_data_vio(data_vio); + } +} + +/** + * finish_data_vio() - Finish processing a data_vio. + * @data_vio: The data_vio. + * @result: The result of processing the data_vio. + * + * This function will set any error, and then initiate data_vio clean up. + */ +void finish_data_vio(struct data_vio *data_vio, int result) +{ + struct vdo_completion *completion = data_vio_as_completion(data_vio); + + vdo_set_completion_result(completion, result); + complete_data_vio(completion); +} + +/** + * get_data_vio_operation_name() - Get the name of the last asynchronous + * operation performed on a data_vio. + * @data_vio: The data_vio in question. + * + * Return: The name of the last operation performed on the data_vio. + */ +const char *get_data_vio_operation_name(struct data_vio *data_vio) +{ + STATIC_ASSERT((MAX_VIO_ASYNC_OPERATION_NUMBER - + MIN_VIO_ASYNC_OPERATION_NUMBER) == + ARRAY_SIZE(ASYNC_OPERATION_NAMES)); + + return ((data_vio->last_async_operation < + MAX_VIO_ASYNC_OPERATION_NUMBER) ? + ASYNC_OPERATION_NAMES[data_vio->last_async_operation] : + "unknown async operation"); +} + +/** + * set_data_vio_duplicate_location() - Set the location of the duplicate block + * for a data_vio, updating the + * is_duplicate and duplicate fields from + * a zoned_pbn. 
+ * @data_vio: The data_vio to modify. + * @source: The location of the duplicate. + */ +void set_data_vio_duplicate_location(struct data_vio *data_vio, + const struct zoned_pbn source) +{ + data_vio->is_duplicate = (source.pbn != VDO_ZERO_BLOCK); + data_vio->duplicate = source; +} + +/** + * clear_data_vio_mapped_location() - Clear a data_vio's mapped block + * location, setting it to be unmapped. + * @data_vio: The data_vio whose mapped block location is to be reset. + * + * This indicates the block map entry for the logical block is either unmapped + * or corrupted. + */ +void clear_data_vio_mapped_location(struct data_vio *data_vio) +{ + data_vio->mapped = (struct zoned_pbn){ + .state = VDO_MAPPING_STATE_UNMAPPED, + }; +} + +/** + * set_data_vio_mapped_location() - Set a data_vio's mapped field to the + * physical location recorded in the block + * map for the logical block in the vio. + * @data_vio: The data_vio whose field is to be set. + * @pbn: The physical block number to set. + * @state: The mapping state to set. + * + * Return: VDO_SUCCESS or an error code if the mapping is unusable. + */ +int set_data_vio_mapped_location(struct data_vio *data_vio, + physical_block_number_t pbn, + enum block_mapping_state state) +{ + struct physical_zone *zone; + int result = vdo_get_physical_zone(vdo_from_data_vio(data_vio), + pbn, &zone); + if (result != VDO_SUCCESS) { + return result; + } + + data_vio->mapped = (struct zoned_pbn){ + .pbn = pbn, + .state = state, + .zone = zone, + }; + return VDO_SUCCESS; +} + +/** + * launch_locked_request() - Launch a request which has acquired an LBN lock. + * @data_vio: The data_vio which has just acquired a lock. + */ +static void launch_locked_request(struct data_vio *data_vio) +{ + data_vio->logical.locked = true; + + if (is_write_data_vio(data_vio)) { + launch_write_data_vio(data_vio); + } else { + launch_read_data_vio(data_vio); + } +} + +/** + * attempt_logical_block_lock() - Attempt to acquire the lock on a logical + * block. + * @completion: The data_vio for an external data request as a completion. + * + * This is the start of the path for all external requests. It is registered + * in launch_data_vio(). + */ +void attempt_logical_block_lock(struct vdo_completion *completion) +{ + struct data_vio *data_vio = as_data_vio(completion); + struct lbn_lock *lock = &data_vio->logical; + struct vdo *vdo = vdo_from_data_vio(data_vio); + struct data_vio *lock_holder; + int result; + + assert_data_vio_in_logical_zone(data_vio); + + if (data_vio->logical.lbn >= vdo->states.vdo.config.logical_blocks) { + finish_data_vio(data_vio, VDO_OUT_OF_RANGE); + return; + } + + result = int_map_put(lock->zone->lbn_operations, + lock->lbn, + data_vio, + false, + (void **) &lock_holder); + if (result != VDO_SUCCESS) { + finish_data_vio(data_vio, result); + return; + } + + if (lock_holder == NULL) { + /* We got the lock */ + launch_locked_request(data_vio); + return; + } + + result = ASSERT(lock_holder->logical.locked, + "logical block lock held"); + if (result != VDO_SUCCESS) { + finish_data_vio(data_vio, result); + return; + } + + /* + * If the new request is a pure read request (not read-modify-write) + * and the lock_holder is writing and has received an allocation + * (VDO-2683), service the read request immediately by copying data + * from the lock_holder to avoid having to flush the write out of the + * packer just to prevent the read from waiting indefinitely. 
If the + * lock_holder does not yet have an allocation, prevent it from + * blocking in the packer and wait on it. + */ + if (is_read_data_vio(data_vio) && + READ_ONCE(lock_holder->allocation_succeeded)) { + vdo_bio_copy_data_out(data_vio->user_bio, + (lock_holder->data_block + + data_vio->offset)); + acknowledge_data_vio(data_vio); + complete_data_vio(completion); + return; + } + + data_vio->last_async_operation = VIO_ASYNC_OP_ATTEMPT_LOGICAL_BLOCK_LOCK; + result = enqueue_data_vio(&lock_holder->logical.waiters, + data_vio); + if (result != VDO_SUCCESS) { + finish_data_vio(data_vio, result); + return; + } + + /* + * Prevent writes and read-modify-writes from blocking indefinitely on + * lock holders in the packer. + */ + if (!is_read_data_vio(lock_holder) && + cancel_vio_compression(lock_holder)) { + data_vio->compression.lock_holder = lock_holder; + launch_data_vio_packer_callback(data_vio, + vdo_remove_lock_holder_from_packer); + } +} + +/** + * release_lock() - Release an uncontended LBN lock. + * @data_vio: The data_vio holding the lock. + */ +static void release_lock(struct data_vio *data_vio) +{ + struct lbn_lock *lock = &data_vio->logical; + struct int_map *lock_map = lock->zone->lbn_operations; + struct data_vio *lock_holder; + + if (!lock->locked) { + /* + * The lock is not locked, so it had better not be registered + * in the lock map. + */ + struct data_vio *lock_holder = int_map_get(lock_map, lock->lbn); + + ASSERT_LOG_ONLY((data_vio != lock_holder), + "no logical block lock held for block %llu", + (unsigned long long) lock->lbn); + return; + } + + /* Remove the lock from the logical block lock map, releasing the lock. */ + lock_holder = int_map_remove(lock_map, lock->lbn); + ASSERT_LOG_ONLY((data_vio == lock_holder), + "logical block lock mismatch for block %llu", + (unsigned long long) lock->lbn); + lock->locked = false; + return; +} + +/** + * vdo_release_logical_block_lock() - Release the lock on the logical block, + * if any, that a data_vio has acquired. + * @data_vio: The data_vio releasing its logical block lock. + */ +void vdo_release_logical_block_lock(struct data_vio *data_vio) +{ + struct data_vio *lock_holder, *next_lock_holder; + struct lbn_lock *lock = &data_vio->logical; + int result; + + assert_data_vio_in_logical_zone(data_vio); + if (!has_waiters(&data_vio->logical.waiters)) { + release_lock(data_vio); + return; + } + + ASSERT_LOG_ONLY(lock->locked, "lbn_lock with waiters is not locked"); + + /* + * Another data_vio is waiting for the lock, so just transfer it in a + * single lock map operation + */ + next_lock_holder = + waiter_as_data_vio(dequeue_next_waiter(&lock->waiters)); + + /* Transfer the remaining lock waiters to the next lock holder. */ + transfer_all_waiters(&lock->waiters, + &next_lock_holder->logical.waiters); + + result = int_map_put(lock->zone->lbn_operations, + lock->lbn, + next_lock_holder, + true, + (void **) &lock_holder); + if (result != VDO_SUCCESS) { + finish_data_vio(next_lock_holder, result); + return; + } + + ASSERT_LOG_ONLY((lock_holder == data_vio), + "logical block lock mismatch for block %llu", + (unsigned long long) lock->lbn); + lock->locked = false; + + /* + * If there are still waiters, other data_vios must be trying to get + * the lock we just transferred. We must ensure that the new lock + * holder doesn't block in the packer. + */ + if (has_waiters(&next_lock_holder->logical.waiters)) { + cancel_vio_compression(next_lock_holder); + } + + /* + * Avoid stack overflow on lock transfer. 
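The outcome of the int_map_put() call in attempt_logical_block_lock() above can be summarized by this illustrative fragment; it reuses the patch's names but is not part of it. With the update flag false, an existing entry is returned through the old-value pointer instead of being replaced:

struct data_vio *holder = NULL;
int result = int_map_put(lock->zone->lbn_operations, lock->lbn, data_vio,
			 false, (void **) &holder);

if (result != VDO_SUCCESS) {
	finish_data_vio(data_vio, result);	/* map failure */
} else if (holder == NULL) {
	launch_locked_request(data_vio);	/* this data_vio now holds the LBN lock */
} else {
	enqueue_data_vio(&holder->logical.waiters, data_vio);	/* wait for the holder */
}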
+ * XXX: this is only an issue in the 1 thread config. + */ + data_vio_as_completion(next_lock_holder)->requeue = true; + launch_locked_request(next_lock_holder); +} + +/** + * data_vio_allocate_data_block() - Allocate a data block. + * + * @data_vio: The data_vio which needs an allocation. + * @write_lock_type: The type of write lock to obtain on the block. + * @callback: The callback which will attempt an allocation in the current + * zone and continue if it succeeds. + * @error_handler: The handler for errors while allocating. + */ +void data_vio_allocate_data_block(struct data_vio *data_vio, + enum pbn_lock_type write_lock_type, + vdo_action *callback, + vdo_action *error_handler) +{ + struct vdo *vdo = vdo_from_data_vio(data_vio); + struct allocation *allocation = &data_vio->allocation; + struct allocation_selector *selector = + data_vio->logical.zone->selector; + + ASSERT_LOG_ONLY((allocation->pbn == VDO_ZERO_BLOCK), + "data_vio does not have an allocation"); + allocation->write_lock_type = write_lock_type; + allocation->first_allocation_zone = + vdo_get_next_allocation_zone(selector); + allocation->zone = + &vdo->physical_zones->zones[allocation->first_allocation_zone]; + + data_vio_as_completion(data_vio)->error_handler = error_handler; + launch_data_vio_allocated_zone_callback(data_vio, callback); +} + +void release_data_vio_allocation_lock(struct data_vio *data_vio, bool reset) +{ + struct allocation *allocation = &data_vio->allocation; + physical_block_number_t locked_pbn = allocation->pbn; + + assert_data_vio_in_allocated_zone(data_vio); + + if (reset || + vdo_pbn_lock_has_provisional_reference(allocation->lock)) { + allocation->pbn = VDO_ZERO_BLOCK; + } + + vdo_release_physical_zone_pbn_lock(allocation->zone, + locked_pbn, + UDS_FORGET(allocation->lock)); +} + +void acknowledge_data_vio(struct data_vio *data_vio) +{ + struct vdo *vdo = vdo_from_data_vio(data_vio); + struct bio *bio = data_vio->user_bio; + int error = vdo_map_to_system_error(data_vio_as_completion(data_vio)->result); + + if (bio == NULL) { + return; + } + + ASSERT_LOG_ONLY((data_vio->remaining_discard <= + (uint32_t) (VDO_BLOCK_SIZE - data_vio->offset)), + "data_vio to acknowledge is not an incomplete discard"); + + data_vio->user_bio = NULL; + vdo_count_bios(&vdo->stats.bios_acknowledged, bio); + if (data_vio->is_partial) { + vdo_count_bios(&vdo->stats.bios_acknowledged_partial, bio); + } + + + vdo_complete_bio(bio, error); +} + +/** + * compress_data_vio() - A function to compress the data in a data_vio. + * @data_vio: The data_vio to compress. + */ +void compress_data_vio(struct data_vio *data_vio) +{ + int size; + char *context = get_work_queue_private_data(); + + /* + * By putting the compressed data at the start of the compressed + * block data field, we won't need to copy it if this data_vio + * becomes a compressed write agent. + */ + size = LZ4_compress_default(data_vio->data_block, + data_vio->compression.block->data, + VDO_BLOCK_SIZE, + VDO_MAX_COMPRESSED_FRAGMENT_SIZE, + context); + if (size > 0) { + data_vio->compression.size = size; + } else { + /* + * Use block size plus one as an indicator for uncompressible + * data. + */ + data_vio->compression.size = VDO_BLOCK_SIZE + 1; + } +} + +/** + * uncompress_data_vio() - A function to uncompress the data a data_vio has + * just read. + * @data_vio: The data_vio to uncompress. + * @mapping_state: The mapping state indicating which fragment to decompress. + * @buffer: The buffer to receive the uncompressed data. 
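+ *
+ * Return: VDO_SUCCESS, or VDO_INVALID_FRAGMENT (or another error code) if
+ *         the fragment could not be located or decompressed.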
+ */ +int uncompress_data_vio(struct data_vio *data_vio, + enum block_mapping_state mapping_state, + char *buffer) +{ + int size; + uint16_t fragment_offset, fragment_size; + struct compressed_block *block = data_vio->compression.block; + int result = vdo_get_compressed_block_fragment(mapping_state, + block, + &fragment_offset, + &fragment_size); + + if (result != VDO_SUCCESS) { + uds_log_debug("%s: compressed fragment error %d", + __func__, + result); + return result; + } + + size = LZ4_decompress_safe((block->data + fragment_offset), + buffer, + fragment_size, + VDO_BLOCK_SIZE); + if (size != VDO_BLOCK_SIZE) { + uds_log_debug("%s: lz4 error", __func__); + return VDO_INVALID_FRAGMENT; + } + + return VDO_SUCCESS; +} + +/* Return true if a data block contains all zeros. */ +bool is_zero_block(char *block) +{ + int i; + + + for (i = 0; i < VDO_BLOCK_SIZE; i += sizeof(uint64_t)) { + if (*((uint64_t *) &block[i])) + return false; + } + return true; +} diff --git a/vdo/data-vio.h b/vdo/data-vio.h new file mode 100644 index 00000000..68996b79 --- /dev/null +++ b/vdo/data-vio.h @@ -0,0 +1,1162 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright Red Hat + */ + +#ifndef DATA_VIO_H +#define DATA_VIO_H + +#include +#include +#include + +#include "permassert.h" +#include "uds.h" + +#include "block-mapping-state.h" +#include "completion.h" +#include "compressed-block.h" +#include "constants.h" +#include "hash-zone.h" +#include "journal-point.h" +#include "logical-zone.h" +#include "physical-zone.h" +#include "reference-operation.h" +#include "thread-config.h" +#include "types.h" +#include "vdo.h" +#include "vdo-page-cache.h" +#include "vio.h" +#include "wait-queue.h" + +/* + * Codes for describing the last asynchronous operation performed on a vio. + */ +enum async_operation_number { + MIN_VIO_ASYNC_OPERATION_NUMBER, + VIO_ASYNC_OP_LAUNCH = MIN_VIO_ASYNC_OPERATION_NUMBER, + VIO_ASYNC_OP_ACKNOWLEDGE_WRITE, + VIO_ASYNC_OP_ACQUIRE_VDO_HASH_LOCK, + VIO_ASYNC_OP_ATTEMPT_LOGICAL_BLOCK_LOCK, + VIO_ASYNC_OP_LOCK_DUPLICATE_PBN, + VIO_ASYNC_OP_CHECK_FOR_DUPLICATION, + VIO_ASYNC_OP_CLEANUP, + VIO_ASYNC_OP_COMPRESS_DATA_VIO, + VIO_ASYNC_OP_FIND_BLOCK_MAP_SLOT, + VIO_ASYNC_OP_GET_MAPPED_BLOCK_FOR_READ, + VIO_ASYNC_OP_GET_MAPPED_BLOCK_FOR_DEDUPE, + VIO_ASYNC_OP_GET_MAPPED_BLOCK_FOR_WRITE, + VIO_ASYNC_OP_HASH_DATA_VIO, + VIO_ASYNC_OP_JOURNAL_DECREMENT_FOR_DEDUPE, + VIO_ASYNC_OP_JOURNAL_DECREMENT_FOR_WRITE, + VIO_ASYNC_OP_JOURNAL_INCREMENT_FOR_COMPRESSION, + VIO_ASYNC_OP_JOURNAL_INCREMENT_FOR_DEDUPE, + VIO_ASYNC_OP_JOURNAL_INCREMENT_FOR_WRITE, + VIO_ASYNC_OP_JOURNAL_MAPPING_FOR_COMPRESSION, + VIO_ASYNC_OP_JOURNAL_MAPPING_FOR_DEDUPE, + VIO_ASYNC_OP_JOURNAL_MAPPING_FOR_WRITE, + VIO_ASYNC_OP_JOURNAL_UNMAPPING_FOR_DEDUPE, + VIO_ASYNC_OP_JOURNAL_UNMAPPING_FOR_WRITE, + VIO_ASYNC_OP_ATTEMPT_PACKING, + VIO_ASYNC_OP_PUT_MAPPED_BLOCK_FOR_WRITE, + VIO_ASYNC_OP_PUT_MAPPED_BLOCK_FOR_DEDUPE, + VIO_ASYNC_OP_READ_DATA_VIO, + VIO_ASYNC_OP_UPDATE_DEDUPE_INDEX, + VIO_ASYNC_OP_VERIFY_DUPLICATION, + VIO_ASYNC_OP_WRITE_DATA_VIO, + MAX_VIO_ASYNC_OPERATION_NUMBER, +} __packed; + +/* + * An LBN lock. + */ +struct lbn_lock { + /* The LBN being locked */ + logical_block_number_t lbn; + /* Whether the lock is locked */ + bool locked; + /* The queue of waiters for the lock */ + struct wait_queue waiters; + /* The logical zone of the LBN */ + struct logical_zone *zone; +}; + +/* + * A position in the arboreal block map at a specific level. 
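+ * It holds the page index and the block map slot for that level.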
+ */ +struct block_map_tree_slot { + page_number_t page_index; + struct block_map_slot block_map_slot; +}; + +/* + * Fields for using the arboreal block map. + */ +struct tree_lock { + /* The current height at which this data_vio is operating */ + height_t height; + /* The block map tree for this LBN */ + root_count_t root_index; + /* Whether we hold a page lock */ + bool locked; + /* The thread on which to run the callback */ + thread_id_t thread_id; + /* The function to call after looking up a block map slot */ + vdo_action *callback; + /* The key for the lock map */ + uint64_t key; + /* + * The queue of waiters for the page this vio is allocating or loading + */ + struct wait_queue waiters; + /* The block map tree slots for this LBN */ + struct block_map_tree_slot tree_slots[VDO_BLOCK_MAP_TREE_HEIGHT + 1]; +}; + +struct compression_state { + /* + * The current compression state of this vio. This field contains a + * value which consists of a vio_compression_state possibly ORed with a + * flag indicating that a request has been made to cancel (or prevent) + * compression for this vio. + * + * This field should be accessed through the + * get_vio_compression_state() and set_vio_compression_state() methods. + * It should not be accessed directly. + */ + atomic_t state; + + /* The compressed size of this block */ + uint16_t size; + + /* + * The packer input or output bin slot which holds the enclosing + * data_vio + */ + slot_number_t slot; + + /* The packer bin to which the enclosing data_vio has been assigned */ + struct packer_bin *bin; + + /* A link in the chain of data_vios which have been packed together */ + struct data_vio *next_in_batch; + + /* + * A vio which is blocked in the packer while holding a lock this vio + * needs. + */ + struct data_vio *lock_holder; + + /* + * The compressed block used to hold the compressed form of this block + * and that of any other blocks for which this data_vio is the + * compressed write agent. + */ + struct compressed_block *block; +}; + +/* Fields supporting allocation of data blocks. */ +struct allocation { + /* The physical zone in which to allocate a physical block */ + struct physical_zone *zone; + + /* The block allocated to this vio */ + physical_block_number_t pbn; + + /* + * If non-NULL, the pooled PBN lock held on the allocated block. Must + * be a write lock until the block has been written, after which it + * will become a read lock. + */ + struct pbn_lock *lock; + + /* The type of write lock to obtain on the allocated block */ + enum pbn_lock_type write_lock_type; + + /* The zone which was the start of the current allocation cycle */ + zone_count_t first_allocation_zone; + + /* Whether this vio should wait for a clean slab */ + bool wait_for_clean_slab; +}; + +/* Dedupe support */ +struct dedupe_context { + struct uds_request uds_request; + struct list_head pending_list; + uint64_t submission_jiffies; + atomic_t request_state; + int status; + bool is_pending; +}; + +/* + * A vio for processing user data requests. 
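+ * Each data_vio services a single logical block of user I/O; it is the unit
+ * on which most VDO operations work.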
+ */ +struct data_vio { + /* The underlying struct vio */ + struct vio vio; + + /* The wait_queue entry structure */ + struct waiter waiter; + + /* The logical block of this request */ + struct lbn_lock logical; + + /* The state for traversing the block map tree */ + struct tree_lock tree_lock; + + /* The current partition address of this block */ + struct zoned_pbn mapped; + + /* The hash of this vio (if not zero) */ + struct uds_chunk_name chunk_name; + + /* Used for logging and debugging */ + enum async_operation_number last_async_operation; + + /* The operation to record in the recovery and slab journals */ + struct reference_operation operation; + + /* The type of request this data_vio is servicing */ + enum data_vio_operation io_operation; + + /* Whether this vio contains all zeros */ + bool is_zero_block; + + /* Whether this vio write is a duplicate */ + bool is_duplicate; + + /* Data block allocation */ + struct allocation allocation; + + /* + * Whether this vio has received an allocation. This field is examined + * from threads not in the allocation zone. + */ + bool allocation_succeeded; + + /* + * The new partition address of this block after the vio write + * completes + */ + struct zoned_pbn new_mapped; + + /* + * The hash zone responsible for the chunk name (NULL if is_zero_block) + */ + struct hash_zone *hash_zone; + + /* + * The lock this vio holds or shares with other vios with the same data + */ + struct hash_lock *hash_lock; + + /* + * All data_vios sharing a hash lock are kept in a list linking these + * list entries + */ + struct list_head hash_lock_entry; + + /* + * The block number in the partition of the UDS deduplication advice + */ + struct zoned_pbn duplicate; + + /* + * The sequence number of the recovery journal block containing the + * increment entry for this vio. + */ + sequence_number_t recovery_sequence_number; + + /* + * The point in the recovery journal where this write last made an + * entry + */ + struct journal_point recovery_journal_point; + + /* The list of vios in user initiated write requests */ + struct list_head write_entry; + + /* + * A flag indicating that a data write vio has a flush generation lock + */ + bool has_flush_generation_lock; + + /* The generation number of the VDO that this vio belongs to */ + sequence_number_t flush_generation; + + /* The completion to use for fetching block map pages for this vio */ + struct vdo_page_completion page_completion; + + /* The user bio that initiated this VIO */ + struct bio *user_bio; + + /* partial block support */ + block_size_t offset; + bool is_partial; + + /* + * The number of bytes to be discarded. For discards, this field will + * always be positive, whereas for non-discards it will always be 0. + * Hence it can be used to determine whether a data_vio is processing + * a discard, even after the user_bio has been acknowledged. + */ + uint32_t remaining_discard; + + /* + * Fields beyond this point will not be reset when a pooled data_vio + * is reused. + */ + + /* Dedupe */ + struct dedupe_context dedupe_context; + + /* All of the fields necessary for the compression path */ + struct compression_state compression; + + /* + * A copy of user data written, so we can do additional processing + * (dedupe, compression) after acknowledging the I/O operation and + * thus losing access to the original data. + * + * Also used as buffer space for read-modify-write cycles when + * emulating smaller-than-blockSize I/O operations. 
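+	 *
+	 * The buffer is always a full VDO_BLOCK_SIZE block, even for partial
+	 * I/O.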
+ */ + char *data_block; + + /* A block used as output during compression or uncompression */ + char *scratch_block; + + /* The data_vio pool list entry */ + struct list_head pool_entry; +}; + +/** + * vio_as_data_vio() - Convert a vio to a data_vio. + * @vio: The vio to convert. + * + * Return: The vio as a data_vio. + */ +static inline struct data_vio *vio_as_data_vio(struct vio *vio) +{ + ASSERT_LOG_ONLY((vio->type == VIO_TYPE_DATA), "vio is a data_vio"); + return container_of(vio, struct data_vio, vio); +} + +/** + * data_vio_as_vio() - Convert a data_vio to a vio. + * @data_vio: The data_vio to convert. + * + * Return: The data_vio as a vio. + */ +static inline struct vio *data_vio_as_vio(struct data_vio *data_vio) +{ + return &data_vio->vio; +} + +/** + * as_data_vio() - Convert a generic vdo_completion to a data_vio. + * @completion: The completion to convert. + * + * Return: The completion as a data_vio. + */ +static inline struct data_vio *as_data_vio(struct vdo_completion *completion) +{ + return vio_as_data_vio(as_vio(completion)); +} + +/** + * data_vio_as_completion() - Convert a data_vio to a generic completion. + * @data_vio: The data_vio to convert. + * + * Return: The data_vio as a completion. + */ +static inline struct vdo_completion * +data_vio_as_completion(struct data_vio *data_vio) +{ + return vio_as_completion(data_vio_as_vio(data_vio)); +} + +static inline struct data_vio * +data_vio_from_funnel_queue_entry(struct funnel_queue_entry *entry) +{ + return as_data_vio(container_of(entry, + struct vdo_completion, + work_queue_entry_link)); +} + +/** + * data_vio_as_waiter() - Convert a data_vio to a generic wait queue entry. + * @data_vio: The data_vio to convert. + * + * Return: The data_vio as a wait queue entry. + */ +static inline struct waiter *data_vio_as_waiter(struct data_vio *data_vio) +{ + return &data_vio->waiter; +} + +/** + * waiter_as_data_vio() - Convert a data_vio's generic wait queue entry back + * to the data_vio. + * @waiter: The wait queue entry to convert. + * + * Return: The wait queue entry as a data_vio. + */ +static inline struct data_vio *waiter_as_data_vio(struct waiter *waiter) +{ + if (waiter == NULL) { + return NULL; + } + + return container_of(waiter, struct data_vio, waiter); +} + +/** + * is_read_data_vio() - Check whether a data_vio is a read. + * @data_vio: The data_vio to check. + */ +static inline bool is_read_data_vio(const struct data_vio *data_vio) +{ + return ((data_vio->io_operation & DATA_VIO_READ_WRITE_MASK) == + DATA_VIO_READ); +} + +/** + * is_write_data_vio() - Check whether a data_vio is a write. + * @data_vio: The data_vio to check. + */ +static inline bool is_write_data_vio(const struct data_vio *data_vio) +{ + return ((data_vio->io_operation & DATA_VIO_READ_WRITE_MASK) == + DATA_VIO_WRITE); +} + +/** + * is_read_modify_write_data_vio() - Check whether a data_vio is a + * read-modify-write. + * @data_vio: The data_vio. + * + * Return: true if the vio is a read-modify-write. + */ +static inline bool +is_read_modify_write_data_vio(const struct data_vio *data_vio) +{ + return ((data_vio->io_operation & DATA_VIO_READ_WRITE_MASK) == + DATA_VIO_READ_MODIFY_WRITE); +} + +/** + * is_trim_data_vio() - Check whether a data_vio is a trim. + * @data_vio: The data_vio to check. + * + * Return: true if the data_vio is a trim. 
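+ *
+ * A trim is recognized by its new mapping still being in the unmapped state.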
+ */ +static inline bool is_trim_data_vio(struct data_vio *data_vio) +{ + return (data_vio->new_mapped.state == VDO_MAPPING_STATE_UNMAPPED); +} + +/** + * data_vio_requires_fua() - Check whether a data_vio requires a FUA after + * doing its I/O. + * @data_vio: The data_vio. + * + * Return: true if the data_vio requires a FUA. + */ +static inline bool data_vio_requires_fua(const struct data_vio *data_vio) +{ + return ((data_vio->io_operation & DATA_VIO_FUA) == DATA_VIO_FUA); +} + +/** + * get_data_vio_new_advice() - Get the location that should be passed to UDS + * as the new advice for to find the data written + * by this data_vio. + * @data_vio: The write data_vio that is ready to update UDS. + * + * Return: a data_location containing the advice to store in UDS. + */ +static inline struct data_location +get_data_vio_new_advice(const struct data_vio *data_vio) +{ + return (struct data_location){ + .pbn = data_vio->new_mapped.pbn, + .state = data_vio->new_mapped.state, + }; +} + +/** + * vdo_from_data_vio() - Get the vdo from a data_vio. + * @data_vio: The data_vio from which to get the vdo. + * + * Return: The vdo to which a data_vio belongs. + */ +static inline struct vdo *vdo_from_data_vio(struct data_vio *data_vio) +{ + return data_vio_as_completion(data_vio)->vdo; +} + +/** + * get_thread_config_from_data_vio() - Get the struct thread_config from a + * data_vio. + * @data_vio: The data_vio from which to get the struct thread_config. + * + * Return: The struct thread_config of the vdo to which a data_vio belongs. + */ +static inline const struct thread_config * +get_thread_config_from_data_vio(struct data_vio *data_vio) +{ + return vdo_from_data_vio(data_vio)->thread_config; +} + +/** + * get_data_vio_allocation() - Get the allocation of a data_vio. + * @data_vio: The data_vio. + * + * Return: The allocation of the data_vio. + */ +static inline +physical_block_number_t get_data_vio_allocation(struct data_vio *data_vio) +{ + return data_vio->allocation.pbn; +} + +/** + * data_vio_has_allocation() - Check whether a data_vio has an allocation. + * @data_vio: The data_vio to check. + * + * Return: true if the data_vio has an allocated block. + */ +static inline bool data_vio_has_allocation(struct data_vio *data_vio) +{ + return (get_data_vio_allocation(data_vio) != VDO_ZERO_BLOCK); +} + +void destroy_data_vio(struct data_vio *data_vio); + +int __must_check initialize_data_vio(struct data_vio *data_vio); + +void launch_data_vio(struct data_vio *data_vio, + logical_block_number_t lbn, + enum data_vio_operation operation); + +void complete_data_vio(struct vdo_completion *completion); + +void finish_data_vio(struct data_vio *data_vio, int result); + +/** + * continue_data_vio() - Continue processing a data_vio that has been waiting + * for an event, setting the result from the event and + * calling the current callback. + * @data_vio: The data_vio to continue. + * + * Return: The current result (will not mask older errors). + */ +static inline void continue_data_vio(struct data_vio *data_vio, int result) +{ + vdo_continue_completion(data_vio_as_completion(data_vio), result); +} + +const char * __must_check +get_data_vio_operation_name(struct data_vio *data_vio); + +/** + * enqueue_data_vio() - Add a data_vio to the tail end of a wait queue. + * @queue: The queue to which to add the waiter. + * @waiter: The data_vio to add to the queue. + * + * The data_vio must not already be waiting in a queue. A trace record is also + * generated for the data_vio. + * + * Return: VDO_SUCCESS or an error code. 
+ */ +static inline int __must_check +enqueue_data_vio(struct wait_queue *queue, + struct data_vio *waiter) +{ + return enqueue_waiter(queue, data_vio_as_waiter(waiter)); +} + +/** + * assert_data_vio_in_hash_zone() - Check that a data_vio is running on the + * correct thread for its hash zone. + * @data_vio: The data_vio in question. + */ +static inline void assert_data_vio_in_hash_zone(struct data_vio *data_vio) +{ + thread_id_t expected = vdo_get_hash_zone_thread_id(data_vio->hash_zone); + thread_id_t thread_id = vdo_get_callback_thread_id(); + /* + * It's odd to use the LBN, but converting the chunk name to hex is a + * bit clunky for an inline, and the LBN better than nothing as an + * identifier. + */ + ASSERT_LOG_ONLY((expected == thread_id), + "data_vio for logical block %llu on thread %u, should be on hash zone thread %u", + (unsigned long long) data_vio->logical.lbn, + thread_id, + expected); +} + +/** + * set_data_vio_hash_zone_callback() - Set a callback as a hash zone + * operation. + * @data_vio: The data_vio for which to set the callback. + * @callback: The callback to set. + * + * This function presumes that the hash_zone field of the data_vio has already + * been set. + */ +static inline void +set_data_vio_hash_zone_callback(struct data_vio *data_vio, + vdo_action *callback) +{ + vdo_set_completion_callback(data_vio_as_completion(data_vio), + callback, + vdo_get_hash_zone_thread_id(data_vio->hash_zone)); +} + +/** + * launch_data_vio_hash_zone_callback() - Set a callback as a hash zone + * operation and invoke it immediately. + * @data_vio: The data_vio for which to set the callback. + * @callback: The callback to set. + */ +static inline void +launch_data_vio_hash_zone_callback(struct data_vio *data_vio, + vdo_action *callback) +{ + set_data_vio_hash_zone_callback(data_vio, callback); + vdo_invoke_completion_callback(data_vio_as_completion(data_vio)); +} + +/** + * assert_data_vio_in_logical_zone() - Check that a data_vio is running on the + * correct thread for its logical zone. + * @data_vio: The data_vio in question. + */ +static inline void assert_data_vio_in_logical_zone(struct data_vio *data_vio) +{ + thread_id_t expected = data_vio->logical.zone->thread_id; + thread_id_t thread_id = vdo_get_callback_thread_id(); + + ASSERT_LOG_ONLY((expected == thread_id), + "data_vio for logical block %llu on thread %u, should be on thread %u", + (unsigned long long) data_vio->logical.lbn, + thread_id, + expected); +} + +/** + * set_data_vio_logical_callback() - Set a callback as a logical block + * operation. + * @data_vio: The data_vio for which to set the callback. + * @callback: The callback to set. + * + * This function presumes that the logical.zone field of the data_vio has + * already been set. + */ +static inline void +set_data_vio_logical_callback(struct data_vio *data_vio, + vdo_action *callback) +{ + vdo_set_completion_callback(data_vio_as_completion(data_vio), + callback, + data_vio->logical.zone->thread_id); +} + +/** + * launch_data_vio_logical_callback() - Set a callback as a logical block + * operation and invoke it immediately. + * @data_vio: The data_vio for which to set the callback. + * @callback: The callback to set. 
+ */ +static inline void +launch_data_vio_logical_callback(struct data_vio *data_vio, + vdo_action *callback) +{ + set_data_vio_logical_callback(data_vio, callback); + vdo_invoke_completion_callback(data_vio_as_completion(data_vio)); +} + +/** + * assert_data_vio_in_allocated_zone() - Check that a data_vio is running on + * the correct thread for its allocated + * zone. + * @data_vio: The data_vio in question. + */ +static inline void assert_data_vio_in_allocated_zone(struct data_vio *data_vio) +{ + thread_id_t expected = data_vio->allocation.zone->thread_id; + thread_id_t thread_id = vdo_get_callback_thread_id(); + + ASSERT_LOG_ONLY((expected == thread_id), + "struct data_vio for allocated physical block %llu on thread %u, should be on thread %u", + (unsigned long long) data_vio->allocation.pbn, + thread_id, + expected); +} + +/** + * set_data_vio_allocated_zone_callback() - Set a callback as a physical block + * operation in a data_vio's + * allocated zone. + * @data_vio: The data_vio. + * @callback: The callback to set. + */ +static inline void +set_data_vio_allocated_zone_callback(struct data_vio *data_vio, + vdo_action *callback) +{ + vdo_set_completion_callback(data_vio_as_completion(data_vio), + callback, + data_vio->allocation.zone->thread_id); +} + +/** + * launch_data_vio_allocated_zone_callback() - Set a callback as a physical + * block operation in a data_vio's + * allocated zone and queue the + * data_vio and invoke it + * immediately. + * @data_vio: The data_vio. + * @callback: The callback to invoke. + */ +static inline void +launch_data_vio_allocated_zone_callback(struct data_vio *data_vio, + vdo_action *callback) +{ + set_data_vio_allocated_zone_callback(data_vio, callback); + vdo_invoke_completion_callback(data_vio_as_completion(data_vio)); +} + +/** + * assert_data_vio_in_duplicate_zone() - Check that a data_vio is running on + * the correct thread for its duplicate + * zone. + * @data_vio: The data_vio in question. + */ +static inline void assert_data_vio_in_duplicate_zone(struct data_vio *data_vio) +{ + thread_id_t expected = data_vio->duplicate.zone->thread_id; + thread_id_t thread_id = vdo_get_callback_thread_id(); + + ASSERT_LOG_ONLY((expected == thread_id), + "data_vio for duplicate physical block %llu on thread %u, should be on thread %u", + (unsigned long long) data_vio->duplicate.pbn, + thread_id, + expected); +} + +/** + * set_data_vio_duplicate_zone_callback() - Set a callback as a physical block + * operation in a data_vio's + * duplicate zone. + * @data_vio: The data_vio. + * @callback: The callback to set. + */ +static inline void +set_data_vio_duplicate_zone_callback(struct data_vio *data_vio, + vdo_action *callback) +{ + vdo_set_completion_callback(data_vio_as_completion(data_vio), + callback, + data_vio->duplicate.zone->thread_id); +} + +/** + * launch_data_vio_duplicate_zone_callback() - Set a callback as a physical + * block operation in a data_vio's + * duplicate zone and queue the + * data_vio and invoke it + * immediately. + * @data_vio: The data_vio. + * @callback: The callback to invoke. + */ +static inline void +launch_data_vio_duplicate_zone_callback(struct data_vio *data_vio, + vdo_action *callback) +{ + set_data_vio_duplicate_zone_callback(data_vio, callback); + vdo_invoke_completion_callback(data_vio_as_completion(data_vio)); +} + +/** + * assert_data_vio_in_mapped_zone() - Check that a data_vio is running on the + * correct thread for its mapped zone. + * @data_vio: The data_vio in question. 
+ */ +static inline void assert_data_vio_in_mapped_zone(struct data_vio *data_vio) +{ + thread_id_t expected = data_vio->mapped.zone->thread_id; + thread_id_t thread_id = vdo_get_callback_thread_id(); + + ASSERT_LOG_ONLY((expected == thread_id), + "data_vio for mapped physical block %llu on thread %u, should be on thread %u", + (unsigned long long) data_vio->mapped.pbn, + thread_id, + expected); +} + +/** + * set_data_vio_mapped_zone_callback() - Set a callback as a physical block + * operation in a data_vio's mapped + * zone. + * @data_vio: The data_vio. + * @callback: The callback to set. + */ +static inline void +set_data_vio_mapped_zone_callback(struct data_vio *data_vio, + vdo_action *callback) +{ + vdo_set_completion_callback(data_vio_as_completion(data_vio), + callback, + data_vio->mapped.zone->thread_id); +} + +/** + * assert_data_vio_in_new_mapped_zone() - Check that a data_vio is running on + * the correct thread for its + * new_mapped zone. + * @data_vio: The data_vio in question. + */ +static inline void assert_data_vio_in_new_mapped_zone(struct data_vio *data_vio) +{ + thread_id_t expected = data_vio->new_mapped.zone->thread_id; + thread_id_t thread_id = vdo_get_callback_thread_id(); + + ASSERT_LOG_ONLY((expected == thread_id), + "data_vio for new_mapped physical block %llu on thread %u, should be on thread %u", + (unsigned long long) data_vio->new_mapped.pbn, + thread_id, + expected); +} + +/** + * set_data_vio_new_mapped_zone_callback() - Set a callback as a physical + * block operation in a data_vio's + * new_mapped zone. + * @data_vio: The data_vio. + * @callback: The callback to set. + */ +static inline void +set_data_vio_new_mapped_zone_callback(struct data_vio *data_vio, + vdo_action *callback) +{ + vdo_set_completion_callback(data_vio_as_completion(data_vio), + callback, + data_vio->new_mapped.zone->thread_id); +} + +/** + * assert_data_vio_in_journal_zone() - Check that a data_vio is running on the + * journal thread. + * @data_vio: The data_vio in question. + */ +static inline void assert_data_vio_in_journal_zone(struct data_vio *data_vio) +{ + thread_id_t journal_thread = + get_thread_config_from_data_vio(data_vio)->journal_thread; + thread_id_t thread_id = vdo_get_callback_thread_id(); + + ASSERT_LOG_ONLY((journal_thread == thread_id), + "data_vio for logical block %llu on thread %u, should be on journal thread %u", + (unsigned long long) data_vio->logical.lbn, + thread_id, + journal_thread); +} + +/** + * set_data_vio_journal_callback() - Set a callback as a journal operation. + * @data_vio: The data_vio for which to set the callback. + * @callback: The callback to set. + */ +static inline void +set_data_vio_journal_callback(struct data_vio *data_vio, + vdo_action *callback) +{ + thread_id_t journal_thread = + get_thread_config_from_data_vio(data_vio)->journal_thread; + vdo_set_completion_callback(data_vio_as_completion(data_vio), + callback, + journal_thread); +} + +/** + * launch_data_vio_journal_callback() - Set a callback as a journal operation + * and invoke it immediately. + * @data_vio: The data_vio for which to set the callback. + * @callback: The callback to set. + */ +static inline void +launch_data_vio_journal_callback(struct data_vio *data_vio, + vdo_action *callback) +{ + set_data_vio_journal_callback(data_vio, callback); + vdo_invoke_completion_callback(data_vio_as_completion(data_vio)); +} + +/** + * assert_data_vio_in_packer_zone() - Check that a data_vio is running on the + * packer thread. + * @data_vio: The data_vio in question. 
+ */ +static inline void assert_data_vio_in_packer_zone(struct data_vio *data_vio) +{ + thread_id_t packer_thread = + get_thread_config_from_data_vio(data_vio)->packer_thread; + thread_id_t thread_id = vdo_get_callback_thread_id(); + + ASSERT_LOG_ONLY((packer_thread == thread_id), + "data_vio for logical block %llu on thread %u, should be on packer thread %u", + (unsigned long long) data_vio->logical.lbn, + thread_id, + packer_thread); +} + +/** + * set_data_vio_packer_callback() - Set a callback as a packer operation. + * @data_vio: The data_vio for which to set the callback. + * @callback: The callback to set. + */ +static inline void +set_data_vio_packer_callback(struct data_vio *data_vio, + vdo_action *callback) +{ + thread_id_t packer_thread = + get_thread_config_from_data_vio(data_vio)->packer_thread; + vdo_set_completion_callback(data_vio_as_completion(data_vio), + callback, + packer_thread); +} + +/** + * launch_data_vio_packer_callback() - Set a callback as a packer operation + * and invoke it immediately. + * @data_vio: The data_vio for which to set the callback. + * @callback: The callback to set. + */ +static inline void +launch_data_vio_packer_callback(struct data_vio *data_vio, + vdo_action *callback) +{ + set_data_vio_packer_callback(data_vio, callback); + vdo_invoke_completion_callback(data_vio_as_completion(data_vio)); +} + +/** + * assert_data_vio_on_cpu_thread() - Check that a data_vio is running on the + * packer thread. + * @data_vio: The data_vio in question. + */ +static inline void assert_data_vio_on_cpu_thread(struct data_vio *data_vio) +{ + thread_id_t cpu_thread = + get_thread_config_from_data_vio(data_vio)->cpu_thread; + thread_id_t thread_id = vdo_get_callback_thread_id(); + + ASSERT_LOG_ONLY((cpu_thread == thread_id), + "data_vio for logical block %llu on thread %u, should be on cpu thread %u", + (unsigned long long) data_vio->logical.lbn, + thread_id, + cpu_thread); +} + +/** + * set_data_vio_dedupe_callback() - Set a callback as a dedupe queue + * operation. + * @data_vio: The data_vio for which to set the callback. + * @callback: The callback to set. + */ +static inline void +set_data_vio_dedupe_callback(struct data_vio *data_vio, + vdo_action *callback) +{ + thread_id_t dedupe_thread = + get_thread_config_from_data_vio(data_vio)->dedupe_thread; + vdo_set_completion_callback(data_vio_as_completion(data_vio), + callback, + dedupe_thread); +} + +/** + * launch_data_vio_dedupe_callback() - Set a callback to run on the dedupe + * queue and invoke it immediately. + * @data_vio: The data_vio for which to set the callback. + * @callback: The callback to set. + */ +static inline void +launch_data_vio_dedupe_callback(struct data_vio *data_vio, + vdo_action *callback) +{ + struct vdo_completion *completion = data_vio_as_completion(data_vio); + + set_data_vio_dedupe_callback(data_vio, callback); + vdo_invoke_completion_callback(completion); +} + +/** + * set_data_vio_cpu_callback() - Set a callback as a CPU queue operation. + * @data_vio: The data_vio for which to set the callback. + * @callback: The callback to set. + */ +static inline void +set_data_vio_cpu_callback(struct data_vio *data_vio, + vdo_action *callback) +{ + thread_id_t cpu_thread = + get_thread_config_from_data_vio(data_vio)->cpu_thread; + vdo_set_completion_callback(data_vio_as_completion(data_vio), + callback, + cpu_thread); +} + +/** + * launch_data_vio_cpu_callback() - Set a callback to run on the CPU queues + * and invoke it immediately. + * @data_vio: The data_vio for which to set the callback. 
+ * @callback: The callback to set. + * @priority: The priority with which to run the callback. + */ +static inline void +launch_data_vio_cpu_callback(struct data_vio *data_vio, + vdo_action *callback, + enum vdo_completion_priority priority) +{ + struct vdo_completion *completion = data_vio_as_completion(data_vio); + + set_data_vio_cpu_callback(data_vio, callback); + vdo_invoke_completion_callback_with_priority(completion, priority); +} + +/** + * set_data_vio_bio_zone_callback() - Set a callback as a bio zone operation. + * @data_vio: The data_vio for which to set the callback. + * @callback: The callback to set. + * + * This function assumes that the physical field of the data_vio's vio has + * already been set to the pbn to which I/O will be performed. + */ +static inline void +set_data_vio_bio_zone_callback(struct data_vio *data_vio, + vdo_action *callback) +{ + struct vio *vio = data_vio_as_vio(data_vio); + + vdo_set_completion_callback(vio_as_completion(vio), + callback, + get_vio_bio_zone_thread_id(vio)); +} + +/** + * launch_data_vio_bio_zone_callback() - Set a callback as a bio zone + * operation and invoke it immediately. + * @data_vio: The data_vio for which to set the callback. + * @callback: The callback to set. + */ +static inline void +launch_data_vio_bio_zone_callback(struct data_vio *data_vio, + vdo_action *callback) +{ + struct vdo_completion *completion = data_vio_as_completion(data_vio); + + set_data_vio_bio_zone_callback(data_vio, callback); + vdo_invoke_completion_callback_with_priority(completion, + BIO_Q_DATA_PRIORITY); +} + +/** + * launch_data_vio_on_bio_ack_queue() - If the vdo uses a bio_ack queue, set a + * callback to run on it and invoke it + * immediately, otherwise, just run the + * callback on the current thread. + * @data_vio: The data_vio for which to set the callback. + * @callback: The callback to set. + */ +static inline void +launch_data_vio_on_bio_ack_queue(struct data_vio *data_vio, + vdo_action *callback) +{ + struct vdo *vdo = vdo_from_data_vio(data_vio); + struct vdo_completion *completion = data_vio_as_completion(data_vio); + + if (!vdo_uses_bio_ack_queue(vdo)) { + callback(completion); + return; + } + + vdo_set_completion_callback(completion, + callback, + vdo->thread_config->bio_ack_thread); + vdo_invoke_completion_callback_with_priority(completion, + BIO_ACK_Q_ACK_PRIORITY); +} + +void set_data_vio_duplicate_location(struct data_vio *data_vio, + const struct zoned_pbn source); + +void clear_data_vio_mapped_location(struct data_vio *data_vio); + +int __must_check set_data_vio_mapped_location(struct data_vio *data_vio, + physical_block_number_t pbn, + enum block_mapping_state state); + +void vdo_release_logical_block_lock(struct data_vio *data_vio); + +void data_vio_allocate_data_block(struct data_vio *data_vio, + enum pbn_lock_type write_lock_type, + vdo_action *callback, + vdo_action *error_handler); + +/** + * release_data_vio_allocation_lock() - Release the PBN lock on a data_vio's + * allocated block. + * @data_vio: The lock holder. + * @reset: If true, the allocation will be reset (i.e. any allocated pbn will + * be forgotten). + * + * If the reference to the locked block is still provisional, it will be + * released as well. 
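+ *
+ * Must be called on the thread of the data_vio's allocated zone.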
+ */ +void release_data_vio_allocation_lock(struct data_vio *data_vio, bool reset); + +void acknowledge_data_vio(struct data_vio *data_vio); + +void compress_data_vio(struct data_vio *data_vio); + +int __must_check uncompress_data_vio(struct data_vio *data_vio, + enum block_mapping_state mapping_state, + char *buffer); + +/** + * Prepare a data_vio's vio and bio to submit I/O. + * + * @data_vio: The vio preparing to issue I/O + * @data: The buffer to write from or read into + * @callback: The callback the bio should call when the I/O finishes + * @bi_opf: The operation and flags for the bio + * @pbn: The pbn to which the I/O will be addressed + * + * Return: VDO_SUCCESS or an error + */ +static inline int __must_check +prepare_data_vio_for_io(struct data_vio *data_vio, + char *data, + bio_end_io_t callback, + unsigned int bi_opf, + physical_block_number_t pbn) +{ + struct vio *vio = data_vio_as_vio(data_vio); + + set_vio_physical(vio, pbn); + return prepare_vio_for_io(vio, + data, + callback, + bi_opf); +} + +bool is_zero_block(char *block); + +#endif /* DATA_VIO_H */ diff --git a/vdo/dataKVIO.c b/vdo/dataKVIO.c deleted file mode 100644 index bae0a638..00000000 --- a/vdo/dataKVIO.c +++ /dev/null @@ -1,1155 +0,0 @@ -/* - * Copyright Red Hat - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/sulfur/src/c++/vdo/kernel/dataKVIO.c#55 $ - */ - -#include "dataKVIO.h" - -#include -#include - -#include "logger.h" -#include "memoryAlloc.h" -#include "murmur/MurmurHash3.h" -#include "permassert.h" - -#include "atomicStats.h" -#include "compressedBlock.h" -#include "dataVIO.h" -#include "hashLock.h" -#include "vdoInternal.h" - -#include "bio.h" -#include "dedupeIndex.h" -#include "kvio.h" -#include "ioSubmitter.h" -#include "vdoCommon.h" - -static void dump_pooled_data_vio(void *data); - -/** - * For certain flags set on user bios, if the user bio has not yet been - * acknowledged, setting those flags on our own bio(s) for that request may - * help underlying layers better fulfill the user bio's needs. This constant - * contains the aggregate of those flags; VDO strips all the other flags, as - * they convey incorrect information. - * - * These flags are always irrelevant if we have already finished the user bio - * as they are only hints on IO importance. If VDO has finished the user bio, - * any remaining IO done doesn't care how important finishing the finished bio - * was. - * - * Note that kernelLayer.c contains the complete list of flags we believe may - * be set; the following list explains the action taken with each of those - * flags VDO could receive: - * - * REQ_SYNC: Passed down if the user bio is not yet completed, since it - * indicates the user bio completion is required for further work to be - * done by the issuer. 
- * REQ_META: Passed down if the user bio is not yet completed, since it may - * mean the lower layer treats it as more urgent, similar to REQ_SYNC. - * REQ_PRIO: Passed down if the user bio is not yet completed, since it - * indicates the user bio is important. - * REQ_NOMERGE: Set only if the incoming bio was split; irrelevant to VDO IO. - * REQ_IDLE: Set if the incoming bio had more IO quickly following; VDO's IO - * pattern doesn't match incoming IO, so this flag is incorrect for it. - * REQ_FUA: Handled separately, and irrelevant to VDO IO otherwise. - * REQ_RAHEAD: Passed down, as, for reads, it indicates trivial importance. - * REQ_BACKGROUND: Not passed down, as VIOs are a limited resource and VDO - * needs them recycled ASAP to service heavy load, which is the only place - * where REQ_BACKGROUND might aid in load prioritization. - **/ -static unsigned int PASSTHROUGH_FLAGS = - (REQ_PRIO | REQ_META | REQ_SYNC | REQ_RAHEAD); - -/**********************************************************************/ -static void vdo_acknowledge_data_vio(struct data_vio *data_vio) -{ - struct vio *vio = data_vio_as_vio(data_vio); - int error = map_to_system_error(vio_as_completion(vio)->result); - struct bio *bio = data_vio->user_bio; - - - if (bio == NULL) { - return; - } - data_vio->user_bio = NULL; - - vdo_count_bios(&vio->vdo->stats.bios_acknowledged, bio); - if (data_vio->is_partial) { - vdo_count_bios(&vio->vdo->stats.bios_acknowledged_partial, - bio); - } - - - vdo_complete_bio(bio, error); -} - -/**********************************************************************/ -static noinline void clean_data_vio(struct data_vio *data_vio, - struct free_buffer_pointers *fbp) -{ - vdo_acknowledge_data_vio(data_vio); - add_free_buffer_pointer(fbp, data_vio); -} - -/**********************************************************************/ -void return_data_vio_batch_to_pool(struct batch_processor *batch, - void *closure) -{ - struct free_buffer_pointers fbp; - struct vdo_work_item *item; - struct vdo *vdo = closure; - uint32_t count = 0; - - ASSERT_LOG_ONLY(batch != NULL, "batch not null"); - ASSERT_LOG_ONLY(vdo != NULL, "vdo not null"); - - - init_free_buffer_pointers(&fbp, vdo->data_vio_pool); - - while ((item = next_batch_item(batch)) != NULL) { - clean_data_vio(work_item_as_data_vio(item), &fbp); - count++; - } - - if (fbp.index > 0) { - free_buffer_pointers(&fbp); - } - - complete_many_requests(vdo, count); -} - -/**********************************************************************/ -static void -vdo_acknowledge_and_batch(struct vdo_work_item *item) -{ - struct data_vio *data_vio = work_item_as_data_vio(item); - struct vdo *vdo = get_vdo_from_data_vio(data_vio); - vdo_acknowledge_data_vio(data_vio); - add_to_batch_processor(vdo->data_vio_releaser, item); -} - -/**********************************************************************/ -static void vdo_complete_data_vio(struct vdo_completion *completion) -{ - struct data_vio *data_vio = as_data_vio(completion); - struct vdo *vdo = get_vdo_from_data_vio(data_vio); - - if (use_bio_ack_queue(vdo) && VDO_USE_BIO_ACK_QUEUE_FOR_READ && - (data_vio->user_bio != NULL)) { - launch_data_vio_on_bio_ack_queue(data_vio, - vdo_acknowledge_and_batch, - NULL, - BIO_ACK_Q_ACTION_ACK); - } else { - add_to_batch_processor(vdo->data_vio_releaser, - &completion->work_item); - } -} - -/** - * For a read, dispatch the freshly uncompressed data to its destination: - * - for a 4k read, copy it into the user bio for later acknowlegement; - * - * - for a partial read, invoke its 
callback; vdo_complete_partial_read will - * copy the data into the user bio for acknowledgement; - * - * - for a partial write, copy it into the data block, so that we can later - * copy data from the user bio atop it in vdo_apply_partial_write and treat - * it as a full-block write. - * - * This is called from read_data_vio_read_block_callback, registered only in - * read_data_vio() and therefore never called on a 4k write. - * - * @param work_item The data_vio which requested the read - **/ -static void copy_read_block_data(struct vdo_work_item *work_item) -{ - struct data_vio *data_vio = work_item_as_data_vio(work_item); - - // For a read-modify-write, copy the data into the data_block buffer so - // it will be set up for the write phase. - if (is_read_modify_write_vio(data_vio_as_vio(data_vio))) { - memcpy(data_vio->data_block, data_vio->read_block.data, - VDO_BLOCK_SIZE); - enqueue_data_vio_callback(data_vio); - return; - } - - // For a partial read, the callback will copy the requested data from - // the read block. - if (data_vio->is_partial) { - enqueue_data_vio_callback(data_vio); - return; - } - - // For a 4k read, copy the data to the user bio and acknowledge. - vdo_bio_copy_data_out(data_vio->user_bio, data_vio->read_block.data); - acknowledge_data_vio(data_vio); -} - -/** - * Finish reading data for a compressed block. This callback is registered - * in read_data_vio() when trying to read compressed data for a 4k read or - * a partial read or write. - * - * @param completion The data_vio which requested the read - **/ -static void -read_data_vio_read_block_callback(struct vdo_completion *completion) -{ - struct data_vio *data_vio = as_data_vio(completion); - if (data_vio->read_block.status != VDO_SUCCESS) { - set_vdo_completion_result(completion, - data_vio->read_block.status); - enqueue_data_vio_callback(data_vio); - return; - } - - launch_data_vio_on_cpu_queue(data_vio, copy_read_block_data, NULL, - CPU_Q_ACTION_COMPRESS_BLOCK); -} - -/** - * Uncompress the data that's just been read and then call back the requesting - * data_vio. - * - * @param work_item The data_vio requesting the data - **/ -static void uncompress_read_block(struct vdo_work_item *work_item) -{ - struct vdo_completion *completion = container_of(work_item, - struct vdo_completion, - work_item); - struct data_vio *data_vio = work_item_as_data_vio(work_item); - struct read_block *read_block = &data_vio->read_block; - int size; - - // The data_vio's scratch block will be used to contain the - // uncompressed data. - uint16_t fragment_offset, fragment_size; - char *compressed_data = read_block->data; - int result = get_vdo_compressed_block_fragment(read_block->mapping_state, - compressed_data, - VDO_BLOCK_SIZE, - &fragment_offset, - &fragment_size); - if (result != VDO_SUCCESS) { - uds_log_debug("%s: frag err %d", __func__, result); - read_block->status = result; - read_block->callback(completion); - return; - } - - size = LZ4_decompress_safe((compressed_data + fragment_offset), - data_vio->scratch_block, fragment_size, - VDO_BLOCK_SIZE); - if (size == VDO_BLOCK_SIZE) { - read_block->data = data_vio->scratch_block; - } else { - uds_log_debug("%s: lz4 error", __func__); - read_block->status = VDO_INVALID_FRAGMENT; - } - - read_block->callback(completion); -} - -/** - * Now that we have gotten the data from storage, uncompress the data if - * necessary and then call back the requesting data_vio. 
- * - * @param data_vio The data_vio requesting the data - **/ -static void complete_read(struct data_vio *data_vio) -{ - struct read_block *read_block = &data_vio->read_block; - struct vio *vio = data_vio_as_vio(data_vio); - - read_block->status = blk_status_to_errno(vio->bio->bi_status); - - if ((read_block->status == VDO_SUCCESS) && - vdo_is_state_compressed(read_block->mapping_state)) { - launch_data_vio_on_cpu_queue(data_vio, - uncompress_read_block, - NULL, - CPU_Q_ACTION_COMPRESS_BLOCK); - return; - } - - read_block->callback(vio_as_completion(vio)); -} - -/** - * Callback for a bio doing a read. - * - * @param bio The bio - */ -static void read_bio_callback(struct bio *bio) -{ - struct data_vio *data_vio = (struct data_vio *) bio->bi_private; - data_vio->read_block.data = data_vio->read_block.buffer; - vdo_count_completed_bios(bio); - complete_read(data_vio); -} - -/**********************************************************************/ -void vdo_read_block(struct data_vio *data_vio, - physical_block_number_t location, - enum block_mapping_state mapping_state, - enum bio_q_action action, - vdo_action *callback) -{ - struct vio *vio = data_vio_as_vio(data_vio); - struct read_block *read_block = &data_vio->read_block; - int result; - - read_block->callback = callback; - read_block->status = VDO_SUCCESS; - read_block->mapping_state = mapping_state; - - // Read the data using the read block buffer. - result = vdo_reset_bio_with_buffer(vio->bio, read_block->buffer, - vio, read_bio_callback, REQ_OP_READ, - location); - if (result != VDO_SUCCESS) { - continue_vio(vio, result); - return; - } - - vdo_submit_bio(vio->bio, action); -} - -/**********************************************************************/ -static void acknowledge_user_bio(struct bio *bio) -{ - int error = vdo_get_bio_result(bio); - struct vio *vio = (struct vio *) bio->bi_private; - - vdo_count_completed_bios(bio); - if (error == 0) { - acknowledge_data_vio(vio_as_data_vio(vio)); - return; - } - - continue_vio(vio, error); -} - -/**********************************************************************/ -void read_data_vio(struct data_vio *data_vio) -{ - struct vio *vio = data_vio_as_vio(data_vio); - struct bio *bio = vio->bio; - int result = VDO_SUCCESS; - int opf = (data_vio->user_bio->bi_opf & - PASSTHROUGH_FLAGS); - - ASSERT_LOG_ONLY(!is_write_vio(vio), - "operation set correctly for data read"); - - if (vdo_is_state_compressed(data_vio->mapped.state)) { - vdo_read_block(data_vio, - data_vio->mapped.pbn, - data_vio->mapped.state, - BIO_Q_ACTION_COMPRESSED_DATA, - read_data_vio_read_block_callback); - return; - } - - // Read into the data block (for a RMW or partial IO) or directly into - // the user buffer (for a 4k read). - if (is_read_modify_write_vio(data_vio_as_vio(data_vio)) || - (data_vio->is_partial)) { - result = vdo_reset_bio_with_buffer(bio, data_vio->data_block, - vio, - vdo_complete_async_bio, - REQ_OP_READ | opf, - data_vio->mapped.pbn); - } else { - // A full 4k read. 
- vdo_reset_bio_with_user_bio(bio, - data_vio->user_bio, - vio, - acknowledge_user_bio, - REQ_OP_READ | opf, - data_vio->mapped.pbn); - } - - if (result != VDO_SUCCESS) { - continue_vio(vio, result); - return; - } - - vdo_submit_bio(bio, BIO_Q_ACTION_DATA); -} - -/**********************************************************************/ -static void -vdo_acknowledge_and_enqueue(struct vdo_work_item *item) -{ - struct data_vio *data_vio = work_item_as_data_vio(item); - - vdo_acknowledge_data_vio(data_vio); - // Even if we're not using bio-ack threads, we may be in the wrong - // base-code thread. - enqueue_data_vio_callback(data_vio); -} - -/**********************************************************************/ -void acknowledge_data_vio(struct data_vio *data_vio) -{ - struct vdo *vdo = get_vdo_from_data_vio(data_vio); - - // If the remaining discard work is not completely processed by this - // data_vio, don't acknowledge it yet. - if ((data_vio->user_bio != NULL) && - (bio_op(data_vio->user_bio) == REQ_OP_DISCARD) && - (data_vio->remaining_discard > - (VDO_BLOCK_SIZE - data_vio->offset))) { - invoke_vdo_completion_callback(data_vio_as_completion(data_vio)); - return; - } - - // We've finished with the vio; acknowledge completion of the bio to - // the kernel. - if (use_bio_ack_queue(vdo)) { - launch_data_vio_on_bio_ack_queue(data_vio, - vdo_acknowledge_and_enqueue, - NULL, - BIO_ACK_Q_ACTION_ACK); - } else { - vdo_acknowledge_and_enqueue(work_item_from_data_vio(data_vio)); - } -} - -/**********************************************************************/ -void write_data_vio(struct data_vio *data_vio) -{ - struct vio *vio = data_vio_as_vio(data_vio); - unsigned int opf = 0; - int result; - - ASSERT_LOG_ONLY(is_write_vio(vio), - "write_data_vio must be passed a write data_vio"); - - - // Write the data from the data block buffer. - result = vdo_reset_bio_with_buffer(vio->bio, data_vio->data_block, - vio, vdo_complete_async_bio, - REQ_OP_WRITE | opf, - data_vio->new_mapped.pbn); - if (result != VDO_SUCCESS) { - continue_vio(vio, result); - return; - } - - vdo_submit_bio(vio->bio, BIO_Q_ACTION_DATA); -} - -/** - * Determines whether the data block buffer is all zeros. - * - * @param data_vio The data_vio to check - * - * @return true is all zeroes, false otherwise - **/ -static inline bool is_zero_block(struct data_vio *data_vio) -{ - unsigned int word_count = VDO_BLOCK_SIZE / sizeof(uint64_t); - unsigned int chunk_count = word_count / 8; - const char *buffer = data_vio->data_block; - /* - * Handle expected common case of even the first word being nonzero, - * without getting into the more expensive (for one iteration) loop - * below. 
- */ - if (get_unaligned((u64 *) buffer) != 0) { - return false; - } - - STATIC_ASSERT(VDO_BLOCK_SIZE % sizeof(uint64_t) == 0); - - // Unroll to process 64 bytes at a time - while (chunk_count-- > 0) { - uint64_t word0 = get_unaligned((u64 *) buffer); - uint64_t word1 = - get_unaligned((u64 *) (buffer + 1 * sizeof(uint64_t))); - uint64_t word2 = - get_unaligned((u64 *) (buffer + 2 * sizeof(uint64_t))); - uint64_t word3 = - get_unaligned((u64 *) (buffer + 3 * sizeof(uint64_t))); - uint64_t word4 = - get_unaligned((u64 *) (buffer + 4 * sizeof(uint64_t))); - uint64_t word5 = - get_unaligned((u64 *) (buffer + 5 * sizeof(uint64_t))); - uint64_t word6 = - get_unaligned((u64 *) (buffer + 6 * sizeof(uint64_t))); - uint64_t word7 = - get_unaligned((u64 *) (buffer + 7 * sizeof(uint64_t))); - uint64_t or = (word0 | word1 | word2 | word3 | word4 | word5 | - word6 | word7); - // Prevent compiler from using 8*(cmp;jne). - __asm__ __volatile__("" : : "g"(or)); - if (or != 0) { - return false; - } - buffer += 8 * sizeof(uint64_t); - } - word_count %= 8; - - // Unroll to process 8 bytes at a time. - // (Is this still worthwhile?) - while (word_count-- > 0) { - if (get_unaligned((u64 *) buffer) != 0) { - return false; - } - buffer += sizeof(uint64_t); - } - return true; -} - -/**********************************************************************/ -void vdo_apply_partial_write(struct data_vio *data_vio) -{ - struct bio *bio = data_vio->user_bio; - - if (bio_op(bio) != REQ_OP_DISCARD) { - vdo_bio_copy_data_in(bio, data_vio->data_block + data_vio->offset); - } else { - memset(data_vio->data_block + data_vio->offset, '\0', - min_t(uint32_t, data_vio->remaining_discard, - VDO_BLOCK_SIZE - data_vio->offset)); - - } - - data_vio->is_zero_block = is_zero_block(data_vio); -} - -/**********************************************************************/ -void zero_data_vio(struct data_vio *data_vio) -{ - ASSERT_LOG_ONLY(!is_write_vio(data_vio_as_vio(data_vio)), - "only attempt to zero non-writes"); - if (data_vio->is_partial) { - memset(data_vio->data_block, 0, VDO_BLOCK_SIZE); - } else { - zero_fill_bio(data_vio->user_bio); - } -} - -/**********************************************************************/ -void vdo_copy_data(struct data_vio *source, struct data_vio *destination) -{ - ASSERT_LOG_ONLY(is_read_vio(data_vio_as_vio(destination)), - "only copy to a pure read"); - ASSERT_LOG_ONLY(is_write_vio(data_vio_as_vio(source)), - "only copy from a write"); - - if (destination->is_partial) { - memcpy(destination->data_block, source->data_block, - VDO_BLOCK_SIZE); - } else { - vdo_bio_copy_data_out(destination->user_bio, - source->data_block); - } -} - -/**********************************************************************/ -static void vdo_compress_work(struct vdo_work_item *item) -{ - struct data_vio *data_vio = work_item_as_data_vio(item); - char *context = get_work_queue_private_data(); - int size; - - size = LZ4_compress_default(data_vio->data_block, - data_vio->scratch_block, - VDO_BLOCK_SIZE, - VDO_BLOCK_SIZE, - context); - if (size > 0) { - // The scratch block will be used to contain the compressed - // data. - data_vio->compression.data = data_vio->scratch_block; - data_vio->compression.size = size; - } else { - // Use block size plus one as an indicator for uncompressible - // data. 
- data_vio->compression.size = VDO_BLOCK_SIZE + 1; - } - - enqueue_data_vio_callback(data_vio); -} - -/**********************************************************************/ -void compress_data_vio(struct data_vio *data_vio) -{ - /* - * If the orignal bio was a discard, but we got this far because the - * discard was a partial one (r/m/w), and it is part of a larger - * discard, we cannot compress this vio. We need to make sure the vio - * completes ASAP. - */ - if ((data_vio->user_bio != NULL) && - (bio_op(data_vio->user_bio) == REQ_OP_DISCARD) && - (data_vio->remaining_discard > 0)) { - data_vio->compression.size = VDO_BLOCK_SIZE + 1; - enqueue_data_vio_callback(data_vio); - return; - } - - launch_data_vio_on_cpu_queue(data_vio, vdo_compress_work, - NULL, - CPU_Q_ACTION_COMPRESS_BLOCK); -} - -/** - * Creates a new data_vio structure. A data_vio represents a single logical - * block of data. It is what most VDO operations work with. This function also - * creates a wrapping data_vio structure that is used when we want to - * physically read or write the data associated with the struct data_vio. - * - * @param [in] vdo The vdo - * @param [in] bio The bio from the request the new data_vio - * will service - * @param [in] arrival_jiffies The arrival time of the bio - * @param [out] data_vio_ptr A pointer to hold the new data_vio - * - * @return VDO_SUCCESS or an error - **/ -static int vdo_create_vio_from_bio(struct vdo *vdo, - struct bio *bio, - uint64_t arrival_jiffies, - struct data_vio **data_vio_ptr) -{ - struct data_vio *data_vio = NULL; - struct vio *vio; - struct bio *vio_bio; - int result = alloc_buffer_from_pool(vdo->data_vio_pool, - (void **) &data_vio); - if (result != VDO_SUCCESS) { - return uds_log_error_strerror(result, - "data vio allocation failure"); - } - - vio = data_vio_as_vio(data_vio); - // XXX We save the bio out of the vio so that we don't forget it. - // Maybe we should just not zero that field somehow. - vio_bio = vio->bio; - - // Zero out the fields which don't need to be preserved (i.e. which - // are not pointers to separately allocated objects). - memset(data_vio, 0, offsetof(struct data_vio, dedupe_context)); - memset(&data_vio->dedupe_context.pending_list, 0, - sizeof(struct list_head)); - - - data_vio->user_bio = bio; - initialize_vio(vio, - vio_bio, - VIO_TYPE_DATA, - VIO_PRIORITY_DATA, - NULL, - vdo, - NULL); - data_vio->offset = sector_to_block_offset(bio->bi_iter.bi_sector); - data_vio->is_partial = ((bio->bi_iter.bi_size < VDO_BLOCK_SIZE) || - (data_vio->offset != 0)); - - if (data_vio->is_partial) { - vdo_count_bios(&vdo->stats.bios_in_partial, bio); - } else { - /* - * Note that we unconditionally fill in the data_block array - * for non-read operations. There are places like vdo_copy_vio - * that may look at vio->data_block for a zero block (and maybe - * for discards?). We could skip filling in data_block for such - * cases, but only once we're sure all such places are fixed to - * check the is_zero_block flag first. - */ - if (bio_op(bio) == REQ_OP_DISCARD) { - /* - * This is a discard/trim operation. This is treated - * much like the zero block, but we keep differen - * stats and distinguish it in the block map. - */ - memset(data_vio->data_block, 0, VDO_BLOCK_SIZE); - } else if (bio_data_dir(bio) == WRITE) { - // Copy the bio data to a char array so that we can - // continue to use the data after we acknowledge the - // bio. 
- vdo_bio_copy_data_in(bio, data_vio->data_block); - data_vio->is_zero_block = is_zero_block(data_vio); - } - } - - if (data_vio->is_partial || (bio_data_dir(bio) == WRITE)) { - data_vio->read_block.data = data_vio->data_block; - } - - *data_vio_ptr = data_vio; - return VDO_SUCCESS; -} - -/**********************************************************************/ -static void launch_data_vio_work(struct vdo_work_item *item) -{ - run_vdo_completion_callback(vio_as_completion(work_item_as_vio(item))); -} - -/** - * Continue discard processing for requests that span multiple physical blocks. - * If all have been processed the vio is completed. If we have already seen - * an error, we skip the rest of the discard and fail immediately. - * - *
Invoked in a request-queue thread after the discard of a block has - * completed. - * - * @param completion A completion representing the discard vio - **/ -static void vdo_continue_discard_vio(struct vdo_completion *completion) -{ - enum vio_operation operation; - struct data_vio *data_vio = as_data_vio(completion); - struct vdo *vdo = get_vdo_from_data_vio(data_vio); - - data_vio->remaining_discard -= - min_t(uint32_t, data_vio->remaining_discard, - VDO_BLOCK_SIZE - data_vio->offset); - if ((completion->result != VDO_SUCCESS) || - (data_vio->remaining_discard == 0)) { - if (data_vio->has_discard_permit) { - limiter_release(&vdo->discard_limiter); - data_vio->has_discard_permit = false; - } - vdo_complete_data_vio(completion); - return; - } - - data_vio->is_partial = (data_vio->remaining_discard < VDO_BLOCK_SIZE); - data_vio->offset = 0; - - if (data_vio->is_partial) { - operation = VIO_READ_MODIFY_WRITE; - } else { - operation = VIO_WRITE; - } - - if (data_vio->user_bio->bi_opf & REQ_FUA) { - operation |= VIO_FLUSH_AFTER; - } - - prepare_data_vio(data_vio, data_vio->logical.lbn + 1, operation, - !data_vio->is_partial, vdo_continue_discard_vio); - enqueue_vio(as_vio(completion), launch_data_vio_work, - completion->callback, REQ_Q_ACTION_MAP_BIO); -} - -/** - * Finish a partial read. - * - * @param completion The partial read vio - **/ -static void vdo_complete_partial_read(struct vdo_completion *completion) -{ - struct data_vio *data_vio = as_data_vio(completion); - - vdo_bio_copy_data_out(data_vio->user_bio, - data_vio->read_block.data + data_vio->offset); - vdo_complete_data_vio(completion); - return; -} - -/**********************************************************************/ -int vdo_launch_data_vio_from_bio(struct vdo *vdo, - struct bio *bio, - uint64_t arrival_jiffies, - bool has_discard_permit) -{ - struct data_vio *data_vio = NULL; - int result; - vdo_action *callback = vdo_complete_data_vio; - enum vio_operation operation = VIO_WRITE; - bool is_trim = false; - logical_block_number_t lbn = - sector_to_block(bio->bi_iter.bi_sector - - vdo->starting_sector_offset); - struct vio *vio; - - - result = vdo_create_vio_from_bio(vdo, - bio, - arrival_jiffies, - &data_vio); - if (unlikely(result != VDO_SUCCESS)) { - uds_log_info("%s: vio allocation failure", __func__); - if (has_discard_permit) { - limiter_release(&vdo->discard_limiter); - } - limiter_release(&vdo->request_limiter); - return map_to_system_error(result); - } - - /* - * Discards behave very differently than other requests when coming in - * from device-mapper. We have to be able to handle any size discards - * and with various sector offsets within a block. 
- */ - if (bio_op(bio) == REQ_OP_DISCARD) { - data_vio->has_discard_permit = has_discard_permit; - data_vio->remaining_discard = bio->bi_iter.bi_size; - callback = vdo_continue_discard_vio; - if (data_vio->is_partial) { - operation = VIO_READ_MODIFY_WRITE; - } else { - is_trim = true; - } - } else if (data_vio->is_partial) { - if (bio_data_dir(bio) == READ) { - callback = vdo_complete_partial_read; - operation = VIO_READ; - } else { - operation = VIO_READ_MODIFY_WRITE; - } - } else if (bio_data_dir(bio) == READ) { - operation = VIO_READ; - } - - if (data_vio->user_bio->bi_opf & REQ_FUA) { - operation |= VIO_FLUSH_AFTER; - } - - prepare_data_vio(data_vio, lbn, operation, is_trim, callback); - - vio = data_vio_as_vio(data_vio); - enqueue_vio(vio, launch_data_vio_work, - vio_as_completion(vio)->callback, REQ_Q_ACTION_MAP_BIO); - - return VDO_SUCCESS; -} - -/** - * Hash a data_vio and set its chunk name. - * - * @param item The data_vio to be hashed - **/ -static void vdo_hash_data_work(struct vdo_work_item *item) -{ - struct data_vio *data_vio = work_item_as_data_vio(item); - - MurmurHash3_x64_128(data_vio->data_block, VDO_BLOCK_SIZE, 0x62ea60be, - &data_vio->chunk_name); - - enqueue_data_vio_callback(data_vio); -} - -/**********************************************************************/ -void hash_data_vio(struct data_vio *data_vio) -{ - launch_data_vio_on_cpu_queue(data_vio, - vdo_hash_data_work, - NULL, - CPU_Q_ACTION_HASH_BLOCK); -} - -/**********************************************************************/ -void check_data_vio_for_duplication(struct data_vio *data_vio) -{ - ASSERT_LOG_ONLY(!data_vio->is_zero_block, - "zero block not checked for duplication"); - ASSERT_LOG_ONLY(data_vio->new_mapped.state != VDO_MAPPING_STATE_UNMAPPED, - "discard not checked for duplication"); - - if (data_vio_has_allocation(data_vio)) { - post_vdo_dedupe_advice(data_vio); - } else { - // This block has not actually been written (presumably because - // we are full), so attempt to dedupe without posting bogus - // advice. - query_vdo_dedupe_advice(data_vio); - } -} - -/**********************************************************************/ -void vdo_update_dedupe_index(struct data_vio *data_vio) -{ - update_vdo_dedupe_advice(data_vio); -} - -/** - * Implements buffer_free_function. - **/ -static void free_pooled_data_vio(void *data) -{ - free_data_vio((struct data_vio *) UDS_FORGET(data)); -} - -/** - * Allocate a data_vio. This function is the internals of - * make_pooled_data_vio(). 
- * - * @param data_vio_ptr A pointer to hold the newly allocated data_vio - * - * @return VDO_SUCCESS or an error - **/ -static int allocate_pooled_data_vio(struct data_vio **data_vio_ptr) -{ - struct data_vio *data_vio; - struct vio *vio; - int result = UDS_ALLOCATE(1, struct data_vio, __func__, &data_vio); - - if (result != VDO_SUCCESS) { - return uds_log_error_strerror(result, - "data_vio allocation failure"); - } - - STATIC_ASSERT(VDO_BLOCK_SIZE <= PAGE_SIZE); - result = uds_allocate_memory(VDO_BLOCK_SIZE, 0, "vio data", - &data_vio->data_block); - if (result != VDO_SUCCESS) { - free_data_vio(UDS_FORGET(data_vio)); - return uds_log_error_strerror(result, - "data_vio data allocation failure"); - } - - vio = data_vio_as_vio(data_vio); - result = vdo_create_bio(&vio->bio); - if (result != VDO_SUCCESS) { - free_data_vio(UDS_FORGET(data_vio)); - return uds_log_error_strerror(result, - "data_vio data bio allocation failure"); - } - - result = uds_allocate_memory(VDO_BLOCK_SIZE, 0, "vio read buffer", - &data_vio->read_block.buffer); - if (result != VDO_SUCCESS) { - free_data_vio(UDS_FORGET(data_vio)); - return uds_log_error_strerror(result, - "data_vio read allocation failure"); - } - - result = uds_allocate_memory(VDO_BLOCK_SIZE, 0, "vio scratch", - &data_vio->scratch_block); - if (result != VDO_SUCCESS) { - free_data_vio(UDS_FORGET(data_vio)); - return uds_log_error_strerror(result, - "data_vio scratch allocation failure"); - } - - *data_vio_ptr = data_vio; - return VDO_SUCCESS; -} - -/** - * Implements buffer_allocate_function. - **/ -static int make_pooled_data_vio(void **data_ptr) -{ - struct data_vio *data_vio = NULL; - int result = allocate_pooled_data_vio(&data_vio); - if (result != VDO_SUCCESS) { - free_pooled_data_vio(data_vio); - return result; - } - - *data_ptr = data_vio; - return VDO_SUCCESS; -} - -/** - * Dump out the waiters on each data_vio in the data_vio buffer pool. - * - * @param queue The queue to check (logical or physical) - * @param wait_on The label to print for queue (logical or physical) - **/ -static void dump_vio_waiters(struct wait_queue *queue, char *wait_on) -{ - struct waiter *waiter, *first = get_first_waiter(queue); - struct data_vio *data_vio; - - if (first == NULL) { - return; - } - - data_vio = waiter_as_data_vio(first); - - uds_log_info(" %s is locked. Waited on by: vio %px pbn %llu lbn %llu d-pbn %llu lastOp %s", - wait_on, data_vio, get_data_vio_allocation(data_vio), - data_vio->logical.lbn, data_vio->duplicate.pbn, - get_data_vio_operation_name(data_vio)); - - - for (waiter = first->next_waiter; waiter != first; - waiter = waiter->next_waiter) { - data_vio = waiter_as_data_vio(waiter); - uds_log_info(" ... and : vio %px pbn %llu lbn %llu d-pbn %llu lastOp %s", - data_vio, get_data_vio_allocation(data_vio), - data_vio->logical.lbn, data_vio->duplicate.pbn, - get_data_vio_operation_name(data_vio)); - } -} - -/** - * Encode various attributes of a data_vio as a string of one-character flags - * for dump logging. This encoding is for logging brevity: - * - * R => vio completion result not VDO_SUCCESS - * W => vio is on a wait queue - * D => vio is a duplicate - * - *
The common case of no flags set will result in an empty, null-terminated - * buffer. If any flags are encoded, the first character in the string will be - * a space character. - * - * @param data_vio The vio to encode - * @param buffer The buffer to receive a null-terminated string of encoded - * flag character - **/ -static void encode_vio_dump_flags(struct data_vio *data_vio, char buffer[8]) -{ - char *p_flag = buffer; - *p_flag++ = ' '; - if (data_vio_as_completion(data_vio)->result != VDO_SUCCESS) { - *p_flag++ = 'R'; - } - if (data_vio_as_allocating_vio(data_vio)->waiter.next_waiter != NULL) { - *p_flag++ = 'W'; - } - if (data_vio->is_duplicate) { - *p_flag++ = 'D'; - } - if (p_flag == &buffer[1]) { - // No flags, so remove the blank space. - p_flag = buffer; - } - *p_flag = '\0'; -} - -/** - * Dump out info on a data_vio from the data_vio pool. - * - *
Implements buffer_dump_function. - * - * @param data The data_vio to dump - **/ -static void dump_pooled_data_vio(void *data) -{ - struct data_vio *data_vio = (struct data_vio *) data; - - /* - * This just needs to be big enough to hold a queue (thread) name - * and a function name (plus a separator character and NUL). The - * latter is limited only by taste. - * - * In making this static, we're assuming only one "dump" will run at - * a time. If more than one does run, the log output will be garbled - * anyway. - */ - static char vio_work_item_dump_buffer[100 + MAX_QUEUE_NAME_LEN]; - // Another static buffer... - // log10(256) = 2.408+, round up: - enum { DIGITS_PER_UINT64_T = (int) (1 + 2.41 * sizeof(uint64_t)) }; - static char vio_block_number_dump_buffer[sizeof("P L D") - + 3 * DIGITS_PER_UINT64_T]; - static char vio_flush_generation_buffer[sizeof(" FG") - + DIGITS_PER_UINT64_T] = ""; - static char flags_dump_buffer[8]; - - /* - * We're likely to be logging a couple thousand of these lines, and - * in some circumstances syslogd may have trouble keeping up, so - * keep it BRIEF rather than user-friendly. - */ - dump_work_item_to_buffer(work_item_from_data_vio(data_vio), - vio_work_item_dump_buffer, - sizeof(vio_work_item_dump_buffer)); - if (data_vio->is_duplicate) { - snprintf(vio_block_number_dump_buffer, - sizeof(vio_block_number_dump_buffer), - "P%llu L%llu D%llu", - get_data_vio_allocation(data_vio), - data_vio->logical.lbn, - data_vio->duplicate.pbn); - } else if (data_vio_has_allocation(data_vio)) { - snprintf(vio_block_number_dump_buffer, - sizeof(vio_block_number_dump_buffer), - "P%llu L%llu", - get_data_vio_allocation(data_vio), - data_vio->logical.lbn); - } else { - snprintf(vio_block_number_dump_buffer, - sizeof(vio_block_number_dump_buffer), "L%llu", - data_vio->logical.lbn); - } - - if (data_vio->flush_generation != 0) { - snprintf(vio_flush_generation_buffer, - sizeof(vio_flush_generation_buffer), " FG%llu", - data_vio->flush_generation); - } - - // Encode vio attributes as a string of one-character flags, usually - // empty. 
- encode_vio_dump_flags(data_vio, flags_dump_buffer); - - uds_log_info(" vio %px %s%s %s %s%s", data_vio, - vio_block_number_dump_buffer, vio_flush_generation_buffer, - get_data_vio_operation_name(data_vio), - vio_work_item_dump_buffer, - flags_dump_buffer); - // might want info on: wantUDSAnswer / operation / status - // might want info on: bio / bios_merged - - dump_vio_waiters(&data_vio->logical.waiters, "lbn"); - - // might want to dump more info from vio here -} - -/**********************************************************************/ -int make_data_vio_buffer_pool(uint32_t pool_size, - struct buffer_pool **buffer_pool_ptr) -{ - return make_buffer_pool("data_vio pool", - pool_size, - make_pooled_data_vio, - free_pooled_data_vio, - dump_pooled_data_vio, - buffer_pool_ptr); -} - -/**********************************************************************/ -struct data_location vdo_get_dedupe_advice(const struct dedupe_context *context) -{ - struct data_vio *data_vio = container_of(context, - struct data_vio, - dedupe_context); - return (struct data_location) { - .state = data_vio->new_mapped.state, - .pbn = data_vio->new_mapped.pbn, - }; -} - -/**********************************************************************/ -void vdo_set_dedupe_advice(struct dedupe_context *context, - const struct data_location *advice) -{ - receive_data_vio_dedupe_advice(container_of(context, - struct data_vio, - dedupe_context), - advice); -} diff --git a/vdo/dataKVIO.h b/vdo/dataKVIO.h deleted file mode 100644 index 167ef651..00000000 --- a/vdo/dataKVIO.h +++ /dev/null @@ -1,190 +0,0 @@ -/* - * Copyright Red Hat - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/sulfur/src/c++/vdo/kernel/dataKVIO.h#23 $ - */ - -#ifndef DATA_KVIO_H -#define DATA_KVIO_H - -#include - -#include "uds.h" - -#include "dataVIO.h" - -#include "kernelVDO.h" -#include "kvio.h" - -/** - * Returns a pointer to the data_vio wrapping a work item. - * - * @param item the work item - * - * @return the data_vio - **/ -static inline struct data_vio * -work_item_as_data_vio(struct vdo_work_item *item) -{ - return vio_as_data_vio(work_item_as_vio(item)); -} - -/** - * Get the work_item from a data_vio. - * - * @param data_vio The data_vio - * - * @return the data_vio's work item - **/ -static inline struct vdo_work_item * -work_item_from_data_vio(struct data_vio *data_vio) -{ - return work_item_from_vio(data_vio_as_vio(data_vio)); -} - -/** - * Set up and enqueue a data_vio on the CPU queue. 
- * - * @param data_vio The data_vio to set up - * @param work The function pointer to execute - * @param stats_function A function pointer to record for stats, or NULL - * @param action Action code, mapping to a relative priority - **/ -static inline void launch_data_vio_on_cpu_queue(struct data_vio *data_vio, - vdo_work_function work, - void *stats_function, - unsigned int action) -{ - struct vio *vio = data_vio_as_vio(data_vio); - launch_vio(vio, work, stats_function, action, vio->vdo->cpu_queue); -} - -/** - * Set up and enqueue a data_vio on the bio Ack queue. - * - * @param data_vio The data_vio to set up - * @param work The function pointer to execute - * @param stats_function A function pointer to record for stats, or NULL - * @param action Action code, mapping to a relative priority - **/ -static inline void -launch_data_vio_on_bio_ack_queue(struct data_vio *data_vio, - vdo_work_function work, - void *stats_function, - unsigned int action) -{ - struct vio *vio = data_vio_as_vio(data_vio); - launch_vio(vio, work, stats_function, action, vio->vdo->bio_ack_queue); -} - -/** - * Move a data_vio back to the base threads. - * - * @param data_vio The data_vio to enqueue - **/ -static inline void enqueue_data_vio_callback(struct data_vio *data_vio) -{ - enqueue_vio_callback(data_vio_as_vio(data_vio)); -} - -/** - * Associate a vio with a bio passed in from the block layer, and start - * processing the vio. - * - * If setting up a vio fails, a message is logged, and the limiter permits - * (request and maybe discard) released, but the caller is responsible for - * disposing of the bio. - * - * @param vdo The vdo - * @param bio The bio for which to create vio - * @param arrival_jiffies The time (in jiffies) when the external request - * entered the device mapbio function - * @param has_discard_permit Whether we got a permit from the discard - * limiter of the kernel layer - * - * @return VDO_SUCCESS or a system error code - **/ -int __must_check vdo_launch_data_vio_from_bio(struct vdo *vdo, - struct bio *bio, - uint64_t arrival_jiffies, - bool has_discard_permit); - -/** - * Return a batch of data_vio objects to the pool. - * - *
Implements batch_processor_callback. - * - * @param batch The batch processor - * @param closure The kernal layer - **/ -void return_data_vio_batch_to_pool(struct batch_processor *batch, - void *closure); - -/** - * Fetch the data for a block from storage. The fetched data will be - * uncompressed when the callback is called, and the result of the read - * operation will be stored in the read_block's status field. On success, - * the data will be in the read_block's data pointer. - * - * @param data_vio The data_vio to read a block in for - * @param location The physical block number to read from - * @param mapping_state The mapping state of the block to read - * @param action The bio queue action - * @param callback The function to call when the read is done - **/ -void vdo_read_block(struct data_vio *data_vio, - physical_block_number_t location, - enum block_mapping_state mapping_state, - enum bio_q_action action, - vdo_action *callback); - -/** - * Allocate a buffer pool of data_vio objects. - * - * @param [in] pool_size The number of data_vio objects in the pool - * @param [out] buffer_pool_ptr A pointer to hold the new buffer pool - * - * @return VDO_SUCCESS or an error - **/ -int __must_check -make_data_vio_buffer_pool(uint32_t pool_size, - struct buffer_pool **buffer_pool_ptr); - -/** - * Get the state needed to generate UDS metadata from the data_vio - * associated with a dedupe_context. - * - * @param context The dedupe_context - * - * @return the advice to store in the UDS index - **/ -struct data_location __must_check -vdo_get_dedupe_advice(const struct dedupe_context *context); - -/** - * Set the result of a dedupe query for the data_vio associated with a - * dedupe_context. - * - * @param context The context receiving advice - * @param advice A data location at which the chunk named in the context - * might be stored (will be NULL if no advice was found) - **/ -void vdo_set_dedupe_advice(struct dedupe_context *context, - const struct data_location *advice); - -#endif /* DATA_KVIO_H */ diff --git a/vdo/dataVIO.c b/vdo/dataVIO.c deleted file mode 100644 index 0eb00414..00000000 --- a/vdo/dataVIO.c +++ /dev/null @@ -1,404 +0,0 @@ -/* - * Copyright Red Hat - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. 
- * - * $Id: //eng/vdo-releases/sulfur/src/c++/vdo/base/dataVIO.c#41 $ - */ - -#include "dataVIO.h" - -#include "logger.h" -#include "memoryAlloc.h" -#include "permassert.h" - -#include "bio.h" -#include "blockMap.h" -#include "compressionState.h" -#include "extent.h" -#include "logicalZone.h" -#include "threadConfig.h" -#include "vdoInternal.h" -#include "vioRead.h" -#include "vioWrite.h" - -static const char *ASYNC_OPERATION_NAMES[] = { - "launch", - "acknowledge_write", - "acquire_vdo_hash_lock", - "attempt_logical_block_lock", - "lock_duplicate_pbn", - "check_for_duplication", - "compress_data_vio", - "find_block_map_slot", - "get_mapped_block/for_read", - "get_mapped_block/for_dedupe", - "get_mapped_block/for_write", - "hash_data_vio", - "journal_decrement_for_dedupe", - "journal_decrement_for_write", - "journal_increment_for_compression", - "journal_increment_for_dedupe", - "journal_increment_for_write", - "journal_mapping_for_compression", - "journal_mapping_for_dedupe", - "journal_mapping_for_write", - "journal_unmapping_for_dedupe", - "journal_unmapping_for_write", - "vdo_attempt_packing", - "put_mapped_block/for_write", - "put_mapped_block/for_dedupe", - "read_data_vio", - "update_dedupe_index", - "verify_duplication", - "write_data_vio", -}; - -/** - * Initialize the LBN lock of a data_vio. In addition to recording the LBN on - * which the data_vio will operate, it will also find the logical zone - * associated with the LBN. - * - * @param data_vio The data_vio to initialize - * @param lbn The lbn on which the data_vio will operate - **/ -static void initialize_lbn_lock(struct data_vio *data_vio, - logical_block_number_t lbn) -{ - struct vdo *vdo = get_vdo_from_data_vio(data_vio); - struct lbn_lock *lock = &data_vio->logical; - - lock->lbn = lbn; - lock->locked = false; - initialize_wait_queue(&lock->waiters); - - lock->zone = get_vdo_logical_zone(vdo->logical_zones, - vdo_compute_logical_zone(data_vio)); -} - -/**********************************************************************/ -void prepare_data_vio(struct data_vio *data_vio, - logical_block_number_t lbn, - enum vio_operation operation, - bool is_trim, - vdo_action *callback) -{ - struct vio *vio = data_vio_as_vio(data_vio); - - // Clearing the tree lock must happen before initializing the LBN lock, - // which also adds information to the tree lock. - memset(&data_vio->tree_lock, 0, sizeof(data_vio->tree_lock)); - initialize_lbn_lock(data_vio, lbn); - INIT_LIST_HEAD(&data_vio->hash_lock_entry); - INIT_LIST_HEAD(&data_vio->write_entry); - - vio_reset_allocation(data_vio_as_allocating_vio(data_vio)); - - data_vio->is_duplicate = false; - - memset(&data_vio->chunk_name, 0, sizeof(data_vio->chunk_name)); - memset(&data_vio->duplicate, 0, sizeof(data_vio->duplicate)); - - vio->operation = operation; - vio->callback = callback; - - data_vio->mapped.state = VDO_MAPPING_STATE_UNCOMPRESSED; - data_vio->new_mapped.state = (is_trim ? 
VDO_MAPPING_STATE_UNMAPPED : - VDO_MAPPING_STATE_UNCOMPRESSED); - reset_vdo_completion(vio_as_completion(vio)); - set_data_vio_logical_callback(data_vio, - vdo_attempt_logical_block_lock); -} - -/**********************************************************************/ -void complete_data_vio(struct vdo_completion *completion) -{ - struct data_vio *data_vio = as_data_vio(completion); - if (completion->result != VDO_SUCCESS) { - struct vio *vio = data_vio_as_vio(data_vio); - char vio_operation[VDO_VIO_OPERATION_DESCRIPTION_MAX_LENGTH]; - get_vio_operation_description(vio, vio_operation); - update_vio_error_stats(vio, - "Completing %s vio for LBN %llu with error after %s", - vio_operation, - (unsigned long long) data_vio->logical.lbn, - get_data_vio_operation_name(data_vio)); - } - - if (is_read_data_vio(data_vio)) { - cleanup_read_data_vio(data_vio); - } else { - cleanup_write_data_vio(data_vio); - } -} - -/**********************************************************************/ -void finish_data_vio(struct data_vio *data_vio, int result) -{ - struct vdo_completion *completion = data_vio_as_completion(data_vio); - set_vdo_completion_result(completion, result); - complete_data_vio(completion); -} - -/**********************************************************************/ -const char *get_data_vio_operation_name(struct data_vio *data_vio) -{ - STATIC_ASSERT((MAX_VIO_ASYNC_OPERATION_NUMBER - - MIN_VIO_ASYNC_OPERATION_NUMBER) == - COUNT_OF(ASYNC_OPERATION_NAMES)); - - return ((data_vio->last_async_operation < - MAX_VIO_ASYNC_OPERATION_NUMBER) ? - ASYNC_OPERATION_NAMES[data_vio->last_async_operation] : - "unknown async operation"); -} - -/**********************************************************************/ -void receive_data_vio_dedupe_advice(struct data_vio *data_vio, - const struct data_location *advice) -{ - /* - * NOTE: this is called on non-base-code threads. Be very careful to - * not do anything here that needs a base code thread-local variable, - * such as trying to get the current thread ID, or that does a lot of - * work. - */ - - struct vdo *vdo = get_vdo_from_data_vio(data_vio); - struct zoned_pbn duplicate = - vdo_validate_dedupe_advice(vdo, advice, data_vio->logical.lbn); - set_data_vio_duplicate_location(data_vio, duplicate); -} - -/**********************************************************************/ -void set_data_vio_duplicate_location(struct data_vio *data_vio, - const struct zoned_pbn source) -{ - data_vio->is_duplicate = (source.pbn != VDO_ZERO_BLOCK); - data_vio->duplicate = source; -} - -/**********************************************************************/ -void clear_data_vio_mapped_location(struct data_vio *data_vio) -{ - data_vio->mapped = (struct zoned_pbn){ - .state = VDO_MAPPING_STATE_UNMAPPED, - }; -} - -/**********************************************************************/ -int set_data_vio_mapped_location(struct data_vio *data_vio, - physical_block_number_t pbn, - enum block_mapping_state state) -{ - struct physical_zone *zone; - int result = get_physical_zone(get_vdo_from_data_vio(data_vio), pbn, - &zone); - if (result != VDO_SUCCESS) { - return result; - } - - data_vio->mapped = (struct zoned_pbn){ - .pbn = pbn, - .state = state, - .zone = zone, - }; - return VDO_SUCCESS; -} - -/** - * Launch a request which has acquired an LBN lock. 
- * - * @param data_vio The data_vio which has just acquired a lock - **/ -static void launch_locked_request(struct data_vio *data_vio) -{ - data_vio->logical.locked = true; - - if (is_write_data_vio(data_vio)) { - launch_write_data_vio(data_vio); - } else { - launch_read_data_vio(data_vio); - } -} - -/**********************************************************************/ -void vdo_attempt_logical_block_lock(struct vdo_completion *completion) -{ - struct data_vio *data_vio = as_data_vio(completion); - struct lbn_lock *lock = &data_vio->logical; - struct vdo *vdo = get_vdo_from_data_vio(data_vio); - struct data_vio *lock_holder; - int result; - - assert_data_vio_in_logical_zone(data_vio); - - if (data_vio->logical.lbn >= vdo->states.vdo.config.logical_blocks) { - finish_data_vio(data_vio, VDO_OUT_OF_RANGE); - return; - } - - result = int_map_put(get_vdo_logical_zone_lbn_lock_map(lock->zone), - lock->lbn, data_vio, false, - (void **) &lock_holder); - if (result != VDO_SUCCESS) { - finish_data_vio(data_vio, result); - return; - } - - if (lock_holder == NULL) { - // We got the lock - launch_locked_request(data_vio); - return; - } - - result = ASSERT(lock_holder->logical.locked, - "logical block lock held"); - if (result != VDO_SUCCESS) { - finish_data_vio(data_vio, result); - return; - } - - /* - * If the new request is a pure read request (not read-modify-write) - * and the lock_holder is writing and has received an allocation - * (VDO-2683), service the read request immediately by copying data - * from the lock_holder to avoid having to flush the write out of the - * packer just to prevent the read from waiting indefinitely. If the - * lock_holder does not yet have an allocation, prevent it from - * blocking in the packer and wait on it. - */ - if (is_read_data_vio(data_vio) && - READ_ONCE(lock_holder->allocation_succeeded)) { - vdo_copy_data(lock_holder, data_vio); - finish_data_vio(data_vio, VDO_SUCCESS); - return; - } - - data_vio->last_async_operation = VIO_ASYNC_OP_ATTEMPT_LOGICAL_BLOCK_LOCK; - result = enqueue_data_vio(&lock_holder->logical.waiters, - data_vio); - if (result != VDO_SUCCESS) { - finish_data_vio(data_vio, result); - return; - } - - // Prevent writes and read-modify-writes from blocking indefinitely on - // lock holders in the packer. - if (!is_read_data_vio(lock_holder) && - cancel_vio_compression(lock_holder)) { - data_vio->compression.lock_holder = lock_holder; - launch_data_vio_packer_callback(data_vio, - remove_lock_holder_from_vdo_packer); - } -} - -/** - * Release an uncontended LBN lock. - * - * @param data_vio The data_vio holding the lock - **/ -static void release_lock(struct data_vio *data_vio) -{ - struct lbn_lock *lock = &data_vio->logical; - struct int_map *lock_map = - get_vdo_logical_zone_lbn_lock_map(lock->zone); - struct data_vio *lock_holder; - - if (!lock->locked) { - // The lock is not locked, so it had better not be registered - // in the lock map. - struct data_vio *lock_holder = int_map_get(lock_map, lock->lbn); - ASSERT_LOG_ONLY((data_vio != lock_holder), - "no logical block lock held for block %llu", - (unsigned long long) lock->lbn); - return; - } - - // Remove the lock from the logical block lock map, releasing the lock. 
- lock_holder = int_map_remove(lock_map, lock->lbn); - ASSERT_LOG_ONLY((data_vio == lock_holder), - "logical block lock mismatch for block %llu", - (unsigned long long) lock->lbn); - lock->locked = false; - return; -} - -/**********************************************************************/ -void vdo_release_logical_block_lock(struct data_vio *data_vio) -{ - struct data_vio *lock_holder, *next_lock_holder; - struct lbn_lock *lock = &data_vio->logical; - int result; - - assert_data_vio_in_logical_zone(data_vio); - if (!has_waiters(&data_vio->logical.waiters)) { - release_lock(data_vio); - return; - } - - ASSERT_LOG_ONLY(lock->locked, "lbn_lock with waiters is not locked"); - - // Another data_vio is waiting for the lock, so just transfer it in a - // single lock map operation - next_lock_holder = - waiter_as_data_vio(dequeue_next_waiter(&lock->waiters)); - - // Transfer the remaining lock waiters to the next lock holder. - transfer_all_waiters(&lock->waiters, - &next_lock_holder->logical.waiters); - - result = int_map_put(get_vdo_logical_zone_lbn_lock_map(lock->zone), - lock->lbn, next_lock_holder, true, - (void **) &lock_holder); - if (result != VDO_SUCCESS) { - finish_data_vio(next_lock_holder, result); - return; - } - - ASSERT_LOG_ONLY((lock_holder == data_vio), - "logical block lock mismatch for block %llu", - (unsigned long long) lock->lbn); - lock->locked = false; - - /* - * If there are still waiters, other data_vios must be trying to get - * the lock we just transferred. We must ensure that the new lock - * holder doesn't block in the packer. - */ - if (has_waiters(&next_lock_holder->logical.waiters)) { - cancel_vio_compression(next_lock_holder); - } - - // Avoid stack overflow on lock transfer. - // XXX: this is only an issue in the 1 thread config. - data_vio_as_completion(next_lock_holder)->requeue = true; - launch_locked_request(next_lock_holder); -} - -/**********************************************************************/ -void free_data_vio(struct data_vio *data_vio) -{ - if (data_vio == NULL) { - return; - } - - vdo_free_bio(UDS_FORGET(data_vio_as_vio(data_vio)->bio)); - UDS_FREE(UDS_FORGET(data_vio->read_block.buffer)); - UDS_FREE(UDS_FORGET(data_vio->data_block)); - UDS_FREE(UDS_FORGET(data_vio->scratch_block)); - UDS_FREE(UDS_FORGET(data_vio)); -} diff --git a/vdo/dataVIO.h b/vdo/dataVIO.h deleted file mode 100644 index b2ad7cb9..00000000 --- a/vdo/dataVIO.h +++ /dev/null @@ -1,1141 +0,0 @@ -/* - * Copyright Red Hat - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. 
- * - * $Id: //eng/vdo-releases/sulfur/src/c++/vdo/base/dataVIO.h#50 $ - */ - -#ifndef DATA_VIO_H -#define DATA_VIO_H - -#include -#include - -#include "permassert.h" - -#include "allocatingVIO.h" -#include "blockMapEntry.h" -#include "blockMappingState.h" -#include "constants.h" -#include "hashZone.h" -#include "journalPoint.h" -#include "logicalZone.h" -#include "referenceOperation.h" -#include "threadConfig.h" -#include "types.h" -#include "vdo.h" -#include "vdoPageCache.h" -#include "vio.h" -#include "waitQueue.h" - -/** - * Codes for describing the last asynchronous operation performed on a vio. - **/ -enum async_operation_number { - MIN_VIO_ASYNC_OPERATION_NUMBER = 0, - VIO_ASYNC_OP_LAUNCH = MIN_VIO_ASYNC_OPERATION_NUMBER, - VIO_ASYNC_OP_ACKNOWLEDGE_WRITE, - VIO_ASYNC_OP_ACQUIRE_VDO_HASH_LOCK, - VIO_ASYNC_OP_ATTEMPT_LOGICAL_BLOCK_LOCK, - VIO_ASYNC_OP_LOCK_DUPLICATE_PBN, - VIO_ASYNC_OP_CHECK_FOR_DUPLICATION, - VIO_ASYNC_OP_COMPRESS_DATA_VIO, - VIO_ASYNC_OP_FIND_BLOCK_MAP_SLOT, - VIO_ASYNC_OP_GET_MAPPED_BLOCK_FOR_READ, - VIO_ASYNC_OP_GET_MAPPED_BLOCK_FOR_DEDUPE, - VIO_ASYNC_OP_GET_MAPPED_BLOCK_FOR_WRITE, - VIO_ASYNC_OP_HASH_DATA_VIO, - VIO_ASYNC_OP_JOURNAL_DECREMENT_FOR_DEDUPE, - VIO_ASYNC_OP_JOURNAL_DECREMENT_FOR_WRITE, - VIO_ASYNC_OP_JOURNAL_INCREMENT_FOR_COMPRESSION, - VIO_ASYNC_OP_JOURNAL_INCREMENT_FOR_DEDUPE, - VIO_ASYNC_OP_JOURNAL_INCREMENT_FOR_WRITE, - VIO_ASYNC_OP_JOURNAL_MAPPING_FOR_COMPRESSION, - VIO_ASYNC_OP_JOURNAL_MAPPING_FOR_DEDUPE, - VIO_ASYNC_OP_JOURNAL_MAPPING_FOR_WRITE, - VIO_ASYNC_OP_JOURNAL_UNMAPPING_FOR_DEDUPE, - VIO_ASYNC_OP_JOURNAL_UNMAPPING_FOR_WRITE, - VIO_ASYNC_OP_ATTEMPT_PACKING, - VIO_ASYNC_OP_PUT_MAPPED_BLOCK_FOR_WRITE, - VIO_ASYNC_OP_PUT_MAPPED_BLOCK_FOR_DEDUPE, - VIO_ASYNC_OP_READ_DATA_VIO, - VIO_ASYNC_OP_UPDATE_DEDUPE_INDEX, - VIO_ASYNC_OP_VERIFY_DUPLICATION, - VIO_ASYNC_OP_WRITE_DATA_VIO, - MAX_VIO_ASYNC_OPERATION_NUMBER, -} __packed; - -/* - * An LBN lock. - */ -struct lbn_lock { - /* The LBN being locked */ - logical_block_number_t lbn; - /* Whether the lock is locked */ - bool locked; - /* The queue of waiters for the lock */ - struct wait_queue waiters; - /* The logical zone of the LBN */ - struct logical_zone *zone; -}; - -/** - * A position in the arboreal block map at a specific level. - **/ -struct block_map_tree_slot { - page_number_t page_index; - struct block_map_slot block_map_slot; -}; - -/* - * Fields for using the arboreal block map. - */ -struct tree_lock { - /* The current height at which this data_vio is operating */ - height_t height; - /* The block map tree for this LBN */ - root_count_t root_index; - /* Whether we hold a page lock */ - bool locked; - /* The thread on which to run the callback */ - thread_id_t thread_id; - /* The function to call after looking up a block map slot */ - vdo_action *callback; - /* The key for the lock map */ - uint64_t key; - /* - * The queue of waiters for the page this vio is allocating or loading - */ - struct wait_queue waiters; - /* The block map tree slots for this LBN */ - struct block_map_tree_slot tree_slots[VDO_BLOCK_MAP_TREE_HEIGHT + 1]; -}; - -struct compression_state { - /* - * The current compression state of this vio. This field contains a - * value which consists of a vio_compression_state possibly ORed with a - * flag indicating that a request has been made to cancel (or prevent) - * compression for this vio. - * - * This field should be accessed through the - * get_vio_compression_state() and set_vio_compression_state() methods. - * It should not be accessed directly. 
- */ - atomic_t state; - - /* The compressed size of this block */ - uint16_t size; - - /* - * The packer input or output bin slot which holds the enclosing - * data_vio - */ - slot_number_t slot; - - /* - * The packer input bin to which the enclosing data_vio has been - * assigned - */ - struct input_bin *bin; - - /* A pointer to the compressed form of this block */ - char *data; - - /* - * A vio which is blocked in the packer while holding a lock this vio - * needs. - */ - struct data_vio *lock_holder; -}; - -/* Dedupe support */ -struct dedupe_context { - struct uds_request uds_request; - struct list_head pending_list; - uint64_t submission_jiffies; - atomic_t request_state; - int status; - bool is_pending; -}; - -struct read_block { - /** - * A pointer to a block that holds the data from the last read - * operation. - **/ - char *data; - /** - * Temporary storage for doing reads from the underlying device. - **/ - char *buffer; - /** - * Callback to invoke after completing the read I/O operation. - **/ - vdo_action *callback; - /** - * Mapping state passed to vdo_read_block(), used to determine whether - * the data must be uncompressed. - **/ - enum block_mapping_state mapping_state; - /** - * The result code of the read attempt. - **/ - int status; -}; - -/** - * A vio for processing user data requests. - **/ -struct data_vio { - /* The underlying struct allocating_vio */ - struct allocating_vio allocating_vio; - - /* The logical block of this request */ - struct lbn_lock logical; - - /* The state for traversing the block map tree */ - struct tree_lock tree_lock; - - /* The current partition address of this block */ - struct zoned_pbn mapped; - - /** The hash of this vio (if not zero) */ - struct uds_chunk_name chunk_name; - - /* Used for logging and debugging */ - enum async_operation_number last_async_operation; - - /* The operation to record in the recovery and slab journals */ - struct reference_operation operation; - - /* Whether this vio is a read-and-write vio */ - bool is_partial_write; - - /* Whether this vio contains all zeros */ - bool is_zero_block; - - /* Whether this vio write is a duplicate */ - bool is_duplicate; - - /* - * Whether this vio has received an allocation. This field is examined - * from threads not in the allocation zone. - */ - bool allocation_succeeded; - - /* - * The new partition address of this block after the vio write - * completes - */ - struct zoned_pbn new_mapped; - - /* - * The hash zone responsible for the chunk name (NULL if is_zero_block) - */ - struct hash_zone *hash_zone; - - /* - * The lock this vio holds or shares with other vios with the same data - */ - struct hash_lock *hash_lock; - - /* - * All data_vios sharing a hash lock are kept in a list linking these - * list entries - */ - struct list_head hash_lock_entry; - - /* - * The block number in the partition of the UDS deduplication advice - */ - struct zoned_pbn duplicate; - - /* - * The sequence number of the recovery journal block containing the - * increment entry for this vio. 
- */ - sequence_number_t recovery_sequence_number; - - /* - * The point in the recovery journal where this write last made an - * entry - */ - struct journal_point recovery_journal_point; - - /* The list of vios in user initiated write requests */ - struct list_head write_entry; - - /* - * A flag indicating that a data write vio has a flush generation lock - */ - bool has_flush_generation_lock; - - /* The generation number of the VDO that this vio belongs to */ - sequence_number_t flush_generation; - - /* The completion to use for fetching block map pages for this vio */ - struct vdo_page_completion page_completion; - - /* All of the fields necessary for the compression path */ - struct compression_state compression; - - /* The user bio that initiated this VIO */ - struct bio *user_bio; - - /* partial block support */ - block_size_t offset; - bool is_partial; - - /* discard support */ - bool has_discard_permit; - uint32_t remaining_discard; - - // Fields beyond this point will not be reset when a pooled data_vio - // is reused. - - /* Dedupe */ - struct dedupe_context dedupe_context; - - /** - * A copy of user data written, so we can do additional processing - * (dedupe, compression) after acknowledging the I/O operation and - * thus losing access to the original data. - * - * Also used as buffer space for read-modify-write cycles when - * emulating smaller-than-blockSize I/O operations. - **/ - char *data_block; - /** A block used as output during compression or uncompression */ - char *scratch_block; - /* For data and verification reads */ - struct read_block read_block; -}; - -/** - * Convert an allocating_vio to a data_vio. - * - * @param allocating_vio The allocating_vio to convert - * - * @return The allocating_vio as a data_vio - **/ -static inline struct data_vio * -allocating_vio_as_data_vio(struct allocating_vio *allocating_vio) -{ - ASSERT_LOG_ONLY((allocating_vio_as_vio(allocating_vio)->type == - VIO_TYPE_DATA), - "allocating_vio is a struct data_vio"); - return container_of(allocating_vio, struct data_vio, allocating_vio); -} - -/** - * Convert a vio to a data_vio. - * - * @param vio The vio to convert - * - * @return The vio as a data_vio - **/ -static inline struct data_vio *vio_as_data_vio(struct vio *vio) -{ - ASSERT_LOG_ONLY((vio->type == VIO_TYPE_DATA), "vio is a data_vio"); - return container_of(container_of(vio, struct allocating_vio, vio), - struct data_vio, - allocating_vio); -} - -/** - * Convert a data_vio to an allocating_vio. - * - * @param data_vio The data_vio to convert - * - * @return The data_vio as an allocating_vio - **/ -static inline -struct allocating_vio *data_vio_as_allocating_vio(struct data_vio *data_vio) -{ - return &data_vio->allocating_vio; -} - -/** - * Convert a data_vio to a vio. - * - * @param data_vio The data_vio to convert - * - * @return The data_vio as a vio - **/ -static inline struct vio *data_vio_as_vio(struct data_vio *data_vio) -{ - return allocating_vio_as_vio(data_vio_as_allocating_vio(data_vio)); -} - -/** - * Convert a generic vdo_completion to a data_vio. - * - * @param completion The completion to convert - * - * @return The completion as a data_vio - **/ -static inline struct data_vio *as_data_vio(struct vdo_completion *completion) -{ - return vio_as_data_vio(as_vio(completion)); -} - -/** - * Convert a data_vio to a generic completion. 
- * - * @param data_vio The data_vio to convert - * - * @return The data_vio as a completion - **/ -static inline struct vdo_completion * -data_vio_as_completion(struct data_vio *data_vio) -{ - return allocating_vio_as_completion(data_vio_as_allocating_vio(data_vio)); -} - -/** - * Convert a data_vio to a generic wait queue entry. - * - * @param data_vio The data_vio to convert - * - * @return The data_vio as a wait queue entry - **/ -static inline struct waiter *data_vio_as_waiter(struct data_vio *data_vio) -{ - return allocating_vio_as_waiter(data_vio_as_allocating_vio(data_vio)); -} - -/** - * Convert a data_vio's generic wait queue entry back to the data_vio. - * - * @param waiter The wait queue entry to convert - * - * @return The wait queue entry as a data_vio - **/ -static inline struct data_vio *waiter_as_data_vio(struct waiter *waiter) -{ - if (waiter == NULL) { - return NULL; - } - - return allocating_vio_as_data_vio(waiter_as_allocating_vio(waiter)); -} - -/** - * Check whether a data_vio is a read. - * - * @param data_vio The data_vio to check - **/ -static inline bool is_read_data_vio(struct data_vio *data_vio) -{ - return is_read_vio(data_vio_as_vio(data_vio)); -} - -/** - * Check whether a data_vio is a write. - * - * @param data_vio The data_vio to check - **/ -static inline bool is_write_data_vio(struct data_vio *data_vio) -{ - return is_write_vio(data_vio_as_vio(data_vio)); -} - -/** - * Check whether a data_vio is a compressed block write. - * - * @param data_vio The data_vio to check - * - * @return true if the data_vio is a compressed block write - **/ -static inline bool is_compressed_write_data_vio(struct data_vio *data_vio) -{ - return is_compressed_write_vio(data_vio_as_vio(data_vio)); -} - -/** - * Check whether a data_vio is a trim. - * - * @param data_vio The data_vio to check - * - * @return true if the data_vio is a trim - **/ -static inline bool is_trim_data_vio(struct data_vio *data_vio) -{ - return (data_vio->new_mapped.state == VDO_MAPPING_STATE_UNMAPPED); -} - -/** - * Get the location that should be passed to UDS as the new advice for - * where to find the data written by this data_vio. - * - * @param data_vio The write data_vio that is ready to update UDS - * - * @return a data_location containing the advice to store in UDS - **/ -static inline struct data_location -get_data_vio_new_advice(const struct data_vio *data_vio) -{ - return (struct data_location){ - .pbn = data_vio->new_mapped.pbn, - .state = data_vio->new_mapped.state, - }; -} - -/** - * Get the vdo from a data_vio. - * - * @param data_vio The data_vio from which to get the vdo - * - * @return The vdo to which a data_vio belongs - **/ -static inline struct vdo *get_vdo_from_data_vio(struct data_vio *data_vio) -{ - return data_vio_as_vio(data_vio)->vdo; -} - -/** - * Get the struct thread_config from a data_vio. - * - * @param data_vio The data_vio from which to get the struct thread_config - * - * @return The struct thread_config of the vdo to which a data_vio belongs - **/ -static inline const struct thread_config * -get_thread_config_from_data_vio(struct data_vio *data_vio) -{ - return get_vdo_thread_config(get_vdo_from_data_vio(data_vio)); -} - -/** - * Get the allocation of a data_vio. - * - * @param data_vio The data_vio - * - * @return The allocation of the data_vio - **/ -static inline -physical_block_number_t get_data_vio_allocation(struct data_vio *data_vio) -{ - return data_vio_as_allocating_vio(data_vio)->allocation; -} - -/** - * Check whether a data_vio has an allocation. 
- * - * @param data_vio The data_vio to check - * - * @return true if the data_vio has an allocated block - **/ -static inline bool data_vio_has_allocation(struct data_vio *data_vio) -{ - return (get_data_vio_allocation(data_vio) != VDO_ZERO_BLOCK); -} - -/** - * (Re)initialize a data_vio to have a new logical block number, keeping the - * same parent and other state. This method must be called before using a - * data_vio. - * - * @param data_vio The data_vio to initialize - * @param lbn The logical block number of the data_vio - * @param operation The operation this data_vio will perform - * @param is_trim true if this data_vio is for a trim request - * @param callback The function to call once the vio has completed its - * operation - **/ -void prepare_data_vio(struct data_vio *data_vio, - logical_block_number_t lbn, - enum vio_operation operation, - bool is_trim, - vdo_action *callback); - -/** - * Complete the processing of a data_vio. - * - * @param completion The completion of the vio to complete - **/ -void complete_data_vio(struct vdo_completion *completion); - -/** - * Finish processing a data_vio, possibly due to an error. This function will - * set any error, and then initiate data_vio clean up. - * - * @param data_vio The data_vio to abort - * @param result The result of processing the data_vio - **/ -void finish_data_vio(struct data_vio *data_vio, int result); - -/** - * Continue processing a data_vio that has been waiting for an event, setting - * the result from the event and calling the current callback. - * - * @param data_vio The data_vio to continue - * @param result The current result (will not mask older errors) - **/ -static inline void continue_data_vio(struct data_vio *data_vio, int result) -{ - continue_vdo_completion(data_vio_as_completion(data_vio), result); -} - -/** - * Get the name of the last asynchronous operation performed on a data_vio. - * - * @param data_vio The data_vio in question - * - * @return The name of the last operation performed on the data_vio - **/ -const char * __must_check get_data_vio_operation_name(struct data_vio *data_vio); - -/** - * Add a data_vio to the tail end of a wait queue. The data_vio must not - * already be waiting in a queue. A trace record is also generated for the - * data_vio. - * - * @param queue The queue to which to add the waiter - * @param waiter The data_vio to add to the queue - * - * @return VDO_SUCCESS or an error code - **/ -static inline int __must_check -enqueue_data_vio(struct wait_queue *queue, - struct data_vio *waiter) -{ - return enqueue_waiter(queue, data_vio_as_waiter(waiter)); -} - -/** - * Check that a data_vio is running on the correct thread for its hash zone. - * - * @param data_vio The data_vio in question - **/ -static inline void assert_data_vio_in_hash_zone(struct data_vio *data_vio) -{ - thread_id_t expected = get_vdo_hash_zone_thread_id(data_vio->hash_zone); - thread_id_t thread_id = vdo_get_callback_thread_id(); - // It's odd to use the LBN, but converting the chunk name to hex is a - // bit clunky for an inline, and the LBN better than nothing as an - // identifier. - ASSERT_LOG_ONLY((expected == thread_id), - "data_vio for logical block %llu on thread %u, should be on hash zone thread %u", - (unsigned long long) data_vio->logical.lbn, - thread_id, - expected); -} - -/** - * Set a callback as a hash zone operation. This function presumes that the - * hash_zone field of the data_vio has already been set. 
- * - * @param data_vio The data_vio for which to set the callback - * @param callback The callback to set - **/ -static inline void -set_data_vio_hash_zone_callback(struct data_vio *data_vio, - vdo_action *callback) -{ - set_vdo_completion_callback(data_vio_as_completion(data_vio), - callback, - get_vdo_hash_zone_thread_id(data_vio->hash_zone)); -} - -/** - * Set a callback as a hash zone operation and invoke it immediately. - * - * @param data_vio The data_vio for which to set the callback - * @param callback The callback to set - **/ -static inline void -launch_data_vio_hash_zone_callback(struct data_vio *data_vio, - vdo_action *callback) -{ - set_data_vio_hash_zone_callback(data_vio, callback); - invoke_vdo_completion_callback(data_vio_as_completion(data_vio)); -} - -/** - * Check that a data_vio is running on the correct thread for its logical zone. - * - * @param data_vio The data_vio in question - **/ -static inline void assert_data_vio_in_logical_zone(struct data_vio *data_vio) -{ - thread_id_t expected = - get_vdo_logical_zone_thread_id(data_vio->logical.zone); - thread_id_t thread_id = vdo_get_callback_thread_id(); - ASSERT_LOG_ONLY((expected == thread_id), - "data_vio for logical block %llu on thread %u, should be on thread %u", - (unsigned long long) data_vio->logical.lbn, - thread_id, - expected); -} - -/** - * Set a callback as a logical block operation. This function presumes that the - * logical.zone field of the data_vio has already been set. - * - * @param data_vio The data_vio for which to set the callback - * @param callback The callback to set - **/ -static inline void -set_data_vio_logical_callback(struct data_vio *data_vio, - vdo_action *callback) -{ - set_vdo_completion_callback(data_vio_as_completion(data_vio), - callback, - get_vdo_logical_zone_thread_id(data_vio->logical.zone)); -} - -/** - * Set a callback as a logical block operation and invoke it immediately. - * - * @param data_vio The data_vio for which to set the callback - * @param callback The callback to set - **/ -static inline void -launch_data_vio_logical_callback(struct data_vio *data_vio, - vdo_action *callback) -{ - set_data_vio_logical_callback(data_vio, callback); - invoke_vdo_completion_callback(data_vio_as_completion(data_vio)); -} - -/** - * Check that a data_vio is running on the correct thread for its allocated - * zone. - * - * @param data_vio The data_vio in question - **/ -static inline void assert_data_vio_in_allocated_zone(struct data_vio *data_vio) -{ - assert_vio_in_physical_zone(data_vio_as_allocating_vio(data_vio)); -} - -/** - * Set a callback as a physical block operation in a data_vio's allocated zone. - * - * @param data_vio The data_vio - * @param callback The callback to set - **/ -static inline void -set_data_vio_allocated_zone_callback(struct data_vio *data_vio, - vdo_action *callback) -{ - vio_set_physical_zone_callback(data_vio_as_allocating_vio(data_vio), - callback); -} - -/** - * Set a callback as a physical block operation in a data_vio's allocated zone - * and queue the data_vio and invoke it immediately. - * - * @param data_vio The data_vio - * @param callback The callback to invoke - **/ -static inline void -launch_data_vio_allocated_zone_callback(struct data_vio *data_vio, - vdo_action *callback) -{ - vio_launch_physical_zone_callback(data_vio_as_allocating_vio(data_vio), - callback); -} - -/** - * Check that a data_vio is running on the correct thread for its duplicate - * zone. 
- * - * @param data_vio The data_vio in question - **/ -static inline void assert_data_vio_in_duplicate_zone(struct data_vio *data_vio) -{ - thread_id_t expected = - get_vdo_physical_zone_thread_id(data_vio->duplicate.zone); - thread_id_t thread_id = vdo_get_callback_thread_id(); - ASSERT_LOG_ONLY((expected == thread_id), - "data_vio for duplicate physical block %llu on thread %u, should be on thread %u", - (unsigned long long) data_vio->duplicate.pbn, - thread_id, - expected); -} - -/** - * Set a callback as a physical block operation in a data_vio's duplicate zone. - * - * @param data_vio The data_vio - * @param callback The callback to set - **/ -static inline void -set_data_vio_duplicate_zone_callback(struct data_vio *data_vio, - vdo_action *callback) -{ - set_vdo_completion_callback(data_vio_as_completion(data_vio), - callback, - get_vdo_physical_zone_thread_id(data_vio->duplicate.zone)); -} - -/** - * Set a callback as a physical block operation in a data_vio's duplicate zone - * and queue the data_vio and invoke it immediately. - * - * @param data_vio The data_vio - * @param callback The callback to invoke - **/ -static inline void -launch_data_vio_duplicate_zone_callback(struct data_vio *data_vio, - vdo_action *callback) -{ - set_data_vio_duplicate_zone_callback(data_vio, callback); - invoke_vdo_completion_callback(data_vio_as_completion(data_vio)); -} - -/** - * Check that a data_vio is running on the correct thread for its mapped zone. - * - * @param data_vio The data_vio in question - **/ -static inline void assert_data_vio_in_mapped_zone(struct data_vio *data_vio) -{ - thread_id_t expected = - get_vdo_physical_zone_thread_id(data_vio->mapped.zone); - thread_id_t thread_id = vdo_get_callback_thread_id(); - ASSERT_LOG_ONLY((expected == thread_id), - "data_vio for mapped physical block %llu on thread %u, should be on thread %u", - (unsigned long long) data_vio->mapped.pbn, - thread_id, - expected); -} - -/** - * Set a callback as a physical block operation in a data_vio's mapped zone. - * - * @param data_vio The data_vio - * @param callback The callback to set - **/ -static inline void -set_data_vio_mapped_zone_callback(struct data_vio *data_vio, - vdo_action *callback) -{ - set_vdo_completion_callback(data_vio_as_completion(data_vio), - callback, - get_vdo_physical_zone_thread_id(data_vio->mapped.zone)); -} - -/** - * Check that a data_vio is running on the correct thread for its new_mapped - * zone. - * - * @param data_vio The data_vio in question - **/ -static inline void assert_data_vio_in_new_mapped_zone(struct data_vio *data_vio) -{ - thread_id_t expected = - get_vdo_physical_zone_thread_id(data_vio->new_mapped.zone); - thread_id_t thread_id = vdo_get_callback_thread_id(); - ASSERT_LOG_ONLY((expected == thread_id), - "data_vio for new_mapped physical block %llu on thread %u, should be on thread %u", - (unsigned long long) data_vio->new_mapped.pbn, - thread_id, - expected); -} - -/** - * Set a callback as a physical block operation in a data_vio's new_mapped - * zone. - * - * @param data_vio The data_vio - * @param callback The callback to set - **/ -static inline void -set_data_vio_new_mapped_zone_callback(struct data_vio *data_vio, - vdo_action *callback) -{ - set_vdo_completion_callback(data_vio_as_completion(data_vio), - callback, - get_vdo_physical_zone_thread_id(data_vio->new_mapped.zone)); -} - -/** - * Check that a data_vio is running on the journal thread. 
- * - * @param data_vio The data_vio in question - **/ -static inline void assert_data_vio_in_journal_zone(struct data_vio *data_vio) -{ - thread_id_t journal_thread = - get_thread_config_from_data_vio(data_vio)->journal_thread; - thread_id_t thread_id = vdo_get_callback_thread_id(); - ASSERT_LOG_ONLY((journal_thread == thread_id), - "data_vio for logical block %llu on thread %u, should be on journal thread %u", - (unsigned long long) data_vio->logical.lbn, - thread_id, - journal_thread); -} - -/** - * Set a callback as a journal operation. - * - * @param data_vio The data_vio for which to set the callback - * @param callback The callback to set - **/ -static inline void -set_data_vio_journal_callback(struct data_vio *data_vio, - vdo_action *callback) -{ - thread_id_t journal_thread = - get_thread_config_from_data_vio(data_vio)->journal_thread; - set_vdo_completion_callback(data_vio_as_completion(data_vio), - callback, - journal_thread); -} - -/** - * Set a callback as a journal operation and invoke it immediately. - * - * @param data_vio The data_vio for which to set the callback - * @param callback The callback to set - **/ -static inline void -launch_data_vio_journal_callback(struct data_vio *data_vio, - vdo_action *callback) -{ - set_data_vio_journal_callback(data_vio, callback); - invoke_vdo_completion_callback(data_vio_as_completion(data_vio)); -} - -/** - * Check that a data_vio is running on the packer thread - * - * @param data_vio The data_vio in question - **/ -static inline void assert_data_vio_in_packer_zone(struct data_vio *data_vio) -{ - thread_id_t packer_thread = - get_thread_config_from_data_vio(data_vio)->packer_thread; - thread_id_t thread_id = vdo_get_callback_thread_id(); - ASSERT_LOG_ONLY((packer_thread == thread_id), - "data_vio for logical block %llu on thread %u, should be on packer thread %u", - (unsigned long long) data_vio->logical.lbn, - thread_id, - packer_thread); -} - -/** - * Set a callback as a packer operation. - * - * @param data_vio The data_vio for which to set the callback - * @param callback The callback to set - **/ -static inline void -set_data_vio_packer_callback(struct data_vio *data_vio, - vdo_action *callback) -{ - thread_id_t packer_thread = - get_thread_config_from_data_vio(data_vio)->packer_thread; - set_vdo_completion_callback(data_vio_as_completion(data_vio), - callback, - packer_thread); -} - -/** - * Set a callback as a packer operation and invoke it immediately. - * - * @param data_vio The data_vio for which to set the callback - * @param callback The callback to set - **/ -static inline void -launch_data_vio_packer_callback(struct data_vio *data_vio, - vdo_action *callback) -{ - set_data_vio_packer_callback(data_vio, callback); - invoke_vdo_completion_callback(data_vio_as_completion(data_vio)); -} - -/** - * Check whether the advice received from UDS is a valid data location, - * and if it is, accept it as the location of a potential duplicate of the - * data_vio. - * - * @param data_vio The data_vio that queried UDS - * @param advice A potential location of the data, or NULL for no advice - **/ -void receive_data_vio_dedupe_advice(struct data_vio *data_vio, - const struct data_location *advice); - -/** - * Set the location of the duplicate block for a data_vio, updating the - * is_duplicate and duplicate fields from a zoned_pbn. 
- * - * @param data_vio The data_vio to modify - * @param source The location of the duplicate - **/ -void set_data_vio_duplicate_location(struct data_vio *data_vio, - const struct zoned_pbn source); - -/** - * Clear a data_vio's mapped block location, setting it to be unmapped. This - * indicates the block map entry for the logical block is either unmapped or - * corrupted. - * - * @param data_vio The data_vio whose mapped block location is to be reset - **/ -void clear_data_vio_mapped_location(struct data_vio *data_vio); - -/** - * Set a data_vio's mapped field to the physical location recorded in the block - * map for the logical block in the vio. - * - * @param data_vio The data_vio whose field is to be set - * @param pbn The physical block number to set - * @param state The mapping state to set - * - * @return VDO_SUCCESS or an error code if the mapping is unusable - **/ -int __must_check set_data_vio_mapped_location(struct data_vio *data_vio, - physical_block_number_t pbn, - enum block_mapping_state state); - -/** - * Attempt to acquire the lock on a logical block. This is the start of the - * path for all external requests. It is registered in prepare_data_vio(). - * - * @param completion The data_vio for an external data request as a completion - **/ -void vdo_attempt_logical_block_lock(struct vdo_completion *completion); - -/** - * Release the lock on the logical block, if any, that a data_vio has acquired. - * - * @param data_vio The data_vio releasing its logical block lock - **/ -void vdo_release_logical_block_lock(struct data_vio *data_vio); - -/** - * A function to asynchronously hash the block data, setting the chunk name of - * the data_vio. This is asynchronous to allow the computation to be done on - * different threads. - * - * @param data_vio The data_vio to hash - **/ -void hash_data_vio(struct data_vio *data_vio); - -/** - * A function to determine whether a block is a duplicate. This function - * expects the 'physical' field of the data_vio to be set to the physical block - * where the block will be written if it is not a duplicate. If the block does - * turn out to be a duplicate, the data_vio's 'isDuplicate' field will be set - * to true, and the data_vio's 'advice' field will be set to the physical - * block and mapping state of the already stored copy of the block. - * - * @param data_vio The data_vio containing the block to check. - **/ -void check_data_vio_for_duplication(struct data_vio *data_vio); - -/** - * A function to verify the duplication advice by examining an already-stored - * data block. This function expects the 'physical' field of the data_vio to be - * set to the physical block where the block will be written if it is not a - * duplicate, and the 'duplicate' field to be set to the physical block and - * mapping state where a copy of the data may already exist. If the block is - * not a duplicate, the data_vio's 'isDuplicate' field will be cleared. - * - * @param data_vio The data_vio containing the block to check. - **/ -void verify_data_vio_duplication(struct data_vio *data_vio); - -/** - * Update the index with new dedupe advice. - * - * @param data_vio The data_vio which needs to change the entry for its data - **/ -void vdo_update_dedupe_index(struct data_vio *data_vio); - -/** - * A function to zero the contents of a non-write data_vio -- a read, or a RMW - * before becoming a write. 
- * - * @param data_vio The data_vio to zero - **/ -void zero_data_vio(struct data_vio *data_vio); - -/** - * A function to copy the data of a write data_vio into a read data_vio. - * - * @param source The data_vio to copy from - * @param destination The data_vio to copy to - **/ -void vdo_copy_data(struct data_vio *source, struct data_vio *destination); - -/** - * A function to apply a partial write to a data_vio which has completed the - * read portion of a read-modify-write operation. - * - * @param data_vio The data_vio to modify - **/ -void vdo_apply_partial_write(struct data_vio *data_vio); - -/** - * A function to inform the layer that a data_vio's related I/O request can be - * safely acknowledged as complete, even though the data_vio itself may have - * further processing to do. - * - * @param data_vio The data_vio to acknowledge - **/ -void acknowledge_data_vio(struct data_vio *data_vio); - -/** - * A function to compress the data in a data_vio. - * - * @param data_vio The data_vio to compress - **/ -void compress_data_vio(struct data_vio *data_vio); - -/** - * A function to read a single data_vio from the layer. - * - * If the data_vio does not describe a read-modify-write operation, the - * physical layer may safely acknowledge the related user I/O request - * as complete. - * - * @param data_vio The data_vio to read - **/ -void read_data_vio(struct data_vio *data_vio); - -/** - * A function to write a single data_vio to the layer - * - * @param data_vio The data_vio to write - **/ -void write_data_vio(struct data_vio *data_vio); - -/** - * A function to compare the contents of a data_vio to another data_vio. - * - * @param first The first data_vio to compare - * @param second The second data_vio to compare - * - * @return true if the contents of the two DataVIOs are the same - **/ -bool compare_data_vios(struct data_vio *first, struct data_vio *second); - -/** - * Destroy a data_vio. - * - * @param data_vio The data_vio to free - **/ -void free_data_vio(struct data_vio *data_vio); - -#endif // DATA_VIO_H diff --git a/vdo/deadlockQueue.c b/vdo/deadlockQueue.c deleted file mode 100644 index 2d3b733a..00000000 --- a/vdo/deadlockQueue.c +++ /dev/null @@ -1,48 +0,0 @@ -/* - * Copyright Red Hat - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. 
- * - * $Id: //eng/vdo-releases/sulfur/src/c++/vdo/kernel/deadlockQueue.c#3 $ - */ - -#include "deadlockQueue.h" - -/**********************************************************************/ -void initialize_vdo_deadlock_queue(struct deadlock_queue *queue) -{ - spin_lock_init(&queue->lock); - bio_list_init(&queue->list); -} - -/**********************************************************************/ -void add_to_vdo_deadlock_queue(struct deadlock_queue *queue, - struct bio *bio, - uint64_t arrival_jiffies) -{ - spin_lock(&queue->lock); - if (bio_list_empty(&queue->list)) { - /* - * If we get more than one pending at once, this will be - * inaccurate for some of them. Oh well. If we've gotten here, - * we're trying to avoid a deadlock; stats are a secondary - * concern. - */ - queue->arrival_jiffies = arrival_jiffies; - } - bio_list_add(&queue->list, bio); - spin_unlock(&queue->lock); -} diff --git a/vdo/deadlockQueue.h b/vdo/deadlockQueue.h deleted file mode 100644 index 5bd504ac..00000000 --- a/vdo/deadlockQueue.h +++ /dev/null @@ -1,96 +0,0 @@ -/* - * Copyright Red Hat - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/sulfur/src/c++/vdo/kernel/deadlockQueue.h#3 $ - */ - -#ifndef DEADLOCK_QUEUE_H -#define DEADLOCK_QUEUE_H - -#include - -#include "bio.h" - -/** - * A holding space for incoming bios if we're not able to block until VIOs - * become available to process them. - **/ -struct deadlock_queue { - /* Protection for the other fields. */ - spinlock_t lock; - /* List of bios we had to accept but don't have VIOs for. */ - struct bio_list list; - /* - * Arrival time to use for statistics tracking for the above - * bios, since we haven't the space to store individual - * arrival times for each. - */ - uint64_t arrival_jiffies; -}; - -/** - * Initialize the struct deadlock_queue structure. - * - * @param queue The structure to initialize - **/ -void initialize_vdo_deadlock_queue(struct deadlock_queue *queue); - -/** - * Add an incoming bio to the list of saved-up bios we're not ready to start - * processing yet. - * - * This excess buffering on top of what the caller implements is generally a - * bad idea, and should be used only when necessary, such as to avoid a - * possible deadlock situation. - * - * @param queue The incoming-bio queue structure - * @param bio The new incoming bio to save - * @param arrival_jiffies The arrival time of this new bio - **/ - -void add_to_vdo_deadlock_queue(struct deadlock_queue *queue, - struct bio *bio, - uint64_t arrival_jiffies); - -/** - * Pull an incoming bio off the queue. - * - * The arrival time returned may be incorrect if multiple bios were saved, as - * there is no per-bio storage used, only one saved arrival time for the whole - * queue. 
- * - * @param [in] queue The incoming-bio queue - * @param [out] arrival_jiffies The arrival time to use for this bio - * - * @return a bio pointer, or NULL if none were queued - **/ -static inline struct bio *poll_vdo_deadlock_queue(struct deadlock_queue *queue, - uint64_t *arrival_jiffies) -{ - struct bio *bio; - - spin_lock(&queue->lock); - bio = bio_list_pop(&queue->list); - if (unlikely(bio != NULL)) { - *arrival_jiffies = queue->arrival_jiffies; - } - spin_unlock(&queue->lock); - return bio; -} - -#endif // DEADLOCK_QUEUE_H diff --git a/vdo/dedupeIndex.c b/vdo/dedupe-index.c similarity index 51% rename from vdo/dedupeIndex.c rename to vdo/dedupe-index.c index 2499ab52..c8b2ba3d 100644 --- a/vdo/dedupeIndex.c +++ b/vdo/dedupe-index.c @@ -1,64 +1,65 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * Copyright Red Hat - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/sulfur/src/c++/vdo/kernel/dedupeIndex.c#51 $ */ -#include "dedupeIndex.h" +#include "dedupe-index.h" #include #include +#include +#include +#include +#include #include +#include +#include #include #include "logger.h" -#include "memoryAlloc.h" -#include "murmur/MurmurHash3.h" -#include "stringUtils.h" +#include "memory-alloc.h" +#include "string-utils.h" #include "uds.h" +#include "completion.h" +#include "data-vio.h" +#include "kernel-types.h" +#include "types.h" + +/** + * DOC: The dedupe index interface + * + * FIXME: actually write a summary of how this works with UDS. + */ + struct uds_attribute { struct attribute attr; const char *(*show_string)(struct dedupe_index *); }; -enum { UDS_Q_ACTION }; - -// These are the values in the atomic dedupe_context.request_state field +/* + * Possible values stored in the atomic dedupe_context.request_state, + * recording the state of the uds_request member. Note that when the + * state is UR_TIMED_OUT, the uds_request member is still in use. + */ enum { - // The uds_request object is not in use. - UR_IDLE = 0, - // The uds_request object is in use, and VDO is waiting for the result. - UR_BUSY = 1, - // The uds_request object is in use, but has timed out. - UR_TIMED_OUT = 2, + UR_IDLE, + UR_BUSY, + UR_TIMED_OUT, }; +/* + * Possible index states: closed, opened, or transitioning between those two. + */ enum index_state { - // The UDS index is closed - IS_CLOSED = 0, - // The UDS index session is opening or closing - IS_CHANGING = 1, - // The UDS index is open. 
- IS_OPENED = 2, + IS_CLOSED, + IS_CHANGING, + IS_OPENED, }; -// Data managing the reporting of UDS timeouts +/* + * A structure to manage the reporting of UDS timeouts + */ struct periodic_event_reporter { uint64_t last_reported_value; atomic64_t value; @@ -68,45 +69,44 @@ struct periodic_event_reporter { struct dedupe_index { struct kobject dedupe_directory; - struct registered_thread allocating_thread; - char *index_name; - struct uds_configuration *configuration; - struct uds_parameters uds_params; + struct uds_parameters parameters; struct uds_index_session *index_session; atomic_t active; - // for reporting UDS timeouts struct periodic_event_reporter timeout_reporter; - // This spinlock protects the state fields and the starting of dedupe - // requests. + /* + * This spinlock protects the state fields and the starting of dedupe + * requests. + */ spinlock_t state_lock; - struct vdo_work_item work_item; // protected by state_lock - struct vdo_work_queue *uds_queue; // protected by state_lock - unsigned int maximum; // protected by state_lock - enum index_state index_state; // protected by state_lock - enum index_state index_target; // protected by state_lock - bool changing; // protected by state_lock - bool create_flag; // protected by state_lock - bool dedupe_flag; // protected by state_lock - bool deduping; // protected by state_lock - bool error_flag; // protected by state_lock - bool suspended; // protected by state_lock - // This spinlock protects the pending list, the pending flag in each - // vio, and the timeout list. + struct vdo_completion completion; /* protected by state_lock */ + struct vdo_work_queue *uds_queue; /* protected by state_lock */ + unsigned int maximum; /* protected by state_lock */ + enum index_state index_state; /* protected by state_lock */ + enum index_state index_target; /* protected by state_lock */ + bool changing; /* protected by state_lock */ + bool create_flag; /* protected by state_lock */ + bool dedupe_flag; /* protected by state_lock */ + bool deduping; /* protected by state_lock */ + bool error_flag; /* protected by state_lock */ + bool suspended; /* protected by state_lock */ + + /* + * This spinlock protects the pending list, the pending flag in each + * vio, and the timeout list. + */ spinlock_t pending_lock; - struct list_head pending_head; // protected by pending_lock - struct timer_list pending_timer; // protected by pending_lock - bool started_timer; // protected by pending_lock + struct list_head pending_head; /* protected by pending_lock */ + struct timer_list pending_timer; /* protected by pending_lock */ + bool started_timer; /* protected by pending_lock */ }; -// Version 1: user space UDS index (limited to 32 bytes) -// Version 2: kernel space UDS index (limited to 16 bytes) +/* Version 2 uses the kernel space UDS index and is limited to 16 bytes */ enum { UDS_ADVICE_VERSION = 2, - // version byte + state byte + 64-bit little-endian PBN + /* version byte + state byte + 64-bit little-endian PBN */ UDS_ADVICE_SIZE = 1 + 1 + sizeof(uint64_t), }; -// We want to ensure that there is only one copy of the following constants. static const char *CLOSED = "closed"; static const char *CLOSING = "closing"; static const char *ERROR = "error"; @@ -116,15 +116,21 @@ static const char *OPENING = "opening"; static const char *SUSPENDED = "suspended"; static const char *UNKNOWN = "unknown"; -// These times are in milliseconds, and these are the default values. +/* These are in milliseconds. 
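For illustration, the advice record defined by these constants is ten bytes: a version byte, a block-mapping-state byte, and the physical block number in little-endian order. A hypothetical record for PBN 0x1234 in state 0x01 would therefore travel to and from the index as:

/*
 * byte:   0     1     2     3     4     5     6     7     8     9
 * value:  0x02  0x01  0x34  0x12  0x00  0x00  0x00  0x00  0x00  0x00
 *         |     |     `-------- PBN 0x1234, little-endian --------'
 *         |     `-- block mapping state
 *         `-- UDS_ADVICE_VERSION
 */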
*/ unsigned int vdo_dedupe_index_timeout_interval = 5000; unsigned int vdo_dedupe_index_min_timer_interval = 100; - -// These times are in jiffies +/* Same two variables, in jiffies for easier consumption. */ static uint64_t vdo_dedupe_index_timeout_jiffies; static uint64_t vdo_dedupe_index_min_timer_jiffies; -/**********************************************************************/ +static inline struct dedupe_index * +as_dedupe_index(struct vdo_completion *completion) +{ + vdo_assert_completion_type(completion->type, + VDO_DEDUPE_INDEX_COMPLETION); + return container_of(completion, struct dedupe_index, completion); +} + static const char *index_state_to_string(struct dedupe_index *index, enum index_state state) { @@ -134,53 +140,29 @@ static const char *index_state_to_string(struct dedupe_index *index, switch (state) { case IS_CLOSED: - // Closed. The error_flag tells if it is because of an error. return index->error_flag ? ERROR : CLOSED; case IS_CHANGING: - // The index_target tells if we are opening or closing the - // index. return index->index_target == IS_OPENED ? OPENING : CLOSING; case IS_OPENED: - // Opened. The dedupe_flag tells if we are online or offline. return index->dedupe_flag ? ONLINE : OFFLINE; default: return UNKNOWN; } } -/** - * Encode VDO duplicate advice into the new_metadata field of a UDS request. - * - * @param request The UDS request to receive the encoding - * @param advice The advice to encode - **/ -static void encode_uds_advice(struct uds_request *request, - struct data_location advice) -{ - size_t offset = 0; - struct uds_chunk_data *encoding = &request->new_metadata; - - encoding->data[offset++] = UDS_ADVICE_VERSION; - encoding->data[offset++] = advice.state; - put_unaligned_le64(advice.pbn, &encoding->data[offset]); - offset += sizeof(uint64_t); - BUG_ON(offset != UDS_ADVICE_SIZE); -} - -/** +/* * Decode VDO duplicate advice from the old_metadata field of a UDS request. - * - * @param request The UDS request containing the encoding - * @param advice The data_location to receive the decoded advice - * - * @return true if valid advice was found and decoded - **/ -static bool decode_uds_advice(const struct uds_request *request, - struct data_location *advice) + * Returns true if valid advice was found and decoded + */ +static bool decode_uds_advice(struct data_vio *data_vio, + const struct uds_request *request) { size_t offset = 0; const struct uds_chunk_data *encoding = &request->old_metadata; + struct vdo *vdo = vdo_from_data_vio(data_vio); + struct zoned_pbn *advice = &data_vio->duplicate; byte version; + int result; if ((request->status != UDS_SUCCESS) || !request->found) { return false; @@ -195,35 +177,50 @@ static bool decode_uds_advice(const struct uds_request *request, advice->state = encoding->data[offset++]; advice->pbn = get_unaligned_le64(&encoding->data[offset]); offset += sizeof(uint64_t); - BUG_ON(offset != UDS_ADVICE_SIZE); + + /* Don't use advice that's clearly meaningless. */ + if ((advice->state == VDO_MAPPING_STATE_UNMAPPED) || + (advice->pbn == VDO_ZERO_BLOCK)) { + uds_log_debug("Invalid advice from deduplication server: pbn %llu, state %u. 
Giving up on deduplication of logical block %llu", + (unsigned long long) advice->pbn, + advice->state, + (unsigned long long) data_vio->logical.lbn); + atomic64_inc(&vdo->stats.invalid_advice_pbn_count); + return false; + } + + result = vdo_get_physical_zone(vdo, advice->pbn, &advice->zone); + if ((result != VDO_SUCCESS) || (advice->zone == NULL)) { + uds_log_debug("Invalid physical block number from deduplication server: %llu, giving up on deduplication of logical block %llu", + (unsigned long long) advice->pbn, + (unsigned long long) data_vio->logical.lbn); + atomic64_inc(&vdo->stats.invalid_advice_pbn_count); + return false; + } + return true; } -/** +/* * Calculate the actual end of a timer, taking into account the absolute start * time and the present time. - * - * @param start_jiffies The absolute start time, in jiffies - * - * @return the absolute end time for the timer, in jiffies - **/ + */ static uint64_t get_dedupe_index_timeout(uint64_t start_jiffies) { return max(start_jiffies + vdo_dedupe_index_timeout_jiffies, jiffies + vdo_dedupe_index_min_timer_jiffies); } -/**********************************************************************/ -void set_vdo_dedupe_index_timeout_interval(unsigned int value) +void vdo_set_dedupe_index_timeout_interval(unsigned int value) { uint64_t alb_jiffies; - // Arbitrary maximum value is two minutes + /* Arbitrary maximum value is two minutes */ if (value > 120000) { value = 120000; } - // Arbitrary minimum value is 2 jiffies + /* Arbitrary minimum value is 2 jiffies */ alb_jiffies = msecs_to_jiffies(value); if (alb_jiffies < 2) { @@ -234,17 +231,16 @@ void set_vdo_dedupe_index_timeout_interval(unsigned int value) vdo_dedupe_index_timeout_jiffies = alb_jiffies; } -/**********************************************************************/ -void set_vdo_dedupe_index_min_timer_interval(unsigned int value) +void vdo_set_dedupe_index_min_timer_interval(unsigned int value) { uint64_t min_jiffies; - // Arbitrary maximum value is one second + /* Arbitrary maximum value is one second */ if (value > 1000) { value = 1000; } - // Arbitrary minimum value is 2 jiffies + /* Arbitrary minimum value is 2 jiffies */ min_jiffies = msecs_to_jiffies(value); if (min_jiffies < 2) { @@ -256,50 +252,45 @@ void set_vdo_dedupe_index_min_timer_interval(unsigned int value) vdo_dedupe_index_min_timer_jiffies = min_jiffies; } -/**********************************************************************/ -static void finish_index_operation(struct uds_request *uds_request) + +static void finish_index_operation(struct uds_request *request) { - struct data_vio *data_vio = container_of(uds_request, + struct data_vio *data_vio = container_of(request, struct data_vio, dedupe_context.uds_request); - struct dedupe_context *dedupe_context = &data_vio->dedupe_context; + struct dedupe_context *context = &data_vio->dedupe_context; - if (atomic_cmpxchg(&dedupe_context->request_state, - UR_BUSY, UR_IDLE) == UR_BUSY) { - struct vio *vio = data_vio_as_vio(data_vio); - struct dedupe_index *index = vio->vdo->dedupe_index; + if (atomic_cmpxchg(&context->request_state, UR_BUSY, UR_IDLE) == + UR_BUSY) { + struct dedupe_index *index = + vdo_from_data_vio(data_vio)->dedupe_index; spin_lock_bh(&index->pending_lock); - if (dedupe_context->is_pending) { - list_del(&dedupe_context->pending_list); - dedupe_context->is_pending = false; + if (context->is_pending) { + list_del(&context->pending_list); + context->is_pending = false; } spin_unlock_bh(&index->pending_lock); - dedupe_context->status = uds_request->status; 
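The request_state field is the hand-off point between the submitting thread, the UDS callback above, and the timeout timer; every transition is a compare-and-swap so that exactly one party reclaims the uds_request. A sketch of the protocol, using only the UR_* values defined earlier (the helper names are hypothetical):

static bool claim_dedupe_context_sketch(struct dedupe_context *context)
{
	/* Submission path: only an idle context may be handed to UDS. */
	return (atomic_cmpxchg(&context->request_state, UR_IDLE, UR_BUSY) ==
		UR_IDLE);
}

static void release_dedupe_context_sketch(struct dedupe_context *context)
{
	/*
	 * Answer path: release a busy context. If the timer has already
	 * moved it to UR_TIMED_OUT, this late answer is what finally
	 * returns it to UR_IDLE.
	 */
	if (atomic_cmpxchg(&context->request_state, UR_BUSY, UR_IDLE) !=
	    UR_BUSY) {
		atomic_cmpxchg(&context->request_state, UR_TIMED_OUT,
			       UR_IDLE);
	}
}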
- if ((uds_request->type == UDS_POST) || - (uds_request->type == UDS_QUERY)) { - struct data_location advice; - - if (decode_uds_advice(uds_request, &advice)) { - vdo_set_dedupe_advice(dedupe_context, &advice); - } else { - vdo_set_dedupe_advice(dedupe_context, NULL); - } + context->status = request->status; + if ((request->type == UDS_POST) || + (request->type == UDS_QUERY)) { + data_vio->is_duplicate = + decode_uds_advice(data_vio, request); } - enqueue_data_vio_callback(data_vio); + continue_data_vio(data_vio, VDO_SUCCESS); atomic_dec(&index->active); } else { - atomic_cmpxchg(&dedupe_context->request_state, + atomic_cmpxchg(&context->request_state, UR_TIMED_OUT, UR_IDLE); } } -/** +/* * Must be called holding pending_lock - **/ + */ static void start_expiration_timer(struct dedupe_index *index, unsigned long expiration) { @@ -309,47 +300,23 @@ static void start_expiration_timer(struct dedupe_index *index, } } -/** +/* * Must be called holding pending_lock - **/ + */ static void start_expiration_timer_for_vio(struct dedupe_index *index, struct data_vio *data_vio) { struct dedupe_context *context = &data_vio->dedupe_context; uint64_t start_time = context->submission_jiffies; - start_expiration_timer(index, get_dedupe_index_timeout(start_time)); -} - -/**********************************************************************/ -static void start_index_operation(struct vdo_work_item *item) -{ - struct vio *vio = work_item_as_vio(item); - struct data_vio *data_vio = vio_as_data_vio(vio); - struct dedupe_index *index = vio->vdo->dedupe_index; - struct dedupe_context *dedupe_context = &data_vio->dedupe_context; - struct uds_request *uds_request = &dedupe_context->uds_request; - int status; - - spin_lock_bh(&index->pending_lock); - list_add_tail(&dedupe_context->pending_list, &index->pending_head); - dedupe_context->is_pending = true; - start_expiration_timer_for_vio(index, data_vio); - spin_unlock_bh(&index->pending_lock); - status = uds_start_chunk_operation(uds_request); - if (status != UDS_SUCCESS) { - uds_request->status = status; - finish_index_operation(uds_request); - } + start_expiration_timer(index, get_dedupe_index_timeout(start_time)); } -/**********************************************************************/ -uint64_t get_vdo_dedupe_index_timeout_count(struct dedupe_index *index) +uint64_t vdo_get_dedupe_index_timeout_count(struct dedupe_index *index) { return atomic64_read(&index->timeout_reporter.value); } -/**********************************************************************/ static void report_events(struct periodic_event_reporter *reporter, bool ratelimit) { @@ -359,15 +326,15 @@ static void report_events(struct periodic_event_reporter *reporter, if (difference != 0) { if (!ratelimit || __ratelimit(&reporter->ratelimiter)) { uds_log_debug("UDS index timeout on %llu requests", - difference); + (unsigned long long) difference); reporter->last_reported_value = new_value; } else { - /** + /* * Turn on a backup timer that will fire after the * current interval. 
Just in case the last index * request in a while times out; we want to report * the dedupe timeouts in a timely manner in such cases - **/ + */ struct dedupe_index *index = container_of(reporter, struct dedupe_index, @@ -380,7 +347,6 @@ static void report_events(struct periodic_event_reporter *reporter, } } -/**********************************************************************/ static void report_events_work(struct work_struct *work) { struct periodic_event_reporter *reporter = @@ -388,36 +354,33 @@ static void report_events_work(struct work_struct *work) report_events(reporter, true); } -/**********************************************************************/ static void init_periodic_event_reporter(struct periodic_event_reporter *reporter) { INIT_WORK(&reporter->work, report_events_work); ratelimit_default_init(&reporter->ratelimiter); - // Since we will save up the timeouts that would have been reported - // but were ratelimited, we don't need to report ratelimiting. + /* + * Since we will save up the timeouts that would have been reported + * but were ratelimited, we don't need to report ratelimiting. + */ ratelimit_set_flags(&reporter->ratelimiter, RATELIMIT_MSG_ON_RELEASE); } -/** +/* * Record and eventually report that some dedupe requests reached their * expiration time without getting answers, so we timed them out. * * This is called in a timer context, so it shouldn't do the reporting * directly. - * - * @param reporter The periodic event reporter - * @param timeouts How many requests were timed out. - **/ + */ static void report_dedupe_timeouts(struct periodic_event_reporter *reporter, unsigned int timeouts) { atomic64_add(timeouts, &reporter->value); - // If it's already queued, requeueing it will do nothing. + /* If it's already queued, requeueing it will do nothing. 
*/ schedule_work(&reporter->work); } -/**********************************************************************/ static void stop_periodic_event_reporter(struct periodic_event_reporter *reporter) { @@ -426,7 +389,6 @@ stop_periodic_event_reporter(struct periodic_event_reporter *reporter) ratelimit_state_exit(&reporter->ratelimiter); } -/**********************************************************************/ static void timeout_index_operations(struct timer_list *t) { struct dedupe_index *index = from_timer(index, t, pending_timer); @@ -466,7 +428,7 @@ static void timeout_index_operations(struct timer_list *t) if (atomic_cmpxchg(&dedupe_context->request_state, UR_BUSY, UR_TIMED_OUT) == UR_BUSY) { dedupe_context->status = ETIMEDOUT; - enqueue_data_vio_callback(data_vio); + continue_data_vio(data_vio, VDO_SUCCESS); atomic_dec(&index->active); timed_out++; } @@ -474,115 +436,123 @@ static void timeout_index_operations(struct timer_list *t) report_dedupe_timeouts(&index->timeout_reporter, timed_out); } -/**********************************************************************/ -void enqueue_vdo_index_operation(struct data_vio *data_vio, - enum uds_request_type operation) +static void prepare_uds_request(struct uds_request *request, + struct data_vio *data_vio, + struct uds_index_session *session, + enum uds_request_type operation) { - struct vio *vio = data_vio_as_vio(data_vio); - struct dedupe_context *dedupe_context = &data_vio->dedupe_context; - struct dedupe_index *index = vio->vdo->dedupe_index; - - dedupe_context->status = UDS_SUCCESS; - dedupe_context->submission_jiffies = jiffies; - if (atomic_cmpxchg(&dedupe_context->request_state, - UR_IDLE, UR_BUSY) == UR_IDLE) { - struct uds_request *uds_request = - &data_vio->dedupe_context.uds_request; - - uds_request->chunk_name = data_vio->chunk_name; - uds_request->callback = finish_index_operation; - uds_request->session = index->index_session; - uds_request->type = operation; - uds_request->update = true; - if ((operation == UDS_POST) || (operation == UDS_UPDATE)) { - encode_uds_advice(uds_request, - vdo_get_dedupe_advice(dedupe_context)); - } + request->chunk_name = data_vio->chunk_name; + request->callback = finish_index_operation; + request->session = session; + request->type = operation; + if ((operation == UDS_POST) || (operation == UDS_UPDATE)) { + size_t offset = 0; + struct uds_chunk_data *encoding = &request->new_metadata; - setup_work_item(work_item_from_vio(vio), - start_index_operation, - NULL, - UDS_Q_ACTION); - - spin_lock(&index->state_lock); - if (index->deduping) { - unsigned int active; + encoding->data[offset++] = UDS_ADVICE_VERSION; + encoding->data[offset++] = data_vio->new_mapped.state; + put_unaligned_le64(data_vio->new_mapped.pbn, + &encoding->data[offset]); + offset += sizeof(uint64_t); + BUG_ON(offset != UDS_ADVICE_SIZE); + } +} - enqueue_work_queue(index->uds_queue, - work_item_from_vio(vio)); +/* + * The index operation will inquire about data_vio.chunk_name, providing (if + * the operation is appropriate) advice from the data_vio's new_mapped + * fields. The advice found in the index (or NULL if none) will be returned via + * receive_data_vio_dedupe_advice(). dedupe_context.status is set to the return + * status code of any asynchronous index processing. 
+ */ +void vdo_query_index(struct data_vio *data_vio, + enum uds_request_type operation) +{ + struct dedupe_context *context = &data_vio->dedupe_context; + struct vdo *vdo = vdo_from_data_vio(data_vio); + struct dedupe_index *index = vdo->dedupe_index; + struct uds_request *request; + unsigned int active; + int result; - active = atomic_inc_return(&index->active); - if (active > index->maximum) { - index->maximum = active; - } - vio = NULL; - } else { - atomic_set(&dedupe_context->request_state, UR_IDLE); - } - spin_unlock(&index->state_lock); - } else { - // A previous user of the vio had a dedupe timeout - // and its request is still outstanding. - atomic64_inc(&vio->vdo->stats.dedupe_context_busy); + vdo_assert_on_dedupe_thread(vdo, __func__); + context->status = UDS_SUCCESS; + context->submission_jiffies = jiffies; + request = &context->uds_request; + prepare_uds_request(request, + data_vio, + index->index_session, + operation); + active = atomic_inc_return(&index->active); + if (active > index->maximum) { + index->maximum = active; } - if (vio != NULL) { - enqueue_data_vio_callback(data_vio); + spin_lock_bh(&index->pending_lock); + list_add_tail(&context->pending_list, &index->pending_head); + context->is_pending = true; + start_expiration_timer_for_vio(index, data_vio); + spin_unlock_bh(&index->pending_lock); + + result = uds_start_chunk_operation(request); + if (result != UDS_SUCCESS) { + request->status = result; + finish_index_operation(request); } } -/**********************************************************************/ static void close_index(struct dedupe_index *index) { int result; - // Change the index state so that get_vdo_dedupe_index_statistics will - // not try to use the index session we are closing. + /* + * Change the index state so that vdo_get_dedupe_index_statistics will + * not try to use the index session we are closing. + */ index->index_state = IS_CHANGING; - // Close the index session, while not holding the state_lock. + /* Close the index session, while not holding the state_lock. */ spin_unlock(&index->state_lock); result = uds_close_index(index->index_session); if (result != UDS_SUCCESS) { - uds_log_error_strerror(result, - "Error closing index %s", - index->index_name); + uds_log_error_strerror(result, "Error closing index"); } spin_lock(&index->state_lock); index->index_state = IS_CLOSED; index->error_flag |= result != UDS_SUCCESS; - // ASSERTION: We leave in IS_CLOSED state. + /* ASSERTION: We leave in IS_CLOSED state. */ } -/**********************************************************************/ static void open_index(struct dedupe_index *index) { - // ASSERTION: We enter in IS_CLOSED state. + /* ASSERTION: We enter in IS_CLOSED state. */ int result; bool create_flag = index->create_flag; index->create_flag = false; - // Change the index state so that the it will be reported to the - // outside world as "opening". + /* + * Change the index state so that the it will be reported to the + * outside world as "opening". + */ index->index_state = IS_CHANGING; index->error_flag = false; - // Open the index session, while not holding the state_lock + /* Open the index session, while not holding the state_lock */ spin_unlock(&index->state_lock); result = uds_open_index(create_flag ? 
UDS_CREATE : UDS_LOAD, - index->index_name, &index->uds_params, - index->configuration, index->index_session); + &index->parameters, + index->index_session); if (result != UDS_SUCCESS) { - uds_log_error_strerror(result, - "Error opening index %s", - index->index_name); + uds_log_error_strerror(result, "Error opening index"); } spin_lock(&index->state_lock); if (!create_flag) { switch (result) { case -ENOENT: - // Either there is no index, or there is no way we can - // recover the index. We will be called again and try - // to create a new index. + /* + * Either there is no index, or there is no way we can + * recover the index. We will be called again and try + * to create a new index. + */ index->index_state = IS_CLOSED; index->create_flag = true; return; @@ -600,20 +570,22 @@ static void open_index(struct dedupe_index *index) uds_log_info("Setting UDS index target state to error"); spin_lock(&index->state_lock); } - // ASSERTION: On success, we leave in IS_OPENED state. - // ASSERTION: On failure, we leave in IS_CLOSED state. + /* + * ASSERTION: On success, we leave in IS_OPENED state. + * ASSERTION: On failure, we leave in IS_CLOSED state. + */ } -/**********************************************************************/ -static void change_dedupe_state(struct vdo_work_item *item) +static void change_dedupe_state(struct vdo_completion *completion) { - struct dedupe_index *index = container_of(item, - struct dedupe_index, - work_item); + struct dedupe_index *index = as_dedupe_index(completion); + spin_lock(&index->state_lock); - // Loop until the index is in the target state and the create flag is - // clear. + /* + * Loop until the index is in the target state and the create flag is + * clear. + */ while (!index->suspended && ((index->index_state != index->index_target) || index->create_flag)) { @@ -629,13 +601,14 @@ static void change_dedupe_state(struct vdo_work_item *item) spin_unlock(&index->state_lock); } -/**********************************************************************/ static void launch_dedupe_state_change(struct dedupe_index *index) { - // ASSERTION: We enter with the state_lock held. + /* ASSERTION: We enter with the state_lock held. */ if (index->changing || index->suspended) { - // Either a change is already in progress, or changes are - // not allowed. + /* + * Either a change is already in progress, or changes are + * not allowed. + */ return; } @@ -643,22 +616,17 @@ static void launch_dedupe_state_change(struct dedupe_index *index) (index->index_state != index->index_target)) { index->changing = true; index->deduping = false; - setup_work_item(&index->work_item, - change_dedupe_state, - NULL, - UDS_Q_ACTION); - enqueue_work_queue(index->uds_queue, &index->work_item); + vdo_invoke_completion_callback(&index->completion); return; } - // Online vs. offline changes happen immediately + /* Online vs. offline changes happen immediately */ index->deduping = (index->dedupe_flag && !index->suspended && (index->index_state == IS_OPENED)); - // ASSERTION: We exit with the state_lock held. + /* ASSERTION: We exit with the state_lock held. 
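A sketch of the state machine that open_index(), close_index(), and the loop in change_dedupe_state() implement, as inferred from the assertions above:

/*
 *   IS_CLOSED --(target IS_OPENED)--> IS_CHANGING --uds_open_index()--> IS_OPENED
 *   IS_OPENED --(target IS_CLOSED)--> IS_CHANGING --uds_close_index()--> IS_CLOSED
 *
 * On an open failure, or when create_flag forces a retry, the index falls
 * back to IS_CLOSED and the loop runs again until the target state is
 * reached or the device is suspended.
 */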
*/ } -/**********************************************************************/ static void set_target_state(struct dedupe_index *index, enum index_state target, bool change_dedupe, @@ -686,8 +654,11 @@ static void set_target_state(struct dedupe_index *index, } } -/**********************************************************************/ -void suspend_vdo_dedupe_index(struct dedupe_index *index, bool save_flag) +/* + * May be called from any thread. + * save_flag should be true to save the index instead of just suspend. + */ +void vdo_suspend_dedupe_index(struct dedupe_index *index, bool save_flag) { enum index_state state; @@ -706,41 +677,18 @@ void suspend_vdo_dedupe_index(struct dedupe_index *index, bool save_flag) } } -/**********************************************************************/ -int make_new_vdo_index_name(struct dedupe_index *index, - char* new_device_name, - char** new_index_name) -{ - int result; - - /* Index parameters in the name can't change so just copy them. */ - result = uds_alloc_sprintf("index resume name", new_index_name, - "dev=%s%s", - new_device_name, - strstr(index->index_name, " ")); - if (result != UDS_SUCCESS) { - uds_log_error("Creating index name failed (%d)", result); - } - - return result; -} - -/**********************************************************************/ -void resume_vdo_dedupe_index(struct dedupe_index *index, +/* + * May be called from any thread. + */ +void vdo_resume_dedupe_index(struct dedupe_index *index, struct device_config *config) { int result; - char *new_index_name = config->index_name; - - if (new_index_name != NULL) { - /* Transfer ownership of the index name to the dedupe index */ - UDS_FREE(index->index_name); - index->index_name = config->index_name; - config->index_name = NULL; - } + index->parameters.name = config->parent_device_name; result = uds_resume_index_session(index->index_session, - config->parent_device_name); + index->parameters.name); + if (result != UDS_SUCCESS) { uds_log_error_strerror(result, "Error resuming dedupe index"); } @@ -759,9 +707,10 @@ void resume_vdo_dedupe_index(struct dedupe_index *index, spin_unlock(&index->state_lock); } - -/**********************************************************************/ -void dump_vdo_dedupe_index(struct dedupe_index *index, bool show_queue) +/* + * Do the dedupe section of dmsetup message vdo0 0 dump ... + */ +void vdo_dump_dedupe_index(struct dedupe_index *index) { const char *state, *target; @@ -776,31 +725,29 @@ void dump_vdo_dedupe_index(struct dedupe_index *index, bool show_queue) if (target != NULL) { uds_log_info("UDS index: changing to state: %s", target); } - if (show_queue) { - dump_work_queue(index->uds_queue); - } } -/**********************************************************************/ -void finish_vdo_dedupe_index(struct dedupe_index *index) +void vdo_finish_dedupe_index(struct dedupe_index *index) { if (index == NULL) { return; } - set_target_state(index, IS_CLOSED, false, false, false); uds_destroy_index_session(index->index_session); finish_work_queue(index->uds_queue); } -/**********************************************************************/ -void free_vdo_dedupe_index(struct dedupe_index *index) +void vdo_free_dedupe_index(struct dedupe_index *index) { if (index == NULL) { return; } - free_work_queue(UDS_FORGET(index->uds_queue)); + /* + * The queue will get freed along with all the others, but give up + * our reference to it. 
+ */ + UDS_FORGET(index->uds_queue); stop_periodic_event_reporter(&index->timeout_reporter); spin_lock_bh(&index->pending_lock); if (index->started_timer) { @@ -810,8 +757,7 @@ void free_vdo_dedupe_index(struct dedupe_index *index) kobject_put(&index->dedupe_directory); } -/**********************************************************************/ -const char *get_vdo_dedupe_index_state_name(struct dedupe_index *index) +const char *vdo_get_dedupe_index_state_name(struct dedupe_index *index) { const char *state; @@ -822,8 +768,7 @@ const char *get_vdo_dedupe_index_state_name(struct dedupe_index *index) return state; } -/**********************************************************************/ -void get_vdo_dedupe_index_statistics(struct dedupe_index *index, +void vdo_get_dedupe_index_statistics(struct dedupe_index *index, struct index_statistics *stats) { enum index_state state; @@ -856,8 +801,10 @@ void get_vdo_dedupe_index_statistics(struct dedupe_index *index, } -/**********************************************************************/ -int message_vdo_dedupe_index(struct dedupe_index *index, const char *name) +/* + * Handle a dmsetup message relevant to the index. + */ +int vdo_message_dedupe_index(struct dedupe_index *index, const char *name) { if (strcasecmp(name, "index-close") == 0) { set_target_state(index, IS_CLOSED, false, false, false); @@ -875,37 +822,29 @@ int message_vdo_dedupe_index(struct dedupe_index *index, const char *name) return -EINVAL; } -/**********************************************************************/ -int add_vdo_dedupe_index_sysfs(struct dedupe_index *index, +int vdo_add_dedupe_index_sysfs(struct dedupe_index *index, struct kobject *parent) { return kobject_add(&index->dedupe_directory, parent, "dedupe"); } -/**********************************************************************/ -void start_vdo_dedupe_index(struct dedupe_index *index, bool create_flag) +/* + * If create_flag, create a new index without first attempting to load an + * existing index. 
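Taken together with vdo_make_dedupe_index() and the suspend/resume hooks above, the expected life cycle of the index object is roughly the following (a sketch; error handling and calling contexts are omitted, and the ordering is inferred from these functions' comments):

	vdo_make_dedupe_index(vdo, &index);
	vdo_add_dedupe_index_sysfs(index, parent);   /* the "dedupe" kobject */
	vdo_start_dedupe_index(index, create_flag);

	/* ... normal operation, possibly with suspend/resume cycles ... */
	vdo_suspend_dedupe_index(index, true);       /* true: save the index */
	vdo_resume_dedupe_index(index, device_config);

	/* ... shutdown ... */
	vdo_finish_dedupe_index(index);
	vdo_free_dedupe_index(index);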
+ */ +void vdo_start_dedupe_index(struct dedupe_index *index, bool create_flag) { set_target_state(index, IS_OPENED, true, true, create_flag); } -/**********************************************************************/ -void stop_vdo_dedupe_index(struct dedupe_index *index) -{ - set_target_state(index, IS_CLOSED, false, false, false); -} - -/**********************************************************************/ static void dedupe_kobj_release(struct kobject *directory) { struct dedupe_index *index = container_of(directory, struct dedupe_index, dedupe_directory); - uds_free_configuration(index->configuration); - UDS_FREE(index->index_name); UDS_FREE(index); } -/**********************************************************************/ static ssize_t dedupe_status_show(struct kobject *directory, struct attribute *attr, char *buf) @@ -921,16 +860,15 @@ static ssize_t dedupe_status_show(struct kobject *directory, } } -/**********************************************************************/ -static ssize_t dedupe_status_store(struct kobject *kobj, - struct attribute *attr, - const char *buf, - size_t length) +static ssize_t dedupe_status_store(struct kobject *kobj __always_unused, + struct attribute *attr __always_unused, + const char *buf __always_unused, + size_t length __always_unused) { return -EINVAL; } -/**********************************************************************/ +/*----------------------------------------------------------------------*/ static struct sysfs_ops dedupe_sysfs_ops = { .show = dedupe_status_show, @@ -939,21 +877,21 @@ static struct sysfs_ops dedupe_sysfs_ops = { static struct uds_attribute dedupe_status_attribute = { .attr = {.name = "status", .mode = 0444, }, - .show_string = get_vdo_dedupe_index_state_name, + .show_string = vdo_get_dedupe_index_state_name, }; -static struct attribute *dedupe_attributes[] = { +static struct attribute *dedupe_attrs[] = { &dedupe_status_attribute.attr, NULL, }; +ATTRIBUTE_GROUPS(dedupe); static struct kobj_type dedupe_directory_type = { .release = dedupe_kobj_release, .sysfs_ops = &dedupe_sysfs_ops, - .default_attrs = dedupe_attributes, + .default_groups = dedupe_groups, }; -/**********************************************************************/ static void start_uds_queue(void *ptr) { /* @@ -964,36 +902,30 @@ static void start_uds_queue(void *ptr) * (like the base threads do), but it would be an unnecessary * embellishment. 
*/ - struct dedupe_index *index = ptr; - uds_register_allocating_thread(&index->allocating_thread, NULL); + struct vdo_thread *thread + = get_work_queue_owner(get_current_work_queue()); + + uds_register_allocating_thread(&thread->allocating_thread, NULL); } -/**********************************************************************/ static void finish_uds_queue(void *ptr __always_unused) { uds_unregister_allocating_thread(); } -/**********************************************************************/ -int make_vdo_dedupe_index(struct dedupe_index **index_ptr, - struct vdo *vdo, - const char* thread_name_prefix) +int vdo_make_dedupe_index(struct vdo *vdo, struct dedupe_index **index_ptr) { int result; off_t uds_offset; struct dedupe_index *index; - struct index_config *index_config; static const struct vdo_work_queue_type uds_queue_type = { .start = start_uds_queue, .finish = finish_uds_queue, - .action_table = { - { .name = "uds_action", - .code = UDS_Q_ACTION, - .priority = 0 }, - }, + .max_priority = UDS_Q_MAX_PRIORITY, + .default_priority = UDS_Q_PRIORITY, }; - set_vdo_dedupe_index_timeout_interval(vdo_dedupe_index_timeout_interval); - set_vdo_dedupe_index_min_timer_interval(vdo_dedupe_index_min_timer_interval); + vdo_set_dedupe_index_timeout_interval(vdo_dedupe_index_timeout_interval); + vdo_set_dedupe_index_min_timer_interval(vdo_dedupe_index_min_timer_interval); result = UDS_ALLOCATE(1, struct dedupe_index, "UDS index data", &index); @@ -1003,66 +935,76 @@ int make_vdo_dedupe_index(struct dedupe_index **index_ptr, uds_offset = ((vdo_get_index_region_start(vdo->geometry) - vdo->geometry.bio_offset) * VDO_BLOCK_SIZE); - result = uds_alloc_sprintf("index name", &index->index_name, - "dev=%s offset=%ld size=%llu", - vdo->device_config->parent_device_name, - uds_offset, - (vdo_get_index_region_size(vdo->geometry) * - VDO_BLOCK_SIZE)); - if (result != UDS_SUCCESS) { - uds_log_error("Creating index name failed (%d)", result); - UDS_FREE(index); - return result; - } - - index->uds_params = (struct uds_parameters) UDS_PARAMETERS_INITIALIZER; - index_config = &vdo->geometry.index_config; - vdo_index_config_to_uds_parameters(index_config, &index->uds_params); - result = vdo_index_config_to_uds_configuration(index_config, - &index->configuration); - if (result != VDO_SUCCESS) { - UDS_FREE(index->index_name); - UDS_FREE(index); - return result; - } - uds_configuration_set_nonce(index->configuration, - (uds_nonce_t) vdo->geometry.nonce); + index->parameters.name = vdo->device_config->parent_device_name; + index->parameters.offset = uds_offset; + index->parameters.size = + vdo_get_index_region_size(vdo->geometry) * VDO_BLOCK_SIZE; + index->parameters.memory_size = vdo->geometry.index_config.mem; + index->parameters.sparse = vdo->geometry.index_config.sparse; + index->parameters.nonce = (uint64_t) vdo->geometry.nonce; result = uds_create_index_session(&index->index_session); if (result != UDS_SUCCESS) { - uds_free_configuration(index->configuration); - UDS_FREE(index->index_name); UDS_FREE(index); return result; } - result = make_work_queue(thread_name_prefix, - "dedupeQ", - vdo, - index, + result = vdo_make_thread(vdo, + vdo->thread_config->dedupe_thread, &uds_queue_type, 1, - NULL, - &index->uds_queue); + NULL); if (result != VDO_SUCCESS) { uds_log_error("UDS index queue initialization failed (%d)", result); uds_destroy_index_session(index->index_session); - uds_free_configuration(index->configuration); - UDS_FREE(index->index_name); UDS_FREE(index); return result; } + 
vdo_initialize_completion(&index->completion, + vdo, + VDO_DEDUPE_INDEX_COMPLETION); + vdo_set_completion_callback(&index->completion, + change_dedupe_state, + vdo->thread_config->dedupe_thread); + index->uds_queue + = vdo->threads[vdo->thread_config->dedupe_thread].queue; kobject_init(&index->dedupe_directory, &dedupe_directory_type); INIT_LIST_HEAD(&index->pending_head); spin_lock_init(&index->pending_lock); spin_lock_init(&index->state_lock); timer_setup(&index->pending_timer, timeout_index_operations, 0); - // UDS Timeout Reporter init_periodic_event_reporter(&index->timeout_reporter); *index_ptr = index; return VDO_SUCCESS; } + +bool data_vio_may_query_index(struct data_vio *data_vio) +{ + struct vdo *vdo = vdo_from_data_vio(data_vio); + struct dedupe_index *index = vdo->dedupe_index; + bool deduping; + + spin_lock(&index->state_lock); + deduping = index->deduping; + spin_unlock(&index->state_lock); + + if (!deduping) { + return false; + } + + if (atomic_cmpxchg(&data_vio->dedupe_context.request_state, + UR_IDLE, UR_BUSY) != UR_IDLE) { + /* + * A previous user of the data_vio had a dedupe timeout + * and its request is still outstanding. + */ + atomic64_inc(&vdo->stats.dedupe_context_busy); + return false; + } + + return true; +} diff --git a/vdo/dedupe-index.h b/vdo/dedupe-index.h new file mode 100644 index 00000000..75efaf1c --- /dev/null +++ b/vdo/dedupe-index.h @@ -0,0 +1,65 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright Red Hat + */ + +#ifndef DEDUPE_INDEX_H +#define DEDUPE_INDEX_H + +#include + +#include "uds.h" + +#include "kernel-types.h" +#include "statistics.h" +#include "types.h" + +int __must_check +vdo_make_dedupe_index(struct vdo *vdo, struct dedupe_index **index_ptr); + +void vdo_dump_dedupe_index(struct dedupe_index *index); + +void vdo_free_dedupe_index(struct dedupe_index *index); + +const char *vdo_get_dedupe_index_state_name(struct dedupe_index *index); + +uint64_t vdo_get_dedupe_index_timeout_count(struct dedupe_index *index); + +void vdo_get_dedupe_index_statistics(struct dedupe_index *index, + struct index_statistics *stats); + +int vdo_message_dedupe_index(struct dedupe_index *index, const char *name); + +void vdo_query_index(struct data_vio *data_vio, + enum uds_request_type operation); + +int vdo_add_dedupe_index_sysfs(struct dedupe_index *index, + struct kobject *parent); + +void vdo_start_dedupe_index(struct dedupe_index *index, bool create_flag); + +void vdo_suspend_dedupe_index(struct dedupe_index *index, bool save_flag); + +void vdo_resume_dedupe_index(struct dedupe_index *index, + struct device_config *config); + +void vdo_finish_dedupe_index(struct dedupe_index *index); + +/* + * Interval (in milliseconds) from submission until switching to fast path and + * skipping UDS. + */ +extern unsigned int vdo_dedupe_index_timeout_interval; + +/* + * Minimum time interval (in milliseconds) between timer invocations to + * check for requests waiting for UDS that should now time out. 
+ */ +extern unsigned int vdo_dedupe_index_min_timer_interval; + +void vdo_set_dedupe_index_timeout_interval(unsigned int value); +void vdo_set_dedupe_index_min_timer_interval(unsigned int value); + +bool data_vio_may_query_index(struct data_vio *data_vio); + +#endif /* DEDUPE_INDEX_H */ diff --git a/vdo/dedupeIndex.h b/vdo/dedupeIndex.h deleted file mode 100644 index b43a0c68..00000000 --- a/vdo/dedupeIndex.h +++ /dev/null @@ -1,244 +0,0 @@ -/* - * Copyright Red Hat - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/sulfur/src/c++/vdo/kernel/dedupeIndex.h#22 $ - */ - -#ifndef DEDUPE_INDEX_H -#define DEDUPE_INDEX_H - -#include "uds.h" - -#include "dataKVIO.h" -#include "types.h" - -/** - * Make a dedupe index - * - * @param index_ptr dedupe index returned here - * @param vdo the vdo to which the index will belong - * @param thread_name_prefix The per-device prefix to use in thread names - * - * @return VDO_SUCCESS or an error code - **/ -int __must_check -make_vdo_dedupe_index(struct dedupe_index **index_ptr, - struct vdo *vdo, - const char *thread_name_prefix); - - -/** - * Do the dedupe section of dmsetup message vdo0 0 dump ... - * - * @param index The dedupe index - * @param show_queue true to dump a dedupe work queue - **/ -void dump_vdo_dedupe_index(struct dedupe_index *index, bool show_queue); - -/** - * Free the dedupe index - * - * @param index The dedupe index - **/ -void free_vdo_dedupe_index(struct dedupe_index *index); - -/** - * Get the name of the deduplication state - * - * @param index The dedupe index - * - * @return the dedupe state name - **/ -const char *get_vdo_dedupe_index_state_name(struct dedupe_index *index); - -/** - * Get the dedupe timeout count. - * - * @param index The dedupe index - * - * @return The number of dedupe timeouts noted - **/ -uint64_t get_vdo_dedupe_index_timeout_count(struct dedupe_index *index); - -/** - * Get the index statistics - * - * @param index The dedupe index - * @param stats The index statistics - **/ -void get_vdo_dedupe_index_statistics(struct dedupe_index *index, - struct index_statistics *stats); - -/** - * Process a dmsetup message directed to the index. - * - * @param index The dedupe index - * @param name The message name - * - * @return 0 or an error code - **/ -int message_vdo_dedupe_index(struct dedupe_index *index, const char *name); - -/** - * Enqueue operation for submission to the index. - * - * @param data_vio The data_vio requesting the operation - * @param operation The index operation to perform - **/ -void enqueue_vdo_index_operation(struct data_vio *data_vio, - enum uds_request_type operation); - -/** - * Look up the chunkname of the data_vio and identify duplicated chunks. - * - * @param data_vio The data_vio. These fields are used: - * data_vio.chunk_name is the chunk name. 
The advice to - * offer to the index will be obtained via - * vdo_get_dedupe_advice(). The advice found in the index (or - * NULL if none) will be returned via vdo_set_dedupe_advice(). - * dedupe_context.status is set to the return status code of - * any asynchronous index processing. - **/ -static inline void post_vdo_dedupe_advice(struct data_vio *data_vio) -{ - enqueue_vdo_index_operation(data_vio, UDS_POST); -} - -/** - * Look up the chunk_name of the data_vio and identify duplicated chunks. - * - * @param data_vio The data_vio. These fields are used: - * data_vio.chunk_name is the chunk name. The advice - * found in the index (or NULL if none) will be returned via - * vdo_set_dedupe_advice(). dedupe_context.status is set to - * the return status code of any asynchronous index - * processing. - **/ -static inline void query_vdo_dedupe_advice(struct data_vio *data_vio) -{ - enqueue_vdo_index_operation(data_vio, UDS_QUERY); -} - -/** - * Look up the chunk_name of the data_vio and associate the new PBN with the - * name. - * - * @param data_vio The data_vio. These fields are used: - * data_vio.chunk_name is the chunk name. The advice to - * offer to the index will be obtained via - * vdo_get_dedupe_advice(). dedupe_context.status is set to - * the return status code of any asynchronous index - * processing. - **/ -static inline void update_vdo_dedupe_advice(struct data_vio *data_vio) -{ - enqueue_vdo_index_operation(data_vio, UDS_UPDATE); -} - -/** - * Add the sysfs nodes for the dedupe index. - * - * @param index The dedupe index - * @param parent The kobject to attach the sysfs nodes to - * - * @return 0 or an error code - **/ -int add_vdo_dedupe_index_sysfs(struct dedupe_index *index, - struct kobject *parent); - -/** - * Start the dedupe index. - * - * @param index The dedupe index - * @param create_flag If true, create a new index without first attempting - * to load an existing index - **/ -void start_vdo_dedupe_index(struct dedupe_index *index, bool create_flag); - -/** - * Stop the dedupe index. May be called by any thread, but will wait for - * the shutdown to be completed. - * - * @param index The dedupe index - **/ -void stop_vdo_dedupe_index(struct dedupe_index *index); - -/** - * Wait until the dedupe index has completed all its outstanding I/O. - * May be called from any thread, - * - * @param index The dedupe index - * @param save_flag True if we should save the index - **/ -void suspend_vdo_dedupe_index(struct dedupe_index *index, bool save_flag); - -/** - * Construct a new index name for resume. - * - * @param index The dedupe index - * @param new_device_name The name of the new backing device - * @param new_index_name A pointer to the new index name - * - * @return VDO_SUCCESS or an error code - **/ -int __must_check make_new_vdo_index_name(struct dedupe_index *index, - char* new_device_name, - char** new_index_name); - -/** - * Resume a suspended dedupe index. May be called from any thread. - * - * @param index The dedupe index - * @param config The new device configuration - * @param create Whether to create the index or not - **/ -void resume_vdo_dedupe_index(struct dedupe_index *index, - struct device_config *config); - -/** - * Finish the dedupe index. - * - * @param index The dedupe index - **/ -void finish_vdo_dedupe_index(struct dedupe_index *index); - -// Interval (in milliseconds or jiffies) from submission until switching to -// fast path and skipping UDS. 
-extern unsigned int vdo_dedupe_index_timeout_interval; - -// Minimum time interval (in milliseconds) between timer invocations to -// check for requests waiting for UDS that should now time out. -extern unsigned int vdo_dedupe_index_min_timer_interval; - -/** - * Set the interval from submission until switching to fast path and - * skipping UDS. - * - * @param value The number of milliseconds - **/ -void set_vdo_dedupe_index_timeout_interval(unsigned int value); - -/** - * Set the minimum time interval between timer invocations to check for - * requests waiting for UDS that should now time out. - * - * @param value The number of milliseconds - **/ -void set_vdo_dedupe_index_min_timer_interval(unsigned int value); - -#endif /* DEDUPE_INDEX_H */ diff --git a/vdo/delta-index.c b/vdo/delta-index.c new file mode 100644 index 00000000..994ed2ba --- /dev/null +++ b/vdo/delta-index.c @@ -0,0 +1,2618 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright Red Hat + */ +#include "delta-index.h" + +#include "buffer.h" +#include "compiler.h" +#include "config.h" +#include "cpu.h" +#include "errors.h" +#include "hash-utils.h" +#include "logger.h" +#include "memory-alloc.h" +#include "permassert.h" +#include "string-utils.h" +#include "time-utils.h" +#include "type-defs.h" +#include "uds.h" + +/* + * A delta index is a key-value store, where each entry maps an address (the + * key) to a payload (the value). The entries are sorted by address, and only + * the delta between successive addresses is stored in the entry. The + * addresses are assumed to be uniformly distributed, and the deltas are + * therefore exponentially distributed. + * + * The entries could be stored in a single delta list, but for efficiency we + * use multiple delta lists. These lists are stored in a single chunk of + * memory managed by the delta_zone structure. The delta_zone can move the + * data around within its memory, so we never keep any byte pointers into the + * memory. We only keep offsets into the memory. + * + * The delta lists are stored as bit streams. These bit streams are stored in + * little endian order, and all offsets into delta_memory are bit offsets. + * + * All entries are stored as a fixed length payload (the value) followed by a + * variable length key (the delta), and always strictly in little endian + * order. + * + * A collision entry is used when two block names have the same delta list + * address. A collision entry is encoded with DELTA == 0, and has 256 + * extension bits containing the full block name. + * + * The DELTA == 0 encoding usually indicates a collision with the preceding + * entry, but for the first entry in any delta list there is no preceding + * entry, so the DELTA == 0 encoding at the beginning of a delta list + * indicates a normal entry. + * + * The Huffman code is driven by 3 parameters: + * + * MINBITS This is the number of bits in the smallest code + * + * BASE This is the number of values coded using a code of length MINBITS + * + * INCR This is the number of values coded by using one additional bit + * + * These parameters are related by: + * + * BASE + INCR == 1 << MINBITS + * + * When we create an index, we need to know the mean delta. From the mean + * delta, we compute these three parameters. 
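 *
 * (Editor's illustrative aside, not part of the original comment: by the
 * formulas given below, a mean delta of 4096 works out to INCR = 2839,
 * MINBITS = 12, and BASE = 1257, so deltas 0..1256 encode in 12 bits,
 * 1257..4095 in 13 bits, 4096..6934 in 14 bits, and so on.)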
The math for the Huffman code of + * an exponential distribution says that we compute + * + * INCR = log(2) * MEAN_DELTA + * + * Then we find the smallest MINBITS so that + * + * (1 << MINBITS) > INCR + * + * And then + * + * BASE = (1 << MINBITS) - INCR + * + * Now we need a code such that + * + * - The first BASE values code using MINBITS bits. + * - The next INCR values code using MINBITS+1 bits. + * - The next INCR values code using MINBITS+2 bits. + * - The next INCR values code using MINBITS+3 bits. + * - (and so on). + * + * ENCODE(DELTA): + * + * if (DELTA < BASE) { + * put DELTA in MINBITS bits; + * } else { + * T1 = (DELTA - BASE) % INCR + BASE; + * T2 = (DELTA - BASE) / INCR; + * put T1 in MINBITS bits; + * put 0 in T2 bits; + * put 1 in 1 bit; + * } + * + * DECODE(BIT_STREAM): + * + * T1 = next MINBITS bits of stream; + * if (T1 < BASE) { + * DELTA = T1; + * } else { + * Scan bits in the stream until reading a 1, + * setting T2 to the number of 0 bits read; + * DELTA = T2 * INCR + T1; + * } + * + * The bit field utilities that we use on the delta lists assume that it is + * possible to read a few bytes beyond the end of the bit field, so we make + * sure to allocate some extra bytes at the end of memory containing the delta + * lists. Consult the bit utilities documentation for more details. + * + * Note that the decode bit stream code includes a step that skips over 0 bits + * until the first 1 bit is found. A corrupted delta list could cause this + * step to run off the end of the delta list memory. As an extra protection + * against this happening, the guard bytes at the end should be set to all + * ones. + */ + +/* + * The delta_zone structure manages the memory that stores delta lists. + * Because the volume index can contain a million delta lists or more, we + * want to be efficient with the delta list header size. + * + * The delta list information is encoded into 16 bytes per list. The volume + * index delta list memory can easily exceed 4 gigabits, so we must use a + * uint64_t to address the memory. The volume index delta lists average around + * 6 kilobits, so we can represent the size of a delta list with a uint16_t. + * + * The delta memory contains N delta lists, which are guarded by two + * empty delta lists. The valid delta lists are numbered 1 to N, and the + * guard lists are numbered 0 and N+1. + * + * The delta_zone supports two different forms. The mutable form is created + * by initialize_delta_zone(), and is used for the volume index and for open + * chapter indexes. The immutable form is created by + * initialize_delta_zone_page(), and is used for cached chapter index + * pages. The immutable form does not allocate delta list headers or temporary + * offsets, and thus is somewhat more memory efficient. + */ + +/* + * These bit stream and bit field utility routines are used for the delta + * indexes, which are not byte-aligned. + * + * Bits and bytes are numbered in little endian order. Within a byte, bit 0 + * is the least significant bit (0x1), and bit 7 is the most significant bit + * (0x80). Within a bit stream, bit 7 is the most signficant bit of byte 0, + * and bit 8 is the least significant bit of byte 1. Within a byte array, a + * byte's number corresponds to its index in the array. + * + * This implementation assumes that the native machine is little endian, and + * that performance is very important. + */ + +/* This is the number of bits in a uint32_t. 
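 * (Editor's note: the field-size limits defined below follow from the
 * unaligned loads used by the bit accessors: a field may begin up to 7 bits
 * into its first byte, so a single 32-bit load guarantees only
 * (4 - 1) * 8 + 1 = 25 bits and a 64-bit load only (8 - 1) * 8 + 1 = 57.)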
*/ +enum { + UINT32_BITS = sizeof(uint32_t) * CHAR_BIT, +}; + +/* + * This is the largest field size supported by get_field() and set_field(). + * Any field that is larger is not guaranteed to fit in a single byte-aligned + * uint32_t. + */ +enum { + MAX_FIELD_BITS = (sizeof(uint32_t) - 1) * CHAR_BIT + 1, +}; + +/* + * This is the largest field size supported by get_big_field() and + * set_big_field(). Any field that is larger is not guaranteed to fit in a + * single byte-aligned uint64_t. + */ +enum { + MAX_BIG_FIELD_BITS = (sizeof(uint64_t) - 1) * CHAR_BIT + 1, +}; + +/* + * This is the number of guard bytes needed at the end of the memory byte + * array when using the bit utilities. 3 bytes are needed when get_field() and + * set_field() access a field, because they will access some extra bytes past + * the end of the field. 7 bytes are needed when get_big_field() and + * set_big_field() access a big field, for the same reason. Note that + * move_bits() calls get_big_field() and set_big_field(). The definition is + * written to make it clear how it is derived. + */ +enum { + POST_FIELD_GUARD_BYTES = sizeof(uint64_t) - 1, +}; + +/* The number of guard bits that are needed in the tail guard list */ +enum { + GUARD_BITS = POST_FIELD_GUARD_BYTES * CHAR_BIT +}; + +/* + * The maximum size of a single delta list (in bytes). We count guard bytes + * in this value because a buffer of this size can be used with move_bits(). + */ +enum { + DELTA_LIST_MAX_BYTE_COUNT = + ((UINT16_MAX + CHAR_BIT) / CHAR_BIT + POST_FIELD_GUARD_BYTES) +}; + +/* The number of extra bytes and bits needed to store a collision entry */ +enum { + COLLISION_BYTES = UDS_CHUNK_NAME_SIZE, + COLLISION_BITS = COLLISION_BYTES * CHAR_BIT +}; + +/* + * Immutable delta lists are packed into pages containing a header that + * encodes the delta list information into 19 bits per list (64KB bit offset). + */ + +enum { IMMUTABLE_HEADER_SIZE = 19 }; + +/* + * Constants and structures for the saved delta index. "DI" is for + * delta_index, and -##### is a number to increment when the format of the + * data changes. + */ + +enum { + MAGIC_SIZE = 8, +}; + +static const char DELTA_INDEX_MAGIC[] = "DI-00002"; + +struct delta_index_header { + char magic[MAGIC_SIZE]; + uint32_t zone_number; + uint32_t zone_count; + uint32_t first_list; + uint32_t list_count; + uint64_t record_count; + uint64_t collision_count; +}; + +/* + * Header data used for immutable delta index pages. This data is followed by + * the delta list offset table. + */ +struct delta_page_header { + /* Externally-defined nonce */ + uint64_t nonce; + /* The virtual chapter number */ + uint64_t virtual_chapter_number; + /* Index of the first delta list on the page */ + uint16_t first_list; + /* Number of delta lists on the page */ + uint16_t list_count; +} __packed; + +static INLINE uint64_t +get_delta_list_byte_start(const struct delta_list *delta_list) +{ + return delta_list->start / CHAR_BIT; +} + +static INLINE uint16_t +get_delta_list_byte_size(const struct delta_list *delta_list) +{ + unsigned int bit_offset = delta_list->start % CHAR_BIT; + + return DIV_ROUND_UP(bit_offset + delta_list->size, CHAR_BIT); +} + +static void rebalance_delta_zone(const struct delta_zone *delta_zone, + unsigned int first, + unsigned int last) +{ + struct delta_list *delta_list; + uint64_t new_start; + + if (first == last) { + /* Only one list is moving, and we know there is space. 
*/ + delta_list = &delta_zone->delta_lists[first]; + new_start = delta_zone->new_offsets[first]; + if (delta_list->start != new_start) { + uint64_t source; + uint64_t destination; + + source = get_delta_list_byte_start(delta_list); + delta_list->start = new_start; + destination = get_delta_list_byte_start(delta_list); + memmove(delta_zone->memory + destination, + delta_zone->memory + source, + get_delta_list_byte_size(delta_list)); + } + } else { + /* + * There is more than one list. Divide the problem in half, + * and use recursive calls to process each half. Note that + * after this computation, first <= middle, and middle < last. + */ + unsigned int middle = (first + last) / 2; + + delta_list = &delta_zone->delta_lists[middle]; + new_start = delta_zone->new_offsets[middle]; + + /* + * The direction that our middle list is moving determines + * which half of the problem must be processed first. + */ + if (new_start > delta_list->start) { + rebalance_delta_zone(delta_zone, middle + 1, last); + rebalance_delta_zone(delta_zone, first, middle); + } else { + rebalance_delta_zone(delta_zone, first, middle); + rebalance_delta_zone(delta_zone, middle + 1, last); + } + } +} + +/* Move the start of the delta list bit stream without moving the end. */ +static INLINE void move_delta_list_start(struct delta_list *delta_list, + int increment) +{ + delta_list->start += increment; + delta_list->size -= increment; +} + +/* Move the end of the delta list bit stream without moving the start. */ +static INLINE void move_delta_list_end(struct delta_list *delta_list, + int increment) +{ + delta_list->size += increment; +} + +static INLINE size_t get_zone_memory_size(unsigned int zone_count, + size_t memory_size) +{ + size_t zone_size = memory_size / zone_count; + + /* Round up so that each zone is a multiple of 64K in size. */ + enum { + ALLOC_BOUNDARY = 64 * KILOBYTE, + }; + + return (zone_size + ALLOC_BOUNDARY - 1) & -ALLOC_BOUNDARY; +} + +static void empty_delta_lists(struct delta_zone *delta_zone) +{ + uint64_t list_bits; + uint64_t spacing; + uint64_t offset; + unsigned int i; + struct delta_list *delta_lists = delta_zone->delta_lists; + + /* + * Initialize delta lists to be empty. We keep 2 extra delta list + * descriptors, one before the first real entry and one after so that + * we don't need to bounds check the array access when calculating + * preceeding and following gap sizes. + * + * Because the delta list headers are zeroed, the head guard list will + * already be at offset zero and size zero. + * + * The end guard list contains guard bytes so that get_field() and + * get_big_field() can safely read past the end of any byte we are + * interested in. + */ + + /* Zero all the delta list headers. */ + memset(delta_lists, + 0, + (delta_zone->list_count + 2) * sizeof(struct delta_list)); + + /* Set all the bits in the end guard list. */ + list_bits = (uint64_t) delta_zone->size * CHAR_BIT - GUARD_BITS; + delta_lists[delta_zone->list_count + 1].start = list_bits; + delta_lists[delta_zone->list_count + 1].size = GUARD_BITS; + memset(delta_zone->memory + (list_bits / CHAR_BIT), + ~0, + POST_FIELD_GUARD_BYTES); + + /* Evenly space out the real delta lists by setting regular offsets. */ + spacing = list_bits / delta_zone->list_count; + offset = spacing / 2; + for (i = 1; i <= delta_zone->list_count; i++) { + delta_lists[i].start = offset; + offset += spacing; + } + + /* Update the statistics. 
*/ + delta_zone->discard_count += delta_zone->record_count; + delta_zone->record_count = 0; + delta_zone->collision_count = 0; +} + +void empty_delta_index(const struct delta_index *delta_index) +{ + unsigned int z; + + for (z = 0; z < delta_index->zone_count; z++) { + empty_delta_lists(&delta_index->delta_zones[z]); + } +} + +void empty_delta_zone(const struct delta_index *delta_index, + unsigned int zone_number) +{ + empty_delta_lists(&delta_index->delta_zones[zone_number]); +} + +/* Compute the Huffman coding parameters for the given mean delta. */ +static void compute_coding_constants(unsigned int mean_delta, + unsigned short *min_bits, + unsigned int *min_keys, + unsigned int *incr_keys) +{ + /* + * We want to compute the rounded value of log(2) * mean_delta. Since + * we cannot always use floating point, use a really good integer + * approximation. + */ + *incr_keys = (836158UL * mean_delta + 603160UL) / 1206321UL; + *min_bits = compute_bits(*incr_keys + 1); + *min_keys = (1 << *min_bits) - *incr_keys; +} + +static void uninitialize_delta_zone(struct delta_zone *delta_zone) +{ + UDS_FREE(delta_zone->new_offsets); + delta_zone->new_offsets = NULL; + UDS_FREE(delta_zone->delta_lists); + delta_zone->delta_lists = NULL; + UDS_FREE(delta_zone->memory); + delta_zone->memory = NULL; +} + +void uninitialize_delta_index(struct delta_index *delta_index) +{ + unsigned int z; + + if (delta_index->delta_zones == NULL) { + return; + } + + for (z = 0; z < delta_index->zone_count; z++) { + uninitialize_delta_zone(&delta_index->delta_zones[z]); + } + + UDS_FREE(delta_index->delta_zones); + memset(delta_index, 0, sizeof(struct delta_index)); +} + +static int initialize_delta_zone(struct delta_zone *delta_zone, + size_t size, + unsigned int first_list, + unsigned int list_count, + unsigned int mean_delta, + unsigned int payload_bits) +{ + int result; + + result = UDS_ALLOCATE(size, byte, "delta list", &delta_zone->memory); + if (result != UDS_SUCCESS) { + return result; + } + + result = UDS_ALLOCATE(list_count + 2, + uint64_t, + "delta list temp", + &delta_zone->new_offsets); + if (result != UDS_SUCCESS) { + uninitialize_delta_zone(delta_zone); + return result; + } + + /* Allocate the delta lists. 
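 * (Editor's note: the two extra entries hold the head and tail guard lists
 * described above, which bracket the real lists numbered 1 to N.)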
*/ + result = UDS_ALLOCATE(list_count + 2, + struct delta_list, + "delta lists", + &delta_zone->delta_lists); + if (result != UDS_SUCCESS) { + uninitialize_delta_zone(delta_zone); + return result; + } + + compute_coding_constants(mean_delta, + &delta_zone->min_bits, + &delta_zone->min_keys, + &delta_zone->incr_keys); + delta_zone->value_bits = payload_bits; + delta_zone->buffered_writer = NULL; + delta_zone->size = size; + delta_zone->rebalance_time = 0; + delta_zone->rebalance_count = 0; + delta_zone->record_count = 0; + delta_zone->collision_count = 0; + delta_zone->discard_count = 0; + delta_zone->overflow_count = 0; + delta_zone->first_list = first_list; + delta_zone->list_count = list_count; + delta_zone->tag = 'm'; + + empty_delta_lists(delta_zone); + return UDS_SUCCESS; +} + +int initialize_delta_index(struct delta_index *delta_index, + unsigned int zone_count, + unsigned int list_count, + unsigned int mean_delta, + unsigned int payload_bits, + size_t memory_size) +{ + int result; + unsigned int z; + size_t zone_memory; + + result = UDS_ALLOCATE(zone_count, + struct delta_zone, + "Delta Index Zones", + &delta_index->delta_zones); + if (result != UDS_SUCCESS) { + return result; + } + + delta_index->zone_count = zone_count; + delta_index->list_count = list_count; + delta_index->lists_per_zone = DIV_ROUND_UP(list_count, zone_count); + delta_index->mutable = true; + delta_index->tag = 'm'; + + for (z = 0; z < zone_count; z++) { + unsigned int lists_in_zone = delta_index->lists_per_zone; + unsigned int first_list_in_zone = z * lists_in_zone; + + if (z == zone_count - 1) { + /* + * The last zone gets fewer lists if zone_count doesn't + * evenly divide list_count. We'll have an underflow if + * the assertion below doesn't hold. + */ + if (delta_index->list_count <= first_list_in_zone) { + uninitialize_delta_index(delta_index); + return uds_log_error_strerror(UDS_INVALID_ARGUMENT, + "%u delta lists not enough for %u zones", + list_count, + zone_count); + } + lists_in_zone = + delta_index->list_count - first_list_in_zone; + } + + zone_memory = get_zone_memory_size(zone_count, memory_size); + result = initialize_delta_zone(&delta_index->delta_zones[z], + zone_memory, + first_list_in_zone, + lists_in_zone, + mean_delta, + payload_bits); + if (result != UDS_SUCCESS) { + uninitialize_delta_index(delta_index); + return result; + } + } + + return UDS_SUCCESS; +} + +/* Read a bit field from an arbitrary bit boundary. */ +static INLINE unsigned int +get_field(const byte *memory, uint64_t offset, int size) +{ + const void *addr = memory + offset / CHAR_BIT; + + return ((get_unaligned_le32(addr) >> (offset % CHAR_BIT)) & + ((1 << size) - 1)); +} + +/* Write a bit field to an arbitrary bit boundary. */ +static INLINE void +set_field(unsigned int value, byte *memory, uint64_t offset, int size) +{ + void *addr = memory + offset / CHAR_BIT; + int shift = offset % CHAR_BIT; + uint32_t data = get_unaligned_le32(addr); + + data &= ~(((1 << size) - 1) << shift); + data |= value << shift; + put_unaligned_le32(data, addr); +} + +/* Get the bit offset to the immutable delta list header. */ +static INLINE unsigned int get_immutable_header_offset(unsigned int list_number) +{ + return (sizeof(struct delta_page_header) * CHAR_BIT + + list_number * IMMUTABLE_HEADER_SIZE); +} + +/* Get the bit offset to the start of the immutable delta list bit stream. 
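 * (Editor's note: IMMUTABLE_HEADER_SIZE is 19 bits, enough to express any bit
 * offset within a 64 KB page, which is why each per-list header entry in the
 * immutable page is 19 bits wide.)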
*/ +static INLINE unsigned int get_immutable_start(const byte *memory, + unsigned int list_number) +{ + return get_field(memory, + get_immutable_header_offset(list_number), + IMMUTABLE_HEADER_SIZE); +} + +/* Set the bit offset to the start of the immutable delta list bit stream. */ +static INLINE void set_immutable_start(byte *memory, + unsigned int list_number, + unsigned int start) +{ + set_field(start, + memory, + get_immutable_header_offset(list_number), + IMMUTABLE_HEADER_SIZE); +} + +static bool verify_delta_index_page(uint64_t nonce, + uint16_t list_count, + uint64_t expected_nonce, + byte *memory, + size_t memory_size) +{ + unsigned int i; + + /* + * Verify the nonce. A mismatch can happen here during rebuild if we + * haven't written the entire volume at least once. + */ + if (nonce != expected_nonce) { + return false; + } + + /* Verify that the number of delta lists can fit in the page. */ + if (list_count > ((memory_size - sizeof(struct delta_page_header)) * + CHAR_BIT / IMMUTABLE_HEADER_SIZE)) { + return false; + } + + /* + * Verify that the first delta list is immediately after the last delta + * list header. + */ + if (get_immutable_start(memory, 0) != + get_immutable_header_offset(list_count + 1)) { + return false; + } + + /* Verify that the lists are in the correct order. */ + for (i = 0; i < list_count; i++) { + if (get_immutable_start(memory, i) > + get_immutable_start(memory, i + 1)) { + return false; + } + } + + /* + * Verify that the last list ends on the page, and that there is room + * for the post-field guard bits. + */ + if (get_immutable_start(memory, list_count) > + (memory_size - POST_FIELD_GUARD_BYTES) * CHAR_BIT) { + return false; + } + + /* Verify that the guard bytes are correctly set to all ones. */ + for (i = 0; i < POST_FIELD_GUARD_BYTES; i++) { + byte guard_byte; + + guard_byte = memory[memory_size - POST_FIELD_GUARD_BYTES + i]; + if (guard_byte != (byte) ~0) { + return false; + } + } + + /* All verifications passed. */ + return true; +} + +static void +initialize_delta_zone_page(struct delta_zone *delta_zone, + byte *memory, + size_t size, + unsigned int list_count, + unsigned int mean_delta, + unsigned int payload_bits) +{ + compute_coding_constants(mean_delta, + &delta_zone->min_bits, + &delta_zone->min_keys, + &delta_zone->incr_keys); + delta_zone->value_bits = payload_bits; + delta_zone->memory = memory; + delta_zone->delta_lists = NULL; + delta_zone->new_offsets = NULL; + delta_zone->buffered_writer = NULL; + delta_zone->size = size; + delta_zone->rebalance_time = 0; + delta_zone->rebalance_count = 0; + delta_zone->record_count = 0; + delta_zone->collision_count = 0; + delta_zone->discard_count = 0; + delta_zone->overflow_count = 0; + delta_zone->first_list = 0; + delta_zone->list_count = list_count; + delta_zone->tag = 'p'; +} + +/* Initialize a delta index page to refer to a supplied page. 
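 * (Editor's note: the page header is decoded as little endian first and
 * retried as big endian, so pages written with either byte order are
 * accepted; a page failing both checks returns UDS_CORRUPT_DATA without
 * logging, since that is expected during rebuild.)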
*/ +int initialize_delta_index_page(struct delta_index_page *delta_index_page, + uint64_t expected_nonce, + unsigned int mean_delta, + unsigned int payload_bits, + byte *memory, + size_t memory_size) +{ + uint64_t nonce; + uint64_t vcn; + uint64_t first_list; + uint64_t list_count; + struct delta_page_header *header = (struct delta_page_header *) memory; + const byte *nonce_addr = (const byte *) &header->nonce; + const byte *vcn_addr = (const byte *) &header->virtual_chapter_number; + const byte *first_list_addr = (const byte *) &header->first_list; + const byte *list_count_addr = (const byte *) &header->list_count; + + /* First assume that the header is little endian. */ + nonce = get_unaligned_le64(nonce_addr); + vcn = get_unaligned_le64(vcn_addr); + first_list = get_unaligned_le16(first_list_addr); + list_count = get_unaligned_le16(list_count_addr); + if (!verify_delta_index_page(nonce, + list_count, + expected_nonce, + memory, + memory_size)) { + /* If that fails, try big endian. */ + nonce = get_unaligned_be64(nonce_addr); + vcn = get_unaligned_be64(vcn_addr); + first_list = get_unaligned_be16(first_list_addr); + list_count = get_unaligned_be16(list_count_addr); + if (!verify_delta_index_page(nonce, + list_count, + expected_nonce, + memory, + memory_size)) { + /* + * Both attempts failed. Do not log this as an error, + * because it can happen during a rebuild if we haven't + * written the entire volume at least once. + */ + return UDS_CORRUPT_DATA; + } + } + + delta_index_page->delta_index.delta_zones = + &delta_index_page->delta_zone; + delta_index_page->delta_index.zone_count = 1; + delta_index_page->delta_index.list_count = list_count; + delta_index_page->delta_index.lists_per_zone = list_count; + delta_index_page->delta_index.mutable = false; + delta_index_page->delta_index.tag = 'p'; + delta_index_page->virtual_chapter_number = vcn; + delta_index_page->lowest_list_number = first_list; + delta_index_page->highest_list_number = first_list + list_count - 1; + + initialize_delta_zone_page(&delta_index_page->delta_zone, + memory, + memory_size, + list_count, + mean_delta, + payload_bits); + return UDS_SUCCESS; +} + +/* Read a large bit field from an arbitrary bit boundary. */ +static INLINE uint64_t get_big_field(const byte *memory, + uint64_t offset, + int size) +{ + const void *addr = memory + offset / CHAR_BIT; + + return ((get_unaligned_le64(addr) >> (offset % CHAR_BIT)) & + ((1UL << size) - 1)); +} + +/* Write a large bit field to an arbitrary bit boundary. */ +static INLINE void +set_big_field(uint64_t value, byte *memory, uint64_t offset, int size) +{ + void *addr = memory + offset / CHAR_BIT; + int shift = offset % CHAR_BIT; + uint64_t data = get_unaligned_le64(addr); + + data &= ~(((1UL << size) - 1) << shift); + data |= value << shift; + put_unaligned_le64(data, addr); +} + +/* Set a sequence of bits to all zeros. */ +static INLINE void set_zero(byte *memory, uint64_t offset, int size) +{ + if (size > 0) { + byte *addr = memory + offset / CHAR_BIT; + int shift = offset % CHAR_BIT; + int count = size + shift > CHAR_BIT ? CHAR_BIT - shift : size; + + *addr++ &= ~(((1 << count) - 1) << shift); + for (size -= count; size > CHAR_BIT; size -= CHAR_BIT) { + *addr++ = 0; + } + + if (size > 0) { + *addr &= 0xFF << size; + } + } +} + +/* + * Move several bits from a higher to a lower address, moving the lower + * addressed bits first. The size and memory offsets are measured in bits. 
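 * (Editor's note: moving the lower-addressed bits first keeps the copy
 * correct when the source and destination ranges overlap and the data is
 * moving toward lower addresses, in the same way memmove() picks its copy
 * direction.)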
+ */ +static void move_bits_down(const byte *from, + uint64_t from_offset, + byte *to, + uint64_t to_offset, + int size) +{ + const byte *source; + byte *destination; + int offset; + int count; + uint64_t field; + + /* Start by moving one field that ends on a to int boundary. */ + count = (MAX_BIG_FIELD_BITS - + ((to_offset + MAX_BIG_FIELD_BITS) % UINT32_BITS)); + field = get_big_field(from, from_offset, count); + set_big_field(field, to, to_offset, count); + from_offset += count; + to_offset += count; + size -= count; + + /* + * Now do the main loop to copy 32 bit chunks that are int-aligned at + * the destination. + */ + offset = from_offset % UINT32_BITS; + source = from + (from_offset - offset) / CHAR_BIT; + destination = to + to_offset / CHAR_BIT; + while (size > MAX_BIG_FIELD_BITS) { + put_unaligned_le32(get_unaligned_le64(source) >> offset, + destination); + source += sizeof(uint32_t); + destination += sizeof(uint32_t); + from_offset += UINT32_BITS; + to_offset += UINT32_BITS; + size -= UINT32_BITS; + } + + /* Finish up by moving any remaining bits. */ + if (size > 0) { + field = get_big_field(from, from_offset, size); + set_big_field(field, to, to_offset, size); + } +} + +/* + * Move several bits from a lower to a higher address, moving the higher + * addressed bits first. The size and memory offsets are measured in bits. + */ +static void move_bits_up(const byte *from, + uint64_t from_offset, + byte *to, + uint64_t to_offset, + int size) +{ + const byte *source; + byte *destination; + int offset; + int count; + uint64_t field; + + /* Start by moving one field that begins on a destination int boundary. */ + count = (to_offset + size) % UINT32_BITS; + if (count > 0) { + size -= count; + field = get_big_field(from, from_offset + size, count); + set_big_field(field, to, to_offset + size, count); + } + + /* + * Now do the main loop to copy 32 bit chunks that are int-aligned at + * the destination. + */ + offset = (from_offset + size) % UINT32_BITS; + source = from + (from_offset + size - offset) / CHAR_BIT; + destination = to + (to_offset + size) / CHAR_BIT; + while (size > MAX_BIG_FIELD_BITS) { + source -= sizeof(uint32_t); + destination -= sizeof(uint32_t); + size -= UINT32_BITS; + put_unaligned_le32(get_unaligned_le64(source) >> offset, + destination); + } + + /* Finish up by moving any remaining bits. */ + if (size > 0) { + field = get_big_field(from, from_offset, size); + set_big_field(field, to, to_offset, size); + } +} + +/* + * Move bits from one field to another. When the fields overlap, behave as if + * we first move all the bits from the source to a temporary value, and then + * move all the bits from the temporary value to the destination. The size and + * memory offsets are measured in bits. + */ +static void move_bits(const byte *from, + uint64_t from_offset, + byte *to, + uint64_t to_offset, + int size) +{ + uint64_t field; + + /* A small move doesn't require special handling. */ + if (size <= MAX_BIG_FIELD_BITS) { + if (size > 0) { + field = get_big_field(from, from_offset, size); + set_big_field(field, to, to_offset, size); + } + + return; + } + + if (from_offset > to_offset) { + move_bits_down(from, from_offset, to, to_offset, size); + } else { + move_bits_up(from, from_offset, to, to_offset, size); + } +} + +/** + * Pack delta lists from a mutable delta index into an immutable delta index + * page. A range of delta lists (starting with a specified list index) is + * copied from the mutable delta index into a memory page used in the immutable + * index. 
The number of lists copied onto the page is returned in list_count. + **/ +int pack_delta_index_page(const struct delta_index *delta_index, + uint64_t header_nonce, + byte *memory, + size_t memory_size, + uint64_t virtual_chapter_number, + unsigned int first_list, + unsigned int *list_count) +{ + const struct delta_zone *delta_zone; + struct delta_list *delta_lists; + unsigned int max_lists; + unsigned int n_lists = 0; + unsigned int offset; + unsigned int i; + int free_bits; + int bits; + struct delta_page_header *header; + + delta_zone = &delta_index->delta_zones[0]; + delta_lists = &delta_zone->delta_lists[first_list + 1]; + max_lists = delta_index->list_count - first_list; + + /* + * Compute how many lists will fit on the page. Subtract the size of + * the fixed header, one delta list offset, and the guard bytes from + * the page size to determine how much space is available for delta + * lists. + */ + free_bits = memory_size * CHAR_BIT; + free_bits -= get_immutable_header_offset(1); + free_bits -= POST_FIELD_GUARD_BYTES * CHAR_BIT; + if (free_bits < IMMUTABLE_HEADER_SIZE) { + /* This page is too small to store any delta lists. */ + return uds_log_error_strerror(UDS_OVERFLOW, + "Chapter Index Page of %zu bytes is too small", + memory_size); + } + + while (n_lists < max_lists) { + /* Each list requires a delta list offset and the list data. */ + bits = IMMUTABLE_HEADER_SIZE + delta_lists[n_lists].size; + if (bits > free_bits) { + break; + } + + n_lists++; + free_bits -= bits; + } + + *list_count = n_lists; + + header = (struct delta_page_header *) memory; + put_unaligned_le64(header_nonce, (byte *) &header->nonce); + put_unaligned_le64(virtual_chapter_number, + (byte *) &header->virtual_chapter_number); + put_unaligned_le16(first_list, (byte *) &header->first_list); + put_unaligned_le16(n_lists, (byte *) &header->list_count); + + /* Construct the delta list offset table. */ + offset = get_immutable_header_offset(n_lists + 1); + set_immutable_start(memory, 0, offset); + for (i = 0; i < n_lists; i++) { + offset += delta_lists[i].size; + set_immutable_start(memory, i + 1, offset); + } + + /* Copy the delta list data onto the memory page. */ + for (i = 0; i < n_lists; i++) { + move_bits(delta_zone->memory, + delta_lists[i].start, + memory, + get_immutable_start(memory, i), + delta_lists[i].size); + } + + /* Set all the bits in the guard bytes. 
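 * (Editor's worked example, assuming a 4096-byte page: of its 32768 bits,
 * 160 hold the fixed header, 19 the first offset entry, and 56 these trailing
 * guard bytes, leaving 32533 bits; each list stored then costs
 * IMMUTABLE_HEADER_SIZE plus its own size in bits.)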
*/ + memset(memory + memory_size - POST_FIELD_GUARD_BYTES, + ~0, + POST_FIELD_GUARD_BYTES); + return UDS_SUCCESS; +} + +void set_delta_index_tag(struct delta_index *delta_index, byte tag) +{ + unsigned int z; + + delta_index->tag = tag; + for (z = 0; z < delta_index->zone_count; z++) { + delta_index->delta_zones[z].tag = tag; + } +} + +static int __must_check +decode_delta_index_header(struct buffer *buffer, + struct delta_index_header *header) +{ + int result; + + result = get_bytes_from_buffer(buffer, MAGIC_SIZE, &header->magic); + if (result != UDS_SUCCESS) { + return result; + } + + result = get_uint32_le_from_buffer(buffer, &header->zone_number); + if (result != UDS_SUCCESS) { + return result; + } + + result = get_uint32_le_from_buffer(buffer, &header->zone_count); + if (result != UDS_SUCCESS) { + return result; + } + + result = get_uint32_le_from_buffer(buffer, &header->first_list); + if (result != UDS_SUCCESS) { + return result; + } + + result = get_uint32_le_from_buffer(buffer, &header->list_count); + if (result != UDS_SUCCESS) { + return result; + } + + result = get_uint64_le_from_buffer(buffer, &header->record_count); + if (result != UDS_SUCCESS) { + return result; + } + + result = get_uint64_le_from_buffer(buffer, &header->collision_count); + if (result != UDS_SUCCESS) { + return result; + } + + result = ASSERT_LOG_ONLY(content_length(buffer) == 0, + "%zu bytes decoded of %zu expected", + buffer_length(buffer) - content_length(buffer), + buffer_length(buffer)); + return result; +} + +static int __must_check +read_delta_index_header(struct buffered_reader *reader, + struct delta_index_header *header) +{ + int result; + struct buffer *buffer; + + result = make_buffer(sizeof(*header), &buffer); + if (result != UDS_SUCCESS) { + return result; + } + + result = read_from_buffered_reader(reader, + get_buffer_contents(buffer), + buffer_length(buffer)); + if (result != UDS_SUCCESS) { + free_buffer(UDS_FORGET(buffer)); + return uds_log_warning_strerror(result, + "failed to read delta index header"); + } + + result = reset_buffer_end(buffer, buffer_length(buffer)); + if (result != UDS_SUCCESS) { + free_buffer(UDS_FORGET(buffer)); + return result; + } + + result = decode_delta_index_header(buffer, header); + free_buffer(UDS_FORGET(buffer)); + return result; +} + +/* Compute the new offsets of the delta lists. 
*/ +static void compute_new_list_offsets(struct delta_zone *delta_zone, + unsigned int growing_index, + size_t growing_size, + size_t used_space) +{ + size_t spacing; + unsigned int i; + struct delta_list *delta_lists = delta_zone->delta_lists; + unsigned int tail_guard_index = delta_zone->list_count + 1; + + spacing = (delta_zone->size - used_space) / delta_zone->list_count; + delta_zone->new_offsets[0] = 0; + for (i = 0; i <= delta_zone->list_count; i++) { + delta_zone->new_offsets[i + 1] = + (delta_zone->new_offsets[i] + + get_delta_list_byte_size(&delta_lists[i]) + spacing); + delta_zone->new_offsets[i] *= CHAR_BIT; + delta_zone->new_offsets[i] += + delta_lists[i].start % CHAR_BIT; + if (i == 0) { + delta_zone->new_offsets[i + 1] -= spacing / 2; + } + if (i + 1 == growing_index) { + delta_zone->new_offsets[i + 1] += growing_size; + } + } + + delta_zone->new_offsets[tail_guard_index] = + (delta_zone->size * CHAR_BIT - + delta_lists[tail_guard_index].size); +} + +static void rebalance_lists(struct delta_zone *delta_zone) +{ + struct delta_list *delta_lists; + unsigned int i; + size_t used_space = 0; + + /* Extend and balance memory to receive the delta lists */ + delta_lists = delta_zone->delta_lists; + for (i = 0; i <= delta_zone->list_count + 1; i++) { + used_space += get_delta_list_byte_size(&delta_lists[i]); + } + + compute_new_list_offsets(delta_zone, 0, 0, used_space); + for (i = 1; i <= delta_zone->list_count + 1; i++) { + delta_lists[i].start = delta_zone->new_offsets[i]; + } +} + +/* Start restoring a delta index from multiple input streams. */ +int start_restoring_delta_index(struct delta_index *delta_index, + struct buffered_reader **buffered_readers, + unsigned int reader_count) +{ + int result; + unsigned int zone_count = reader_count; + unsigned long record_count = 0; + unsigned long collision_count = 0; + unsigned int first_list[MAX_ZONES]; + unsigned int list_count[MAX_ZONES]; + unsigned int z; + unsigned int list_next = 0; + const struct delta_zone *delta_zone; + + /* Read and validate each header. 
*/ + for (z = 0; z < zone_count; z++) { + struct delta_index_header header; + + result = read_delta_index_header(buffered_readers[z], &header); + if (result != UDS_SUCCESS) { + return uds_log_warning_strerror(result, + "failed to read delta index header"); + } + + if (memcmp(header.magic, DELTA_INDEX_MAGIC, MAGIC_SIZE) != 0) { + return uds_log_warning_strerror(UDS_CORRUPT_DATA, + "delta index file has bad magic number"); + } + + if (zone_count != header.zone_count) { + return uds_log_warning_strerror(UDS_CORRUPT_DATA, + "delta index files contain mismatched zone counts (%u,%u)", + zone_count, + header.zone_count); + } + + if (header.zone_number >= zone_count) { + return uds_log_warning_strerror(UDS_CORRUPT_DATA, + "delta index files contains zone %u of %u zones", + header.zone_number, + zone_count); + } + if (header.zone_number != z) { + return uds_log_warning_strerror(UDS_CORRUPT_DATA, + "delta index zone %u found in slot %u", + header.zone_number, + z); + } + + first_list[z] = header.first_list; + list_count[z] = header.list_count; + record_count += header.record_count; + collision_count += header.collision_count; + + if (first_list[z] != list_next) { + return uds_log_warning_strerror(UDS_CORRUPT_DATA, + "delta index file for zone %u starts with list %u instead of list %u", + z, + first_list[z], + list_next); + } + + list_next += list_count[z]; + } + + if (list_next != delta_index->list_count) { + return uds_log_warning_strerror(UDS_CORRUPT_DATA, + "delta index files contain %u delta lists instead of %u delta lists", + list_next, + delta_index->list_count); + } + + if (collision_count > record_count) { + return uds_log_warning_strerror(UDS_CORRUPT_DATA, + "delta index files contain %ld collisions and %ld records", + collision_count, + record_count); + } + + empty_delta_index(delta_index); + delta_index->delta_zones[0].record_count = record_count; + delta_index->delta_zones[0].collision_count = collision_count; + + /* Read the delta lists and distribute them to the proper zones. */ + for (z = 0; z < zone_count; z++) { + unsigned int i; + + delta_index->load_lists[z] = 0; + for (i = 0; i < list_count[z]; i++) { + uint16_t delta_list_size; + unsigned int list_number; + unsigned int zone_number; + byte size_data[sizeof(uint16_t)]; + + result = read_from_buffered_reader(buffered_readers[z], + size_data, + sizeof(size_data)); + if (result != UDS_SUCCESS) { + return uds_log_warning_strerror(result, + "failed to read delta index size"); + } + + delta_list_size = get_unaligned_le16(size_data); + if (delta_list_size > 0) { + delta_index->load_lists[z] += 1; + } + + list_number = first_list[z] + i; + zone_number = get_delta_zone_number(delta_index, + list_number); + delta_zone = &delta_index->delta_zones[zone_number]; + list_number -= delta_zone->first_list; + delta_zone->delta_lists[list_number + 1].size = + delta_list_size; + } + } + + /* Prepare each zone to start receiving the delta list data. 
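 * (Editor's note: rebalance_lists() lays the still-empty lists out at their
 * recorded sizes so that finish_restoring_delta_index() can copy each saved
 * bit stream directly into place.)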
*/ + for (z = 0; z < delta_index->zone_count; z++) { + rebalance_lists(&delta_index->delta_zones[z]); + } + return UDS_SUCCESS; +} + +static int +restore_delta_list_to_zone(struct delta_zone *delta_zone, + const struct delta_list_save_info *save_info, + const byte *data) +{ + struct delta_list *delta_list; + unsigned int bit_count; + unsigned int byte_count; + unsigned int list_number = save_info->index - delta_zone->first_list; + + if (list_number >= delta_zone->list_count) { + return uds_log_warning_strerror(UDS_CORRUPT_DATA, + "invalid delta list number %u not in range [%u,%u)", + save_info->index, + delta_zone->first_list, + delta_zone->first_list + + delta_zone->list_count); + } + + delta_list = &delta_zone->delta_lists[list_number + 1]; + if (delta_list->size == 0) { + return uds_log_warning_strerror(UDS_CORRUPT_DATA, + "unexpected delta list number %u", + save_info->index); + } + + bit_count = (unsigned int) save_info->bit_offset + delta_list->size; + byte_count = DIV_ROUND_UP(bit_count, CHAR_BIT); + if (save_info->byte_count != byte_count) { + return uds_log_warning_strerror(UDS_CORRUPT_DATA, + "unexpected delta list size %u != %u", + save_info->byte_count, + byte_count); + } + + move_bits(data, + save_info->bit_offset, + delta_zone->memory, + delta_list->start, + delta_list->size); + return UDS_SUCCESS; +} + +static int __must_check +read_delta_list_save_info(struct buffered_reader *reader, + struct delta_list_save_info *save_info) +{ + int result; + byte buffer[sizeof(struct delta_list_save_info)]; + + result = read_from_buffered_reader(reader, buffer, sizeof(buffer)); + if (result != UDS_SUCCESS) { + return result; + } + + save_info->tag = buffer[0]; + save_info->bit_offset = buffer[1]; + save_info->byte_count = get_unaligned_le16(&buffer[2]); + save_info->index = get_unaligned_le32(&buffer[4]); + return result; +} + +static int read_saved_delta_list(struct delta_list_save_info *save_info, + struct buffered_reader *buffered_reader) +{ + int result; + + result = read_delta_list_save_info(buffered_reader, save_info); + if (result != UDS_SUCCESS) { + return uds_log_warning_strerror(result, + "failed to read delta list data"); + } + + if ((save_info->bit_offset >= CHAR_BIT) || + (save_info->byte_count > DELTA_LIST_MAX_BYTE_COUNT)) { + return uds_log_warning_strerror(UDS_CORRUPT_DATA, + "corrupt delta list data"); + } + + return UDS_SUCCESS; +} + +static int restore_delta_list_data(struct delta_index *delta_index, + unsigned int load_zone, + struct buffered_reader *buffered_reader, + byte *data) +{ + int result; + struct delta_list_save_info save_info = { 0 }; + unsigned int new_zone; + + result = read_saved_delta_list(&save_info, buffered_reader); + if (result != UDS_SUCCESS) { + return result; + } + + /* Make sure the data is intended for this delta index. */ + if (save_info.tag != delta_index->tag) { + return UDS_CORRUPT_DATA; + } + + if (save_info.index >= delta_index->list_count) { + return uds_log_warning_strerror(UDS_CORRUPT_DATA, + "invalid delta list number %u of %u", + save_info.index, + delta_index->list_count); + } + + result = read_from_buffered_reader(buffered_reader, data, + save_info.byte_count); + if (result != UDS_SUCCESS) { + return uds_log_warning_strerror(result, + "failed to read delta list data"); + } + + delta_index->load_lists[load_zone] -= 1; + new_zone = get_delta_zone_number(delta_index, save_info.index); + return restore_delta_list_to_zone(&delta_index->delta_zones[new_zone], + &save_info, + data); +} + +/* Restore delta lists from saved data. 
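 * (Editor's note: a read failure stops restoration of that zone's stream, but
 * the remaining readers are still processed and an error is still returned at
 * the end.)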
*/ +int finish_restoring_delta_index(struct delta_index *delta_index, + struct buffered_reader **buffered_readers, + unsigned int reader_count) +{ + int result; + int saved_result = UDS_SUCCESS; + unsigned int z; + byte *data; + + result = UDS_ALLOCATE(DELTA_LIST_MAX_BYTE_COUNT, + byte, + __func__, + &data); + if (result != UDS_SUCCESS) { + return result; + } + + for (z = 0; z < reader_count; z++) { + while (delta_index->load_lists[z] > 0) { + result = restore_delta_list_data(delta_index, + z, + buffered_readers[z], + data); + if (result != UDS_SUCCESS) { + saved_result = result; + break; + } + } + } + + UDS_FREE(data); + return saved_result; +} + +void abort_restoring_delta_index(const struct delta_index *delta_index) +{ + unsigned int z; + + for (z = 0; z < delta_index->zone_count; z++) { + empty_delta_lists(&delta_index->delta_zones[z]); + } +} + +int check_guard_delta_lists(struct buffered_reader **buffered_readers, + unsigned int reader_count) +{ + int result; + unsigned int z; + struct delta_list_save_info save_info; + + for (z = 0; z < reader_count; z++) { + result = read_delta_list_save_info(buffered_readers[z], + &save_info); + if (result != UDS_SUCCESS) { + return result; + } + + if (save_info.tag != 'z') { + return UDS_CORRUPT_DATA; + } + } + + return UDS_SUCCESS; +} + +static int __must_check +encode_delta_index_header(struct buffer *buffer, + struct delta_index_header *header) +{ + int result; + + result = put_bytes(buffer, MAGIC_SIZE, DELTA_INDEX_MAGIC); + if (result != UDS_SUCCESS) { + return result; + } + + result = put_uint32_le_into_buffer(buffer, header->zone_number); + if (result != UDS_SUCCESS) { + return result; + } + + result = put_uint32_le_into_buffer(buffer, header->zone_count); + if (result != UDS_SUCCESS) { + return result; + } + + result = put_uint32_le_into_buffer(buffer, header->first_list); + if (result != UDS_SUCCESS) { + return result; + } + + result = put_uint32_le_into_buffer(buffer, header->list_count); + if (result != UDS_SUCCESS) { + return result; + } + + result = put_uint64_le_into_buffer(buffer, header->record_count); + if (result != UDS_SUCCESS) { + return result; + } + + result = put_uint64_le_into_buffer(buffer, header->collision_count); + if (result != UDS_SUCCESS) { + return result; + } + + return ASSERT_LOG_ONLY(content_length(buffer) == sizeof(*header), + "%zu bytes encoded of %zu expected", + content_length(buffer), + sizeof(*header)); +} + +static int __must_check +write_delta_list_save_info(struct buffered_writer *buffered_writer, + struct delta_list_save_info *save_info) +{ + byte buffer[sizeof(struct delta_list_save_info)]; + + buffer[0] = save_info->tag; + buffer[1] = save_info->bit_offset; + put_unaligned_le16(save_info->byte_count, &buffer[2]); + put_unaligned_le32(save_info->index, &buffer[4]); + return write_to_buffered_writer(buffered_writer, buffer, + sizeof(buffer)); +} + +static int flush_delta_list(struct delta_zone *delta_zone, + unsigned int flush_index) +{ + struct delta_list *delta_list; + struct delta_list_save_info save_info; + int result; + + delta_list = &delta_zone->delta_lists[flush_index + 1]; + save_info.tag = delta_zone->tag; + save_info.bit_offset = delta_list->start % CHAR_BIT; + save_info.byte_count = get_delta_list_byte_size(delta_list); + save_info.index = delta_zone->first_list + flush_index; + + result = write_delta_list_save_info(delta_zone->buffered_writer, + &save_info); + if (result != UDS_SUCCESS) { + uds_log_warning_strerror(result, + "failed to write delta list memory"); + return result; + } + + 
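	/*
	 * Editor's aside (not part of the patch): each saved list is an 8-byte
	 * delta_list_save_info record (tag, bit offset, 16-bit byte count, and
	 * 32-bit list index, all little endian) followed by the byte_count
	 * bytes of list memory written here.
	 */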
result = write_to_buffered_writer(delta_zone->buffered_writer, + delta_zone->memory + get_delta_list_byte_start(delta_list), + save_info.byte_count); + if (result != UDS_SUCCESS) { + uds_log_warning_strerror(result, + "failed to write delta list memory"); + } + + return result; +} + +/* Start saving a delta index zone to a buffered output stream. */ +int start_saving_delta_index(const struct delta_index *delta_index, + unsigned int zone_number, + struct buffered_writer *buffered_writer) +{ + int result; + unsigned int i; + struct buffer *buffer; + struct delta_zone *delta_zone; + struct delta_index_header header; + + delta_zone = &delta_index->delta_zones[zone_number]; + memcpy(header.magic, DELTA_INDEX_MAGIC, MAGIC_SIZE); + header.zone_number = zone_number; + header.zone_count = delta_index->zone_count; + header.first_list = delta_zone->first_list; + header.list_count = delta_zone->list_count; + header.record_count = delta_zone->record_count; + header.collision_count = delta_zone->collision_count; + + result = make_buffer(sizeof(struct delta_index_header), &buffer); + if (result != UDS_SUCCESS) { + return result; + } + + result = encode_delta_index_header(buffer, &header); + if (result != UDS_SUCCESS) { + free_buffer(UDS_FORGET(buffer)); + return result; + } + + result = write_to_buffered_writer(buffered_writer, + get_buffer_contents(buffer), + content_length(buffer)); + free_buffer(UDS_FORGET(buffer)); + if (result != UDS_SUCCESS) { + return uds_log_warning_strerror(result, + "failed to write delta index header"); + } + + for (i = 0; i < delta_zone->list_count; i++) { + byte data[sizeof(uint16_t)]; + struct delta_list *delta_list; + + delta_list = &delta_zone->delta_lists[i + 1]; + put_unaligned_le16(delta_list->size, data); + result = write_to_buffered_writer(buffered_writer, + data, + sizeof(data)); + if (result != UDS_SUCCESS) { + return uds_log_warning_strerror(result, + "failed to write delta list size"); + } + } + + delta_zone->buffered_writer = buffered_writer; + return UDS_SUCCESS; +} + +int finish_saving_delta_index(const struct delta_index *delta_index, + unsigned int zone_number) +{ + int result; + int first_error = UDS_SUCCESS; + unsigned int i; + struct delta_zone *delta_zone; + struct delta_list *delta_list; + + delta_zone = &delta_index->delta_zones[zone_number]; + for (i = 0; i < delta_zone->list_count;i++) { + delta_list = &delta_zone->delta_lists[i + 1]; + if (delta_list->size > 0) { + result = flush_delta_list(delta_zone, i); + if ((result != UDS_SUCCESS) && + (first_error == UDS_SUCCESS)) { + first_error = result; + } + } + } + + delta_zone->buffered_writer = NULL; + return first_error; +} + +int write_guard_delta_list(struct buffered_writer *buffered_writer) +{ + int result; + struct delta_list_save_info save_info; + + save_info.tag = 'z'; + save_info.bit_offset = 0; + save_info.byte_count = 0; + save_info.index = 0; + result = write_to_buffered_writer(buffered_writer, + (const byte *) &save_info, + sizeof(struct delta_list_save_info)); + if (result != UDS_SUCCESS) { + uds_log_warning_strerror(result, + "failed to write guard delta list"); + } + return result; +} + +size_t compute_delta_index_save_bytes(unsigned int list_count, + size_t memory_size) +{ + /* One zone will use at least as much memory as other zone counts. 
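 * (Editor's note: sizing the estimate for a single zone therefore gives a
 * safe upper bound regardless of how many zones the index actually uses.)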
*/ + return (sizeof(struct delta_index_header) + + list_count * (sizeof(struct delta_list_save_info) + 1) + + get_zone_memory_size(1, memory_size)); +} + +static int assert_not_at_end(const struct delta_index_entry *delta_entry) +{ + return ASSERT_WITH_ERROR_CODE(!delta_entry->at_end, + UDS_BAD_STATE, + "operation is invalid because the list entry is at the end of the delta list"); +} + +static void prefetch_delta_list(const struct delta_zone *delta_zone, + const struct delta_list *delta_list) +{ + uint64_t memory_offset = delta_list->start / CHAR_BIT; + const byte *addr = &delta_zone->memory[memory_offset]; + unsigned int size = delta_list->size / CHAR_BIT; + + prefetch_range(addr, size, false); +} + +/* + * Prepare to search for an entry in the specified delta list. + * + * This is always the first function to be called when dealing with delta index + * entries. It is always followed by calls to next_delta_index_entry() to + * iterate through a delta list. The fields of the delta_index_entry argument + * will be set up for iteration, but will not contain an entry from the list. + */ +int start_delta_index_search(const struct delta_index *delta_index, + unsigned int list_number, + unsigned int key, + struct delta_index_entry *delta_entry) +{ + int result; + unsigned int zone_number; + struct delta_zone *delta_zone; + struct delta_list *delta_list; + + result = ASSERT_WITH_ERROR_CODE((list_number < delta_index->list_count), + UDS_CORRUPT_DATA, + "Delta list number (%u) is out of range (%u)", + list_number, + delta_index->list_count); + if (result != UDS_SUCCESS) { + return result; + } + + zone_number = get_delta_zone_number(delta_index, list_number); + delta_zone = &delta_index->delta_zones[zone_number]; + list_number -= delta_zone->first_list; + result = ASSERT_WITH_ERROR_CODE((list_number < delta_zone->list_count), + UDS_CORRUPT_DATA, + "Delta list number (%u) is out of range (%u) for zone (%u)", + list_number, + delta_zone->list_count, + zone_number); + if (result != UDS_SUCCESS) { + return result; + } + + if (delta_index->mutable) { + delta_list = &delta_zone->delta_lists[list_number + 1]; + } else { + unsigned int end_offset; + + /* + * Translate the immutable delta list header into a temporary + * full delta list header. + */ + delta_list = &delta_entry->temp_delta_list; + delta_list->start = + get_immutable_start(delta_zone->memory, list_number); + end_offset = get_immutable_start(delta_zone->memory, + list_number + 1); + delta_list->size = end_offset - delta_list->start; + delta_list->save_key = 0; + delta_list->save_offset = 0; + } + + if (key > delta_list->save_key) { + delta_entry->key = delta_list->save_key; + delta_entry->offset = delta_list->save_offset; + } else { + delta_entry->key = 0; + delta_entry->offset = 0; + if (key == 0) { + /* + * This usually means we're about to walk the entire + * delta list, so get all of it into the CPU cache. + */ + prefetch_delta_list(delta_zone, delta_list); + } + } + + delta_entry->at_end = false; + delta_entry->delta_zone = delta_zone; + delta_entry->delta_list = delta_list; + delta_entry->entry_bits = 0; + delta_entry->is_collision = false; + delta_entry->list_number = list_number; + delta_entry->list_overflow = false; + delta_entry->value_bits = delta_zone->value_bits; + return UDS_SUCCESS; +} + +static INLINE uint64_t +get_delta_entry_offset(const struct delta_index_entry *delta_entry) +{ + return delta_entry->delta_list->start + delta_entry->offset; +} + +/* + * Decode a delta index entry delta value. 
The delta_index_entry basically + * describes the previous list entry, and has had its offset field changed to + * point to the subsequent entry. We decode the bit stream and update the + * delta_list_entry to describe the entry. + */ +static INLINE void decode_delta(struct delta_index_entry *delta_entry) +{ + int key_bits; + unsigned int delta; + const struct delta_zone *delta_zone = delta_entry->delta_zone; + const byte *memory = delta_zone->memory; + uint64_t delta_offset = + get_delta_entry_offset(delta_entry) + delta_entry->value_bits; + const byte *addr = memory + delta_offset / CHAR_BIT; + int offset = delta_offset % CHAR_BIT; + uint32_t data = get_unaligned_le32(addr) >> offset; + + addr += sizeof(uint32_t); + key_bits = delta_zone->min_bits; + delta = data & ((1 << key_bits) - 1); + if (delta >= delta_zone->min_keys) { + data >>= key_bits; + if (data == 0) { + key_bits = sizeof(uint32_t) * CHAR_BIT - offset; + while ((data = get_unaligned_le32(addr)) == 0) { + addr += sizeof(uint32_t); + key_bits += sizeof(uint32_t) * CHAR_BIT; + } + } + key_bits += ffs(data); + delta += ((key_bits - delta_zone->min_bits - 1) * + delta_zone->incr_keys); + } + delta_entry->delta = delta; + delta_entry->key += delta; + + /* Check for a collision, a delta of zero after the start. */ + if (unlikely((delta == 0) && (delta_entry->offset > 0))) { + delta_entry->is_collision = true; + delta_entry->entry_bits = + delta_entry->value_bits + key_bits + COLLISION_BITS; + } else { + delta_entry->is_collision = false; + delta_entry->entry_bits = delta_entry->value_bits + key_bits; + } +} + +noinline int next_delta_index_entry(struct delta_index_entry *delta_entry) +{ + int result; + const struct delta_list *delta_list; + unsigned int next_offset; + unsigned int size; + + result = assert_not_at_end(delta_entry); + if (result != UDS_SUCCESS) { + return result; + } + + delta_list = delta_entry->delta_list; + delta_entry->offset += delta_entry->entry_bits; + size = delta_list->size; + if (unlikely(delta_entry->offset >= size)) { + delta_entry->at_end = true; + delta_entry->delta = 0; + delta_entry->is_collision = false; + return ASSERT_WITH_ERROR_CODE((delta_entry->offset == size), + UDS_CORRUPT_DATA, + "next offset past end of delta list"); + } + + decode_delta(delta_entry); + + next_offset = delta_entry->offset + delta_entry->entry_bits; + if (next_offset > size) { + /* + * This is not an assertion because + * validate_chapter_index_page() wants to handle this error. 
+ */ + uds_log_warning("Decoded past the end of the delta list"); + return UDS_CORRUPT_DATA; + } + + return UDS_SUCCESS; +} + +int remember_delta_index_offset(const struct delta_index_entry *delta_entry) +{ + int result; + struct delta_list *delta_list = delta_entry->delta_list; + + result = ASSERT(!delta_entry->is_collision, + "entry is not a collision"); + if (result != UDS_SUCCESS) { + return result; + } + + delta_list->save_key = delta_entry->key - delta_entry->delta; + delta_list->save_offset = delta_entry->offset; + return UDS_SUCCESS; +} + +static void set_delta(struct delta_index_entry *delta_entry, unsigned int delta) +{ + const struct delta_zone *delta_zone = delta_entry->delta_zone; + int key_bits = (delta_zone->min_bits + + ((delta_zone->incr_keys - + delta_zone->min_keys + delta) / + delta_zone->incr_keys)); + + delta_entry->delta = delta; + delta_entry->entry_bits = delta_entry->value_bits + key_bits; +} + +static void set_collision(struct delta_index_entry *delta_entry) +{ + delta_entry->is_collision = true; + delta_entry->entry_bits += COLLISION_BITS; +} + +/* Get the bit offset of the collision field of an entry. */ +static INLINE uint64_t +get_collision_offset(const struct delta_index_entry *entry) +{ + return (get_delta_entry_offset(entry) + entry->entry_bits - + COLLISION_BITS); +} + +static void get_collision_name(const struct delta_index_entry *entry, + byte *name) +{ + uint64_t offset = get_collision_offset(entry); + const byte *addr = entry->delta_zone->memory + offset / CHAR_BIT; + int size = COLLISION_BYTES; + int shift = offset % CHAR_BIT; + + while (--size >= 0) { + *name++ = get_unaligned_le16(addr++) >> shift; + } +} + +static void set_collision_name(const struct delta_index_entry *entry, + const byte *name) +{ + uint64_t offset = get_collision_offset(entry); + byte *addr = entry->delta_zone->memory + offset / CHAR_BIT; + int size = COLLISION_BYTES; + int shift = offset % CHAR_BIT; + uint16_t mask = ~((uint16_t) 0xFF << shift); + uint16_t data; + + while (--size >= 0) { + data = (get_unaligned_le16(addr) & mask) | (*name++ << shift); + put_unaligned_le16(data, addr++); + } +} + +int get_delta_index_entry(const struct delta_index *delta_index, + unsigned int list_number, + unsigned int key, + const byte *name, + struct delta_index_entry *delta_entry) +{ + int result; + + result = start_delta_index_search(delta_index, + list_number, + key, + delta_entry); + if (result != UDS_SUCCESS) { + return result; + } + + do { + result = next_delta_index_entry(delta_entry); + if (result != UDS_SUCCESS) { + return result; + } + } while (!delta_entry->at_end && (key > delta_entry->key)); + + result = remember_delta_index_offset(delta_entry); + if (result != UDS_SUCCESS) { + return result; + } + + if (!delta_entry->at_end && (key == delta_entry->key)) { + struct delta_index_entry collision_entry; + + collision_entry = *delta_entry; + for (;;) { + byte full_name[COLLISION_BYTES]; + + result = next_delta_index_entry(&collision_entry); + if (result != UDS_SUCCESS) { + return result; + } + + if (collision_entry.at_end || + !collision_entry.is_collision) { + break; + } + + get_collision_name(&collision_entry, full_name); + if (memcmp(full_name, name, COLLISION_BYTES) == 0) { + *delta_entry = collision_entry; + break; + } + } + } + + return UDS_SUCCESS; +} + +int get_delta_entry_collision(const struct delta_index_entry *delta_entry, + byte *name) +{ + int result; + + result = assert_not_at_end(delta_entry); + if (result != UDS_SUCCESS) { + return result; + } + + result = 
ASSERT_WITH_ERROR_CODE(delta_entry->is_collision, + UDS_BAD_STATE, + "Cannot get full block name from a non-collision delta index entry"); + if (result != UDS_SUCCESS) { + return result; + } + + get_collision_name(delta_entry, name); + return UDS_SUCCESS; +} + +unsigned int get_delta_entry_value(const struct delta_index_entry *delta_entry) +{ + return get_field(delta_entry->delta_zone->memory, + get_delta_entry_offset(delta_entry), + delta_entry->value_bits); +} + +static int assert_mutable_entry(const struct delta_index_entry *delta_entry) +{ + return ASSERT_WITH_ERROR_CODE((delta_entry->delta_list != + &delta_entry->temp_delta_list), + UDS_BAD_STATE, + "delta index is mutable"); +} + +int set_delta_entry_value(const struct delta_index_entry *delta_entry, + unsigned int value) +{ + int result; + unsigned int value_mask = (1 << delta_entry->value_bits) - 1; + + result = assert_mutable_entry(delta_entry); + if (result != UDS_SUCCESS) { + return result; + } + + result = assert_not_at_end(delta_entry); + if (result != UDS_SUCCESS) { + return result; + } + + result = ASSERT_WITH_ERROR_CODE((value & value_mask) == value, + UDS_INVALID_ARGUMENT, + "Value (%u) being set in a delta index is too large (must fit in %u bits)", + value, + delta_entry->value_bits); + if (result != UDS_SUCCESS) { + return result; + } + + set_field(value, + delta_entry->delta_zone->memory, + get_delta_entry_offset(delta_entry), + delta_entry->value_bits); + return UDS_SUCCESS; +} + +/* + * Extend the memory used by the delta lists by adding growing_size + * bytes before the list indicated by growing_index, then rebalancing + * the lists in the new chunk. + */ +static int extend_delta_zone(struct delta_zone *delta_zone, + unsigned int growing_index, + size_t growing_size) +{ + ktime_t start_time; + ktime_t end_time; + struct delta_list *delta_lists; + unsigned int i; + size_t used_space; + + + /* Calculate the amount of space that is or will be in use. */ + start_time = current_time_ns(CLOCK_MONOTONIC); + delta_lists = delta_zone->delta_lists; + used_space = growing_size; + for (i = 0; i <= delta_zone->list_count + 1; i++) { + used_space += get_delta_list_byte_size(&delta_lists[i]); + } + + if (delta_zone->size < used_space) { + return UDS_OVERFLOW; + } + + /* Compute the new offsets of the delta lists. */ + compute_new_list_offsets(delta_zone, + growing_index, + growing_size, + used_space); + + /* + * When we rebalance the delta list, we will include the end guard list + * in the rebalancing. It contains the end guard data, which must be + * copied. + */ + rebalance_delta_zone(delta_zone, 1, delta_zone->list_count + 1); + end_time = current_time_ns(CLOCK_MONOTONIC); + delta_zone->rebalance_count++; + delta_zone->rebalance_time += ktime_sub(end_time, start_time); + return UDS_SUCCESS; +} + +static int insert_bits(struct delta_index_entry *delta_entry, int size) +{ + uint64_t free_before; + uint64_t free_after; + uint64_t source; + uint64_t destination; + uint32_t count; + bool before_flag; + byte *memory; + struct delta_zone *delta_zone = delta_entry->delta_zone; + struct delta_list *delta_list = delta_entry->delta_list; + /* Compute bits in use before and after the inserted bits. 
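+	 * The gap for the new bits is opened at delta_entry->offset by
+	 * shifting whichever side of the list moves less data; if neither
+	 * neighboring gap is large enough, extend_delta_zone() below
+	 * rebalances the zone memory first.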
*/ + uint32_t total_size = delta_list->size; + uint32_t before_size = delta_entry->offset; + uint32_t after_size = total_size - delta_entry->offset; + + if ((unsigned int) (total_size + size) > UINT16_MAX) { + delta_entry->list_overflow = true; + delta_zone->overflow_count++; + return UDS_OVERFLOW; + } + + /* Compute bits available before and after the delta list. */ + free_before = (delta_list[0].start - + (delta_list[-1].start + delta_list[-1].size)); + free_after = (delta_list[1].start - + (delta_list[0].start + delta_list[0].size)); + + if (((unsigned int) size <= free_before) && + ((unsigned int) size <= free_after)) { + /* + * We have enough space to use either before or after the list. + * Select the smaller amount of data. If it is exactly the + * same, try to take from the larger amount of free space. + */ + if (before_size < after_size) { + before_flag = true; + } else if (after_size < before_size) { + before_flag = false; + } else { + before_flag = free_before > free_after; + } + } else if ((unsigned int) size <= free_before) { + /* There is space before but not after. */ + before_flag = true; + } else if ((unsigned int) size <= free_after) { + /* There is space after but not before. */ + before_flag = false; + } else { + /* + * Neither of the surrounding spaces is large enough for this + * request. Extend and/or rebalance the delta list memory + * choosing to move the least amount of data. + */ + int result; + unsigned int growing_index = delta_entry->list_number + 1; + + before_flag = before_size < after_size; + if (!before_flag) { + growing_index++; + } + result = extend_delta_zone(delta_zone, + growing_index, + DIV_ROUND_UP(size, CHAR_BIT)); + if (result != UDS_SUCCESS) { + return result; + } + } + + if (before_flag) { + source = delta_list->start; + destination = source - size; + move_delta_list_start(delta_list, -size); + count = before_size; + } else { + move_delta_list_end(delta_list, size); + source = delta_list->start + delta_entry->offset; + destination = source + size; + count = after_size; + } + + memory = delta_zone->memory; + move_bits(memory, source, memory, destination, count); + return UDS_SUCCESS; +} + +static void encode_delta(const struct delta_index_entry *delta_entry) +{ + unsigned int temp; + unsigned int t1; + unsigned int t2; + uint64_t offset; + const struct delta_zone *delta_zone = delta_entry->delta_zone; + byte *memory = delta_zone->memory; + + offset = get_delta_entry_offset(delta_entry) + delta_entry->value_bits; + if (delta_entry->delta < delta_zone->min_keys) { + set_field(delta_entry->delta, + memory, + offset, + delta_zone->min_bits); + return; + } + + temp = delta_entry->delta - delta_zone->min_keys; + t1 = (temp % delta_zone->incr_keys) + delta_zone->min_keys; + t2 = temp / delta_zone->incr_keys; + set_field(t1, memory, offset, delta_zone->min_bits); + set_zero(memory, offset + delta_zone->min_bits, t2); + set_field(1, memory, offset + delta_zone->min_bits + t2, 1); +} + +static void encode_entry(const struct delta_index_entry *delta_entry, + unsigned int value, + const byte *name) +{ + byte *memory = delta_entry->delta_zone->memory; + uint64_t offset = get_delta_entry_offset(delta_entry); + + set_field(value, memory, offset, delta_entry->value_bits); + encode_delta(delta_entry); + if (name != NULL) { + set_collision_name(delta_entry, name); + } +} + +/* + * Create a new entry in the delta index. If the entry is a collision, the full + * 256 bit name must be provided. 
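+ *
+ * A minimal usage sketch (identifiers illustrative, error handling
+ * omitted), inserting a non-collision entry at the point found by a
+ * prior search:
+ *
+ *   get_delta_index_entry(index, list_number, key, name, &entry);
+ *   if (entry.at_end || (entry.key != key))
+ *           put_delta_index_entry(&entry, key, value, NULL);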
+ */ +int put_delta_index_entry(struct delta_index_entry *delta_entry, + unsigned int key, + unsigned int value, + const byte *name) +{ + int result; + struct delta_zone *delta_zone; + + result = assert_mutable_entry(delta_entry); + if (result != UDS_SUCCESS) { + return result; + } + + if (delta_entry->is_collision) { + /* + * The caller wants us to insert a collision entry onto a + * collision entry. This happens when we find a collision and + * attempt to add the name again to the index. This is + * normally a fatal error unless we are replaying a closed + * chapter while we are rebuilding a volume index. + */ + return UDS_DUPLICATE_NAME; + } + + if (delta_entry->offset < delta_entry->delta_list->save_offset) { + /* + * The saved entry offset is after the new entry and will no + * longer be valid, so replace it with the insertion point. + */ + result = remember_delta_index_offset(delta_entry); + if (result != UDS_SUCCESS) { + return result; + } + } + + if (name != NULL) { + /* + * Insert a collision entry which is placed after this + * entry. + */ + result = assert_not_at_end(delta_entry); + if (result != UDS_SUCCESS) { + return result; + } + + result = ASSERT((key == delta_entry->key), + "incorrect key for collision entry"); + if (result != UDS_SUCCESS) { + return result; + } + + delta_entry->offset += delta_entry->entry_bits; + set_delta(delta_entry, 0); + set_collision(delta_entry); + result = insert_bits(delta_entry, delta_entry->entry_bits); + } else if (delta_entry->at_end) { + /* Insert a new entry at the end of the delta list. */ + result = ASSERT((key >= delta_entry->key), + "key past end of list"); + if (result != UDS_SUCCESS) { + return result; + } + + set_delta(delta_entry, key - delta_entry->key); + delta_entry->key = key; + delta_entry->at_end = false; + result = insert_bits(delta_entry, delta_entry->entry_bits); + } else { + int old_entry_size; + int additional_size; + struct delta_index_entry next_entry; + unsigned int next_value; + + /* + * Insert a new entry which requires the delta in the following + * entry to be updated. + */ + result = ASSERT((key < delta_entry->key), + "key precedes following entry"); + if (result != UDS_SUCCESS) { + return result; + } + + result = ASSERT((key >= delta_entry->key - delta_entry->delta), + "key effects following entry's delta"); + if (result != UDS_SUCCESS) { + return result; + } + + old_entry_size = delta_entry->entry_bits; + next_entry = *delta_entry; + next_value = get_delta_entry_value(&next_entry); + set_delta(delta_entry, + key - (delta_entry->key - delta_entry->delta)); + delta_entry->key = key; + set_delta(&next_entry, next_entry.key - key); + next_entry.offset += delta_entry->entry_bits; + /* + * The two new entries are always bigger than the single entry + * being replaced. + */ + additional_size = (delta_entry->entry_bits + + next_entry.entry_bits - old_entry_size); + result = insert_bits(delta_entry, additional_size); + if (result != UDS_SUCCESS) { + return result; + } + + encode_entry(&next_entry, next_value, NULL); + } + + if (result != UDS_SUCCESS) { + return result; + } + + encode_entry(delta_entry, value, name); + delta_zone = delta_entry->delta_zone; + delta_zone->record_count++; + delta_zone->collision_count += delta_entry->is_collision ? 
1 : 0; + return UDS_SUCCESS; +} + +static void delete_bits(const struct delta_index_entry *delta_entry, int size) +{ + uint64_t source; + uint64_t destination; + uint32_t count; + bool before_flag; + struct delta_list *delta_list = delta_entry->delta_list; + byte *memory = delta_entry->delta_zone->memory; + /* Compute bits retained before and after the deleted bits. */ + uint32_t total_size = delta_list->size; + uint32_t before_size = delta_entry->offset; + uint32_t after_size = total_size - delta_entry->offset - size; + + /* + * Determine whether to add to the available space either before or + * after the delta list. We prefer to move the least amount of data. + * If it is exactly the same, try to add to the smaller amount of free + * space. + */ + if (before_size < after_size) { + before_flag = true; + } else if (after_size < before_size) { + before_flag = false; + } else { + uint64_t free_before = (delta_list[0].start - + (delta_list[-1].start + delta_list[-1].size)); + uint64_t free_after = (delta_list[1].start - + (delta_list[0].start + delta_list[0].size)); + + before_flag = (free_before < free_after); + } + + if (before_flag) { + source = delta_list->start; + destination = source + size; + move_delta_list_start(delta_list, size); + count = before_size; + } else { + move_delta_list_end(delta_list, -size); + destination = delta_list->start + delta_entry->offset; + source = destination + size; + count = after_size; + } + + move_bits(memory, source, memory, destination, count); +} + +int remove_delta_index_entry(struct delta_index_entry *delta_entry) +{ + int result; + struct delta_index_entry next_entry; + struct delta_zone *delta_zone; + struct delta_list *delta_list; + + result = assert_mutable_entry(delta_entry); + if (result != UDS_SUCCESS) { + return result; + } + + next_entry = *delta_entry; + result = next_delta_index_entry(&next_entry); + if (result != UDS_SUCCESS) { + return result; + } + + delta_zone = delta_entry->delta_zone; + + if (delta_entry->is_collision) { + /* This is a collision entry, so just remove it. */ + delete_bits(delta_entry, delta_entry->entry_bits); + next_entry.offset = delta_entry->offset; + delta_zone->collision_count -= 1; + } else if (next_entry.at_end) { + /* This entry is at the end of the list, so just remove it. */ + delete_bits(delta_entry, delta_entry->entry_bits); + next_entry.key -= delta_entry->delta; + next_entry.offset = delta_entry->offset; + } else { + /* The delta in the next entry needs to be updated. */ + unsigned int next_value = get_delta_entry_value(&next_entry); + int old_size = delta_entry->entry_bits + next_entry.entry_bits; + + if (next_entry.is_collision) { + next_entry.is_collision = false; + delta_zone->collision_count -= 1; + } + + set_delta(&next_entry, delta_entry->delta + next_entry.delta); + next_entry.offset = delta_entry->offset; + /* + * The one new entry is always smaller than the two entries + * being replaced. + */ + delete_bits(delta_entry, old_size - next_entry.entry_bits); + encode_entry(&next_entry, next_value, NULL); + } + + delta_zone->record_count--; + delta_zone->discard_count++; + *delta_entry = next_entry; + + delta_list = delta_entry->delta_list; + if (delta_entry->offset < delta_list->save_offset) { + /* The saved entry offset is no longer valid. 
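+		 * Reset it so that the next search of this list starts from
+		 * the beginning of the list.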
*/ + delta_list->save_key = 0; + delta_list->save_offset = 0; + } + + return UDS_SUCCESS; +} + +unsigned int +get_delta_zone_first_list(const struct delta_index *delta_index, + unsigned int zone_number) +{ + return delta_index->delta_zones[zone_number].first_list; +} + +unsigned int +get_delta_zone_list_count(const struct delta_index *delta_index, + unsigned int zone_number) +{ + return delta_index->delta_zones[zone_number].list_count; +} + +uint64_t +get_delta_zone_bits_used(const struct delta_index *delta_index, + unsigned int zone_number) +{ + unsigned int i; + uint64_t bit_count = 0; + const struct delta_zone *delta_zone; + + delta_zone = &delta_index->delta_zones[zone_number]; + for (i = 0; i < delta_zone->list_count; i++) { + bit_count += delta_zone->delta_lists[i + 1].size; + } + + return bit_count; +} + +uint64_t +get_delta_index_bits_allocated(const struct delta_index *delta_index) +{ + uint64_t byte_count = 0; + unsigned int z; + + for (z = 0; z < delta_index->zone_count; z++) { + byte_count += delta_index->delta_zones[z].size; + } + + return byte_count * CHAR_BIT; +} + +static size_t +get_delta_zone_allocated(const struct delta_zone *delta_zone) +{ + return (delta_zone->size + + (delta_zone->list_count + 2) * sizeof(struct delta_list) + + (delta_zone->list_count + 2) * sizeof(uint64_t)); +} + +void get_delta_index_stats(const struct delta_index *delta_index, + struct delta_index_stats *stats) +{ + unsigned int z; + const struct delta_zone *delta_zone; + + memset(stats, 0, sizeof(struct delta_index_stats)); + stats->memory_allocated = + delta_index->zone_count * sizeof(struct delta_zone); + for (z = 0; z < delta_index->zone_count; z++) { + delta_zone = &delta_index->delta_zones[z]; + stats->memory_allocated += + get_delta_zone_allocated(delta_zone); + stats->rebalance_time += delta_zone->rebalance_time; + stats->rebalance_count += delta_zone->rebalance_count; + stats->record_count += delta_zone->record_count; + stats->collision_count += delta_zone->collision_count; + stats->discard_count += delta_zone->discard_count; + stats->overflow_count += delta_zone->overflow_count; + stats->list_count += delta_zone->list_count; + } +} + +size_t compute_delta_index_size(unsigned long entry_count, + unsigned int mean_delta, + unsigned int payload_bits) +{ + unsigned short min_bits; + unsigned int incr_keys; + unsigned int min_keys; + + compute_coding_constants(mean_delta, &min_bits, &min_keys, &incr_keys); + /* On average, each delta is encoded into about min_bits + 1.5 bits. */ + return (entry_count * (payload_bits + min_bits + 1) + entry_count / 2); +} + +unsigned int get_delta_index_page_count(unsigned int entry_count, + unsigned int list_count, + unsigned int mean_delta, + unsigned int payload_bits, + size_t bytes_per_page) +{ + unsigned int bits_per_delta_list; + unsigned int bits_per_page; + size_t bits_per_index; + + /* Compute the expected number of bits needed for all the entries. */ + bits_per_index = compute_delta_index_size(entry_count, + mean_delta, + payload_bits); + bits_per_delta_list = bits_per_index / list_count; + + /* Add in the immutable delta list headers. */ + bits_per_index += list_count * IMMUTABLE_HEADER_SIZE; + /* Compute the number of usable bits on an immutable index page. */ + bits_per_page = ((bytes_per_page - sizeof(struct delta_page_header)) * + CHAR_BIT); + /* + * Reduce the bits per page by one immutable delta list header and one + * delta list to account for internal fragmentation. 
+ */ + bits_per_page -= IMMUTABLE_HEADER_SIZE + bits_per_delta_list; + /* Now compute the number of pages needed. */ + return DIV_ROUND_UP(bits_per_index, bits_per_page); +} + +void log_delta_index_entry(struct delta_index_entry *delta_entry) +{ + uds_log_ratelimit(uds_log_info, + "List 0x%X Key 0x%X Offset 0x%X%s%s List_size 0x%X%s", + delta_entry->list_number, + delta_entry->key, + delta_entry->offset, + delta_entry->at_end ? " end" : "", + delta_entry->is_collision ? " collision" : "", + delta_entry->delta_list->size, + delta_entry->list_overflow ? " overflow" : ""); + delta_entry->list_overflow = false; +} diff --git a/vdo/delta-index.h b/vdo/delta-index.h new file mode 100644 index 00000000..02dc5908 --- /dev/null +++ b/vdo/delta-index.h @@ -0,0 +1,328 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright Red Hat + */ + +#ifndef DELTAINDEX_H +#define DELTAINDEX_H 1 + +#include "compiler.h" +#include "config.h" +#include "buffered-reader.h" +#include "buffered-writer.h" +#include "compiler.h" +#include "cpu.h" +#include "numeric.h" +#include "time-utils.h" +#include "type-defs.h" + +struct delta_list { + /* The offset of the delta list start, in bits */ + uint64_t start; + /* The number of bits in the delta list */ + uint16_t size; + /* Where the last search "found" the key, in bits */ + uint16_t save_offset; + /* The key for the record just before save_offset */ + unsigned int save_key; +}; + +struct delta_zone { + /* The delta list memory */ + byte *memory; + /* The delta list headers */ + struct delta_list *delta_lists; + /* Temporary starts of delta lists */ + uint64_t *new_offsets; + /* Buffered writer for saving an index */ + struct buffered_writer *buffered_writer; + /* The size of delta list memory */ + size_t size; + /* Nanoseconds spent rebalancing */ + ktime_t rebalance_time; + /* Number of memory rebalances */ + int rebalance_count; + /* The number of bits in a stored value */ + unsigned short value_bits; + /* The number of bits in the minimal key code */ + unsigned short min_bits; + /* The number of keys used in a minimal code */ + unsigned int min_keys; + /* The number of keys used for another code bit */ + unsigned int incr_keys; + /* The number of records in the index */ + long record_count; + /* The number of collision records */ + long collision_count; + /* The number of records removed */ + long discard_count; + /* The number of UDS_OVERFLOW errors detected */ + long overflow_count; + /* The index of the first delta list */ + unsigned int first_list; + /* The number of delta lists */ + unsigned int list_count; + /* Tag belonging to this delta index */ + byte tag; +} __attribute__((aligned(CACHE_LINE_BYTES))); + +struct delta_list_save_info { + /* Tag identifying which delta index this list is in */ + uint8_t tag; + /* Bit offset of the start of the list data */ + uint8_t bit_offset; + /* Number of bytes of list data */ + uint16_t byte_count; + /* The delta list number within the delta index */ + uint32_t index; +}; + +struct delta_index { + /* The zones */ + struct delta_zone *delta_zones; + /* The number of zones */ + unsigned int zone_count; + /* The number of delta lists */ + unsigned int list_count; + /* Maximum lists per zone */ + unsigned int lists_per_zone; + /* The number of non-empty lists at load time per zone */ + unsigned int load_lists[MAX_ZONES]; + /* True if this index is mutable */ + bool mutable; + /* Tag belonging to this delta index */ + byte tag; +}; + +/* + * A delta_index_page describes a single page of a chapter index. 
The + * delta_index field allows the page to be treated as an immutable delta_index. + * We use the delta_zone field to treat the chapter index page as a single + * zone index, and without the need to do an additional memory allocation. + */ +struct delta_index_page { + struct delta_index delta_index; + /* These values are loaded from the DeltaPageHeader */ + unsigned int lowest_list_number; + unsigned int highest_list_number; + uint64_t virtual_chapter_number; + /* This structure describes the single zone of a delta index page. */ + struct delta_zone delta_zone; +}; + +/* + * Notes on the delta_index_entries: + * + * The fields documented as "public" can be read by any code that uses a + * delta_index. The fields documented as "private" carry information between + * delta_index method calls and should not be used outside the delta_index + * module. + * + * (1) The delta_index_entry is used like an iterator when searching a delta + * list. + * + * (2) It is also the result of a successful search and can be used to refer + * to the element found by the search. + * + * (3) It is also the result of an unsuccessful search and can be used to + * refer to the insertion point for a new record. + * + * (4) If at_end is true, the delta_list entry can only be used as the + * insertion point for a new record at the end of the list. + * + * (5) If at_end is false and is_collision is true, the delta_list entry + * fields refer to a collision entry in the list, and the delta_list entry + * can be used a a reference to this entry. + * + * (6) If at_end is false and is_collision is false, the delta_list entry + * fields refer to a non-collision entry in the list. Such delta_list + * entries can be used as a reference to a found entry, or an insertion + * point for a non-collision entry before this entry, or an insertion + * point for a collision entry that collides with this entry. 
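+ *
+ * A typical search loop, mirroring get_delta_index_entry() in
+ * delta-index.c (illustrative, error handling omitted):
+ *
+ *   start_delta_index_search(index, list_number, key, &entry);
+ *   do {
+ *           next_delta_index_entry(&entry);
+ *   } while (!entry.at_end && (key > entry.key));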
+ */ +struct delta_index_entry { + /* Public fields */ + /* The key for this entry */ + unsigned int key; + /* We are after the last list entry */ + bool at_end; + /* This record is a collision */ + bool is_collision; + + /* Private fields */ + /* This delta list overflowed */ + bool list_overflow; + /* The number of bits used for the value */ + unsigned short value_bits; + /* The number of bits used for the entire entry */ + unsigned short entry_bits; + /* The delta index zone */ + struct delta_zone *delta_zone; + /* The delta list containing the entry */ + struct delta_list *delta_list; + /* The delta list number */ + unsigned int list_number; + /* Bit offset of this entry within the list */ + uint32_t offset; + /* The delta between this and previous entry */ + unsigned int delta; + /* Temporary delta list for immutable indices */ + struct delta_list temp_delta_list; +}; + +struct delta_index_stats { + /* Number of bytes allocated */ + size_t memory_allocated; + /* Nanoseconds spent rebalancing */ + ktime_t rebalance_time; + /* Number of memory rebalances */ + int rebalance_count; + /* The number of records in the index */ + long record_count; + /* The number of collision records */ + long collision_count; + /* The number of records removed */ + long discard_count; + /* The number of UDS_OVERFLOW errors detected */ + long overflow_count; + /* The number of delta lists */ + unsigned int list_count; +}; + +int __must_check initialize_delta_index(struct delta_index *delta_index, + unsigned int zone_count, + unsigned int list_count, + unsigned int mean_delta, + unsigned int payload_bits, + size_t memory_size); + +int __must_check +initialize_delta_index_page(struct delta_index_page *delta_index_page, + uint64_t expected_nonce, + unsigned int mean_delta, + unsigned int payload_bits, + byte *memory, + size_t memory_size); + +void uninitialize_delta_index(struct delta_index *delta_index); + +void empty_delta_index(const struct delta_index *delta_index); + +void empty_delta_zone(const struct delta_index *delta_index, + unsigned int zone_number); + +int __must_check pack_delta_index_page(const struct delta_index *delta_index, + uint64_t header_nonce, + byte *memory, + size_t memory_size, + uint64_t virtual_chapter_number, + unsigned int first_list, + unsigned int *list_count); + +void set_delta_index_tag(struct delta_index *delta_index, byte tag); + +int __must_check +start_restoring_delta_index(struct delta_index *delta_index, + struct buffered_reader **buffered_readers, + unsigned int reader_count); + +int __must_check +finish_restoring_delta_index(struct delta_index *delta_index, + struct buffered_reader **buffered_readers, + unsigned int reader_count); + +void abort_restoring_delta_index(const struct delta_index *delta_index); + +int __must_check +check_guard_delta_lists(struct buffered_reader **buffered_readers, + unsigned int reader_count); + +int __must_check +start_saving_delta_index(const struct delta_index *delta_index, + unsigned int zone_number, + struct buffered_writer *buffered_writer); + +int __must_check +finish_saving_delta_index(const struct delta_index *delta_index, + unsigned int zone_number); + +int __must_check +write_guard_delta_list(struct buffered_writer *buffered_writer); + +size_t __must_check compute_delta_index_save_bytes(unsigned int list_count, + size_t memory_size); + +int __must_check +start_delta_index_search(const struct delta_index *delta_index, + unsigned int list_number, + unsigned int key, + struct delta_index_entry *iterator); + +int __must_check 
next_delta_index_entry(struct delta_index_entry *delta_entry); + +int __must_check +remember_delta_index_offset(const struct delta_index_entry *delta_entry); + +int __must_check get_delta_index_entry(const struct delta_index *delta_index, + unsigned int list_number, + unsigned int key, + const byte *name, + struct delta_index_entry *delta_entry); + +int __must_check +get_delta_entry_collision(const struct delta_index_entry *delta_entry, + byte *name); + +unsigned int __must_check +get_delta_entry_value(const struct delta_index_entry *delta_entry); + +int __must_check +set_delta_entry_value(const struct delta_index_entry *delta_entry, + unsigned int value); + +int __must_check put_delta_index_entry(struct delta_index_entry *delta_entry, + unsigned int key, + unsigned int value, + const byte *name); + +int __must_check +remove_delta_index_entry(struct delta_index_entry *delta_entry); + +static INLINE unsigned int +get_delta_zone_number(const struct delta_index *delta_index, + unsigned int list_number) +{ + return list_number / delta_index->lists_per_zone; +} + +unsigned int +get_delta_zone_first_list(const struct delta_index *delta_index, + unsigned int zone_number); + +unsigned int +get_delta_zone_list_count(const struct delta_index *delta_index, + unsigned int zone_number); + +uint64_t __must_check +get_delta_zone_bits_used(const struct delta_index *delta_index, + unsigned int zone_number); + +uint64_t __must_check +get_delta_index_bits_allocated(const struct delta_index *delta_index); + +void get_delta_index_stats(const struct delta_index *delta_index, + struct delta_index_stats *stats); + +size_t __must_check compute_delta_index_size(unsigned long entry_count, + unsigned int mean_delta, + unsigned int payload_bits); + +unsigned int get_delta_index_page_count(unsigned int entry_count, + unsigned int list_count, + unsigned int mean_delta, + unsigned int payload_bits, + size_t bytes_per_page); + +void log_delta_index_entry(struct delta_index_entry *delta_entry); + +#endif /* DELTAINDEX_H */ diff --git a/vdo/device-config.c b/vdo/device-config.c new file mode 100644 index 00000000..e40571a4 --- /dev/null +++ b/vdo/device-config.c @@ -0,0 +1,944 @@ +// SPDX-License-Identifier: GPL-2.0-only +/** + * Copyright Red Hat + */ + +#include "device-config.h" + +#include + +#include "errors.h" +#include "logger.h" +#include "memory-alloc.h" +#include "string-utils.h" + +#include "constants.h" +#include "status-codes.h" +#include "types.h" +#include "vdo.h" + +enum { + /* If we bump this, update the arrays below */ + TABLE_VERSION = 4, +}; + +/* arrays for handling different table versions */ +static const uint8_t REQUIRED_ARGC[] = { 10, 12, 9, 7, 6 }; +/* pool name no longer used. only here for verification of older versions */ +static const uint8_t POOL_NAME_ARG_INDEX[] = { 8, 10, 8 }; + +/** + * get_version_number() - Decide the version number from argv. + * + * @argc: The number of table values. + * @argv: The array of table values. + * @error_ptr: A pointer to return a error string in. + * @version_ptr: A pointer to return the version. + * + * Return: VDO_SUCCESS or an error code. 
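+ *
+ * For example, a table line beginning with "V4" yields version 4, while
+ * a V0 table carries no version token at all, so argv[0] is already the
+ * storage device name.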
+ **/ +static int get_version_number(int argc, + char **argv, + char **error_ptr, + unsigned int *version_ptr) +{ + /* version, if it exists, is in a form of V */ + if (sscanf(argv[0], "V%u", version_ptr) == 1) { + if (*version_ptr < 1 || *version_ptr > TABLE_VERSION) { + *error_ptr = "Unknown version number detected"; + return VDO_BAD_CONFIGURATION; + } + } else { + /* V0 actually has no version number in the table string */ + *version_ptr = 0; + } + + /* + * V0 and V1 have no optional parameters. There will always be + * a parameter for thread config, even if it's a "." to show + * it's an empty list. + */ + if (*version_ptr <= 1) { + if (argc != REQUIRED_ARGC[*version_ptr]) { + *error_ptr = + "Incorrect number of arguments for version"; + return VDO_BAD_CONFIGURATION; + } + } else if (argc < REQUIRED_ARGC[*version_ptr]) { + *error_ptr = "Incorrect number of arguments for version"; + return VDO_BAD_CONFIGURATION; + } + + if (*version_ptr != TABLE_VERSION) { + uds_log_warning("Detected version mismatch between kernel module and tools kernel: %d, tool: %d", + TABLE_VERSION, + *version_ptr); + uds_log_warning("Please consider upgrading management tools to match kernel."); + } + return VDO_SUCCESS; +} + +/* + * Free a list of non-NULL string pointers, and then the list itself. + */ +static void free_string_array(char **string_array) +{ + unsigned int offset; + + for (offset = 0; string_array[offset] != NULL; offset++) { + UDS_FREE(string_array[offset]); + } + UDS_FREE(string_array); +} + +/* + * Split the input string into substrings, separated at occurrences of + * the indicated character, returning a null-terminated list of string + * pointers. + * + * The string pointers and the pointer array itself should both be + * freed with UDS_FREE() when no longer needed. This can be done with + * vdo_free_string_array (below) if the pointers in the array are not + * changed. Since the array and copied strings are allocated by this + * function, it may only be used in contexts where allocation is + * permitted. + * + * Empty substrings are not ignored; that is, returned substrings may + * be empty strings if the separator occurs twice in a row. + */ +static int split_string(const char *string, + char separator, + char ***substring_array_ptr) +{ + unsigned int current_substring = 0, substring_count = 1; + const char *s; + char **substrings; + int result; + ptrdiff_t length; + + for (s = string; *s != 0; s++) { + if (*s == separator) { + substring_count++; + } + } + + result = UDS_ALLOCATE(substring_count + 1, + char *, + "string-splitting array", + &substrings); + if (result != UDS_SUCCESS) { + return result; + } + + for (s = string; *s != 0; s++) { + if (*s == separator) { + ptrdiff_t length = s - string; + + result = UDS_ALLOCATE(length + 1, + char, + "split string", + &substrings[current_substring]); + if (result != UDS_SUCCESS) { + free_string_array(substrings); + return result; + } + /* + * Trailing NUL is already in place after allocation; + * deal with the zero or more non-NUL bytes in the + * string. + */ + if (length > 0) { + memcpy(substrings[current_substring], + string, + length); + } + string = s + 1; + current_substring++; + BUG_ON(current_substring >= substring_count); + } + } + /* Process final string, with no trailing separator. 
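+	 * For example, splitting "a,b" at ',' leaves "b" as the final
+	 * substring, while "a," leaves an empty final substring.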
*/ + BUG_ON(current_substring != (substring_count - 1)); + length = strlen(string); + + result = UDS_ALLOCATE(length + 1, + char, + "split string", + &substrings[current_substring]); + if (result != UDS_SUCCESS) { + free_string_array(substrings); + return result; + } + memcpy(substrings[current_substring], string, length); + current_substring++; + /* substrings[current_substring] is NULL already */ + *substring_array_ptr = substrings; + return UDS_SUCCESS; +} + +/* + * Join the input substrings into one string, joined with the indicated + * character, returning a string. + * array_length is a bound on the number of valid elements in + * substring_array, in case it is not NULL-terminated. + */ +static int join_strings(char **substring_array, size_t array_length, + char separator, char **string_ptr) +{ + size_t string_length = 0; + size_t i; + int result; + char *output, *current_position; + + for (i = 0; (i < array_length) && (substring_array[i] != NULL); i++) { + string_length += strlen(substring_array[i]) + 1; + } + + result = UDS_ALLOCATE(string_length, char, __func__, &output); + + if (result != VDO_SUCCESS) { + return result; + } + + current_position = &output[0]; + + for (i = 0; (i < array_length) && (substring_array[i] != NULL); i++) { + current_position = uds_append_to_buffer(current_position, + output + string_length, + "%s", + substring_array[i]); + *current_position = separator; + current_position++; + } + + /* We output one too many separators; replace the last with a zero byte. */ + if (current_position != output) { + *(current_position - 1) = '\0'; + } + + *string_ptr = output; + return UDS_SUCCESS; +} + +/* + * parse_bool() - Parse a two-valued option into a bool. + * @bool_str: The string value to convert to a bool. + * @true_str: The string value which should be converted to true. + * @false_str: The string value which should be converted to false. + * @bool_ptr: A pointer to return the bool value in. + * + * Return: VDO_SUCCESS or an error if bool_str is neither true_str + * nor false_str. + */ +static inline int __must_check +parse_bool(const char *bool_str, + const char *true_str, + const char *false_str, + bool *bool_ptr) +{ + bool value = false; + + if (strcmp(bool_str, true_str) == 0) { + value = true; + } else if (strcmp(bool_str, false_str) == 0) { + value = false; + } else { + return VDO_BAD_CONFIGURATION; + } + + *bool_ptr = value; + return VDO_SUCCESS; +} + +/** + * process_one_thread_config_spec() - Process one component of a + * thread parameter configuration + * string and update the + * configuration data structure. + * @thread_param_type: The type of thread specified. + * @count: The thread count requested. + * @config: The configuration data structure to update. + * + * If the thread count requested is invalid, a message is logged and + * -EINVAL returned. If the thread name is unknown, a message is logged + * but no error is returned. 
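+ * For example, ("logical", 4) sets config->logical_zones to 4, while an
+ * unrecognized name (say "fooThreads", a made-up example) is logged at
+ * info level and otherwise ignored.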
+ * + * Return: VDO_SUCCESS or -EINVAL + */ +static int process_one_thread_config_spec(const char *thread_param_type, + unsigned int count, + struct thread_count_config *config) +{ + /* Handle limited thread parameters */ + if (strcmp(thread_param_type, "bioRotationInterval") == 0) { + if (count == 0) { + uds_log_error("thread config string error: 'bioRotationInterval' of at least 1 is required"); + return -EINVAL; + } else if (count > VDO_BIO_ROTATION_INTERVAL_LIMIT) { + uds_log_error("thread config string error: 'bioRotationInterval' cannot be higher than %d", + VDO_BIO_ROTATION_INTERVAL_LIMIT); + return -EINVAL; + } + config->bio_rotation_interval = count; + return VDO_SUCCESS; + } else if (strcmp(thread_param_type, "logical") == 0) { + if (count > MAX_VDO_LOGICAL_ZONES) { + uds_log_error("thread config string error: at most %d 'logical' threads are allowed", + MAX_VDO_LOGICAL_ZONES); + return -EINVAL; + } + config->logical_zones = count; + return VDO_SUCCESS; + } else if (strcmp(thread_param_type, "physical") == 0) { + if (count > MAX_VDO_PHYSICAL_ZONES) { + uds_log_error("thread config string error: at most %d 'physical' threads are allowed", + MAX_VDO_PHYSICAL_ZONES); + return -EINVAL; + } + config->physical_zones = count; + return VDO_SUCCESS; + } else { + /* Handle other thread count parameters */ + if (count > MAXIMUM_VDO_THREADS) { + uds_log_error("thread config string error: at most %d '%s' threads are allowed", + MAXIMUM_VDO_THREADS, + thread_param_type); + return -EINVAL; + } + + if (strcmp(thread_param_type, "hash") == 0) { + config->hash_zones = count; + return VDO_SUCCESS; + } else if (strcmp(thread_param_type, "cpu") == 0) { + if (count == 0) { + uds_log_error("thread config string error: at least one 'cpu' thread required"); + return -EINVAL; + } + config->cpu_threads = count; + return VDO_SUCCESS; + } else if (strcmp(thread_param_type, "ack") == 0) { + config->bio_ack_threads = count; + return VDO_SUCCESS; + } else if (strcmp(thread_param_type, "bio") == 0) { + if (count == 0) { + uds_log_error("thread config string error: at least one 'bio' thread required"); + return -EINVAL; + } + config->bio_threads = count; + return VDO_SUCCESS; + } + } + + /* + * Don't fail, just log. This will handle version mismatches between + * user mode tools and kernel. + */ + uds_log_info("unknown thread parameter type \"%s\"", + thread_param_type); + return VDO_SUCCESS; +} + +/** + * parse_one_thread_config_spec() - Parse one component of a thread + * parameter configuration string and + * update the configuration data + * structure. + * @spec: The thread parameter specification string. + * @config: The configuration data to be updated. 
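+ *
+ * For example, the spec "hash=1" is split at '=' and handed to
+ * process_one_thread_config_spec() as ("hash", 1).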
+ */ +static int parse_one_thread_config_spec(const char *spec, + struct thread_count_config *config) +{ + unsigned int count; + char **fields; + int result = split_string(spec, '=', &fields); + + if (result != UDS_SUCCESS) { + return result; + } + if ((fields[0] == NULL) || (fields[1] == NULL) || (fields[2] != NULL)) { + uds_log_error("thread config string error: expected thread parameter assignment, saw \"%s\"", + spec); + free_string_array(fields); + return -EINVAL; + } + + result = kstrtouint(fields[1], 10, &count); + if (result != UDS_SUCCESS) { + uds_log_error("thread config string error: integer value needed, found \"%s\"", + fields[1]); + free_string_array(fields); + return result; + } + + result = process_one_thread_config_spec(fields[0], count, config); + free_string_array(fields); + return result; +} + +/** + * parse_thread_config_string() - Parse the configuration string + * passed and update the specified + * counts and other parameters of + * various types of threads to be + * created. + * @string: Thread parameter configuration string. + * @config: The thread configuration data to update. + * + * The configuration string should contain one or more comma-separated specs + * of the form "typename=number"; the supported type names are "cpu", "ack", + * "bio", "bioRotationInterval", "logical", "physical", and "hash". + * + * If an error occurs during parsing of a single key/value pair, we deem + * it serious enough to stop further parsing. + * + * This function can't set the "reason" value the caller wants to pass + * back, because we'd want to format it to say which field was + * invalid, and we can't allocate the "reason" strings dynamically. So + * if an error occurs, we'll log the details and pass back an error. + * + * Return: VDO_SUCCESS or -EINVAL or -ENOMEM + */ +static int parse_thread_config_string(const char *string, + struct thread_count_config *config) +{ + int result = VDO_SUCCESS; + + char **specs; + + if (strcmp(".", string) != 0) { + unsigned int i; + + result = split_string(string, ',', &specs); + if (result != UDS_SUCCESS) { + return result; + } + + for (i = 0; specs[i] != NULL; i++) { + result = parse_one_thread_config_spec(specs[i], config); + if (result != VDO_SUCCESS) { + break; + } + } + free_string_array(specs); + } + return result; +} + +/** + * process_one_key_value_pair() - Process one component of an optional + * parameter string and update the + * configuration data structure. + * @key: The optional parameter key name. + * @value: The optional parameter value. + * @config: The configuration data structure to update. + * + * If the value requested is invalid, a message is logged and -EINVAL + * returned. If the key is unknown, a message is logged but no error + * is returned. 
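+ * For example, ("maxDiscard", 8) sets config->max_discard_blocks to 8;
+ * thread parameters such as ("ack", 2) are passed on to
+ * process_one_thread_config_spec().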
+ * + * Return: VDO_SUCCESS or -EINVAL + */ +static int process_one_key_value_pair(const char *key, + unsigned int value, + struct device_config *config) +{ + /* Non thread optional parameters */ + if (strcmp(key, "maxDiscard") == 0) { + if (value == 0) { + uds_log_error("optional parameter error: at least one max discard block required"); + return -EINVAL; + } + /* Max discard sectors in blkdev_issue_discard is UINT_MAX >> 9 */ + if (value > (UINT_MAX / VDO_BLOCK_SIZE)) { + uds_log_error("optional parameter error: at most %d max discard blocks are allowed", + UINT_MAX / VDO_BLOCK_SIZE); + return -EINVAL; + } + config->max_discard_blocks = value; + return VDO_SUCCESS; + } + /* Handles unknown key names */ + return process_one_thread_config_spec(key, value, + &config->thread_counts); +} + +/** + * parse_one_key_value_pair() - Parse one key/value pair and update + * the configuration data structure. + * @key: The optional key name. + * @value: The optional value. + * @config: The configuration data to be updated. + * + * Return: VDO_SUCCESS or error. + */ +static int parse_one_key_value_pair(const char *key, + const char *value, + struct device_config *config) +{ + unsigned int count; + int result; + + if (strcmp(key, "deduplication") == 0) { + return parse_bool(value, "on", "off", &config->deduplication); + } + + if (strcmp(key, "compression") == 0) { + return parse_bool(value, "on", "off", &config->compression); + } + + /* The remaining arguments must have integral values. */ + result = kstrtouint(value, 10, &count); + if (result != UDS_SUCCESS) { + uds_log_error("optional config string error: integer value needed, found \"%s\"", + value); + return result; + } + return process_one_key_value_pair(key, count, config); +} + +/** + * parse_key_value_pairs() - Parse all key/value pairs from a list of + * arguments. + * @argc: The total number of arguments in list. + * @argv: The list of key/value pairs. + * @config: The device configuration data to update. + * + * If an error occurs during parsing of a single key/value pair, we deem + * it serious enough to stop further parsing. + * + * This function can't set the "reason" value the caller wants to pass + * back, because we'd want to format it to say which field was + * invalid, and we can't allocate the "reason" strings dynamically. So + * if an error occurs, we'll log the details and return the error. + * + * Return: VDO_SUCCESS or error + */ +static int parse_key_value_pairs(int argc, + char **argv, + struct device_config *config) +{ + int result = VDO_SUCCESS; + + while (argc) { + result = parse_one_key_value_pair(argv[0], argv[1], config); + if (result != VDO_SUCCESS) { + break; + } + + argc -= 2; + argv += 2; + } + + return result; +} + +/** + * parse_optional_arguments() - Parse the configuration string passed + * in for optional arguments. + * @arg_set: The structure holding the arguments to parse. + * @error_ptr: Pointer to a buffer to hold the error string. + * @config: Pointer to device configuration data to update. + * + * For V0/V1 configurations, there will only be one optional parameter; + * the thread configuration. The configuration string should contain + * one or more comma-separated specs of the form "typename=number"; the + * supported type names are "cpu", "ack", "bio", "bioRotationInterval", + * "logical", "physical", and "hash". + * + * For V2 configurations and beyond, there could be any number of + * arguments. They should contain one or more key/value pairs + * separated by a space. 
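+ * For example, the V2+ optional arguments "maxDiscard 8 deduplication
+ * off" form two key/value pairs.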
+ * + * Return: VDO_SUCCESS or error + */ +int parse_optional_arguments(struct dm_arg_set *arg_set, + char **error_ptr, + struct device_config *config) +{ + int result = VDO_SUCCESS; + + if (config->version == 0 || config->version == 1) { + result = parse_thread_config_string(arg_set->argv[0], + &config->thread_counts); + if (result != VDO_SUCCESS) { + *error_ptr = "Invalid thread-count configuration"; + return VDO_BAD_CONFIGURATION; + } + } else { + if ((arg_set->argc % 2) != 0) { + *error_ptr = "Odd number of optional arguments given but they should be pairs"; + return VDO_BAD_CONFIGURATION; + } + result = parse_key_value_pairs(arg_set->argc, + arg_set->argv, + config); + if (result != VDO_SUCCESS) { + *error_ptr = "Invalid optional argument configuration"; + return VDO_BAD_CONFIGURATION; + } + } + return result; +} + +/** + * handle_parse_error() - Handle a parsing error. + * @config: The config to free. + * @error_ptr: A place to store a constant string about the error. + * @error_str: A constant string to store in error_ptr. + */ +static void handle_parse_error(struct device_config *config, + char **error_ptr, + char *error_str) +{ + vdo_free_device_config(config); + *error_ptr = error_str; +} + +/** + * vdo_parse_device_config() - Convert the dmsetup table into a struct + * device_config. + * @argc: The number of table values. + * @argv: The array of table values. + * @ti: The target structure for this table. + * @config_ptr: A pointer to return the allocated config. + * + * Return: VDO_SUCCESS or an error code. + */ +int vdo_parse_device_config(int argc, + char **argv, + struct dm_target *ti, + struct device_config **config_ptr) +{ + bool enable_512e; + size_t logical_bytes = to_bytes(ti->len); + struct dm_arg_set arg_set; + char **error_ptr = &ti->error; + struct device_config *config = NULL; + int result; + + + if ((logical_bytes % VDO_BLOCK_SIZE) != 0) { + handle_parse_error(config, + error_ptr, + "Logical size must be a multiple of 4096"); + return VDO_BAD_CONFIGURATION; + } + + result = UDS_ALLOCATE(1, + struct device_config, + "device_config", + &config); + if (result != VDO_SUCCESS) { + handle_parse_error(config, + error_ptr, + "Could not allocate config structure"); + return VDO_BAD_CONFIGURATION; + } + + config->owning_target = ti; + config->logical_blocks = logical_bytes / VDO_BLOCK_SIZE; + INIT_LIST_HEAD(&config->config_list); + + /* Save the original string. */ + result = join_strings(argv, argc, ' ', &config->original_string); + if (result != VDO_SUCCESS) { + handle_parse_error(config, + error_ptr, + "Could not populate string"); + return VDO_BAD_CONFIGURATION; + } + + /* Set defaults. + * + * XXX Defaults for bio_threads and bio_rotation_interval are currently + * defined using the old configuration scheme of constants. These + * values are relied upon for performance testing on MGH machines + * currently. This should be replaced with the normally used testing + * defaults being defined in the file-based thread-configuration + * settings. The values used as defaults internally should really be + * those needed for VDO in its default shipped-product state. 
+ */ + config->thread_counts = (struct thread_count_config) { + .bio_ack_threads = 1, + .bio_threads = DEFAULT_VDO_BIO_SUBMIT_QUEUE_COUNT, + .bio_rotation_interval = + DEFAULT_VDO_BIO_SUBMIT_QUEUE_ROTATE_INTERVAL, + .cpu_threads = 1, + .logical_zones = 0, + .physical_zones = 0, + .hash_zones = 0, + }; + config->max_discard_blocks = 1; + config->deduplication = true; + config->compression = false; + + arg_set.argc = argc; + arg_set.argv = argv; + + result = get_version_number(argc, argv, error_ptr, &config->version); + if (result != VDO_SUCCESS) { + /* get_version_number sets error_ptr itself. */ + handle_parse_error(config, error_ptr, *error_ptr); + return result; + } + /* Move the arg pointer forward only if the argument was there. */ + if (config->version >= 1) { + dm_shift_arg(&arg_set); + } + + result = uds_duplicate_string(dm_shift_arg(&arg_set), + "parent device name", + &config->parent_device_name); + if (result != VDO_SUCCESS) { + handle_parse_error(config, + error_ptr, + "Could not copy parent device name"); + return VDO_BAD_CONFIGURATION; + } + + /* Get the physical blocks, if known. */ + if (config->version >= 1) { + result = kstrtoull(dm_shift_arg(&arg_set), + 10, + &config->physical_blocks); + if (result != VDO_SUCCESS) { + handle_parse_error(config, + error_ptr, + "Invalid physical block count"); + return VDO_BAD_CONFIGURATION; + } + } + + /* Get the logical block size and validate */ + result = parse_bool(dm_shift_arg(&arg_set), + "512", + "4096", + &enable_512e); + if (result != VDO_SUCCESS) { + handle_parse_error(config, + error_ptr, + "Invalid logical block size"); + return VDO_BAD_CONFIGURATION; + } + config->logical_block_size = (enable_512e ? 512 : 4096); + + /* Skip past the two no longer used read cache options. */ + if (config->version <= 1) { + dm_consume_args(&arg_set, 2); + } + + /* Get the page cache size. */ + result = kstrtouint(dm_shift_arg(&arg_set), 10, &config->cache_size); + if (result != VDO_SUCCESS) { + handle_parse_error(config, + error_ptr, + "Invalid block map page cache size"); + return VDO_BAD_CONFIGURATION; + } + + /* Get the block map era length. */ + result = kstrtouint(dm_shift_arg(&arg_set), 10, + &config->block_map_maximum_age); + if (result != VDO_SUCCESS) { + handle_parse_error(config, + error_ptr, + "Invalid block map maximum age"); + return VDO_BAD_CONFIGURATION; + } + + /* Skip past the no longer used MD RAID5 optimization mode */ + if (config->version <= 2) { + dm_consume_args(&arg_set, 1); + } + + /* Skip past the no longer used write policy setting */ + if (config->version <= 3) { + dm_consume_args(&arg_set, 1); + } + + /* Skip past the no longer used pool name for older table lines */ + if (config->version <= 2) { + /* + * Make sure the enum to get the pool name from argv directly + * is still in sync with the parsing of the table line. + */ + if (&arg_set.argv[0] != + &argv[POOL_NAME_ARG_INDEX[config->version]]) { + handle_parse_error(config, + error_ptr, + "Pool name not in expected location"); + return VDO_BAD_CONFIGURATION; + } + dm_shift_arg(&arg_set); + } + + /* Get the optional arguments and validate. */ + result = parse_optional_arguments(&arg_set, error_ptr, config); + if (result != VDO_SUCCESS) { + /* parse_optional_arguments sets error_ptr itself. */ + handle_parse_error(config, error_ptr, *error_ptr); + return result; + } + + /* + * Logical, physical, and hash zone counts can all be zero; then we get + * one thread doing everything, our older configuration. If any zone + * count is non-zero, the others must be as well. 
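+	 * For example, logical=3 physical=2 hash=1 is accepted, but
+	 * logical=3 with physical and hash left at zero is rejected below.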
+ */ + if (((config->thread_counts.logical_zones == 0) != + (config->thread_counts.physical_zones == 0)) || + ((config->thread_counts.physical_zones == 0) != + (config->thread_counts.hash_zones == 0))) { + handle_parse_error(config, + error_ptr, + "Logical, physical, and hash zones counts must all be zero or all non-zero"); + return VDO_BAD_CONFIGURATION; + } + + if (config->cache_size < + (2 * MAXIMUM_VDO_USER_VIOS * config->thread_counts.logical_zones)) { + handle_parse_error(config, + error_ptr, + "Insufficient block map cache for logical zones"); + return VDO_BAD_CONFIGURATION; + } + + result = dm_get_device(ti, + config->parent_device_name, + dm_table_get_mode(ti->table), + &config->owned_device); + if (result != 0) { + uds_log_error("couldn't open device \"%s\": error %d", + config->parent_device_name, + result); + handle_parse_error(config, + error_ptr, + "Unable to open storage device"); + return VDO_BAD_CONFIGURATION; + } + + if (config->version == 0) { + uint64_t device_size = + i_size_read(config->owned_device->bdev->bd_inode); + + config->physical_blocks = device_size / VDO_BLOCK_SIZE; + } + + *config_ptr = config; + return result; +} + +/** + * vdo_free_device_config() - Free a device config created by + * vdo_parse_device_config(). + * @config: The config to free. + */ +void vdo_free_device_config(struct device_config *config) +{ + if (config == NULL) { + return; + } + + if (config->owned_device != NULL) { + dm_put_device(config->owning_target, config->owned_device); + } + + UDS_FREE(config->parent_device_name); + UDS_FREE(config->original_string); + + /* + * Reduce the chance a use-after-free (as in BZ 1669960) happens to + * work. + */ + memset(config, 0, sizeof(*config)); + UDS_FREE(config); +} + +/** + * vdo_set_device_config() - Acquire or release a reference from the + * config to a vdo. + * @config: The config in question. + * @vdo: The vdo in question. + */ +void vdo_set_device_config(struct device_config *config, struct vdo *vdo) +{ + list_del_init(&config->config_list); + if (vdo != NULL) { + list_add_tail(&config->config_list, &vdo->device_config_list); + + } + + config->vdo = vdo; +} + +/** + * vdo_validate_new_device_config() - Check whether a new device + * config represents a valid + * modification to an existing + * config. + * @to_validate: The new config to valudate. + * @config: The existing config. + * @may_grow: Set to true if growing the logical and physical size of + * the vdo is currently permitted. + * @error_ptr: A pointer to hold the reason for any error. + * + * Return: VDO_SUCCESS or an error. 
+ */
+int vdo_validate_new_device_config(struct device_config *to_validate,
+				   struct device_config *config,
+				   bool may_grow,
+				   char **error_ptr)
+{
+	if (to_validate->owning_target->begin !=
+	    config->owning_target->begin) {
+		*error_ptr = "Starting sector cannot change";
+		return VDO_PARAMETER_MISMATCH;
+	}
+
+	if (to_validate->logical_block_size != config->logical_block_size) {
+		*error_ptr = "Logical block size cannot change";
+		return VDO_PARAMETER_MISMATCH;
+	}
+
+	if (to_validate->logical_blocks < config->logical_blocks) {
+		*error_ptr = "Can't shrink VDO logical size";
+		return VDO_PARAMETER_MISMATCH;
+	}
+
+	if (!may_grow
+	    && (to_validate->logical_blocks > config->logical_blocks)) {
+		*error_ptr = "VDO logical size may not grow in current state";
+		return VDO_NOT_IMPLEMENTED;
+	}
+
+	if (to_validate->cache_size != config->cache_size) {
+		*error_ptr = "Block map cache size cannot change";
+		return VDO_PARAMETER_MISMATCH;
+	}
+
+	if (to_validate->block_map_maximum_age !=
+	    config->block_map_maximum_age) {
+		*error_ptr = "Block map maximum age cannot change";
+		return VDO_PARAMETER_MISMATCH;
+	}
+
+	if (memcmp(&to_validate->thread_counts, &config->thread_counts,
+		   sizeof(struct thread_count_config)) != 0) {
+		*error_ptr = "Thread configuration cannot change";
+		return VDO_PARAMETER_MISMATCH;
+	}
+
+	if (to_validate->physical_blocks < config->physical_blocks) {
+		*error_ptr = "Removing physical storage from a VDO is not supported";
+		return VDO_NOT_IMPLEMENTED;
+	}
+
+	if (!may_grow
+	    && (to_validate->physical_blocks > config->physical_blocks)) {
+		*error_ptr = "VDO physical size may not grow in current state";
+		return VDO_NOT_IMPLEMENTED;
+	}
+
+	return VDO_SUCCESS;
+}
diff --git a/vdo/device-config.h b/vdo/device-config.h
new file mode 100644
index 00000000..438d90ad
--- /dev/null
+++ b/vdo/device-config.h
@@ -0,0 +1,86 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright Red Hat
+ */
+#ifndef DEVICE_CONFIG_H
+#define DEVICE_CONFIG_H
+
+#include <linux/device-mapper.h>
+#include <linux/list.h>
+
+#include "types.h"
+
+#include "kernel-types.h"
+
+/*
+ * This structure is memcmp'd for equality. Keep it packed and don't add any
+ * fields that are not properly set in both extant and parsed configs.
+ */
+struct thread_count_config {
+	int bio_ack_threads;
+	int bio_threads;
+	int bio_rotation_interval;
+	int cpu_threads;
+	int logical_zones;
+	int physical_zones;
+	int hash_zones;
+} __packed;
+
+struct device_config {
+	struct dm_target *owning_target;
+	struct dm_dev *owned_device;
+	struct vdo *vdo;
+	/* All configs referencing a layer are kept on a list in the layer */
+	struct list_head config_list;
+	char *original_string;
+	unsigned int version;
+	char *parent_device_name;
+	block_count_t physical_blocks;
+	/*
+	 * This is the number of logical blocks from VDO's internal point of
+	 * view. It is the number of 4K blocks regardless of the value of the
+	 * logical_block_size parameter below.
+	 */
+	block_count_t logical_blocks;
+	unsigned int logical_block_size;
+	unsigned int cache_size;
+	unsigned int block_map_maximum_age;
+	bool deduplication;
+	bool compression;
+	struct thread_count_config thread_counts;
+	block_count_t max_discard_blocks;
+};
+
+/**
+ * vdo_as_device_config() - Convert a list entry to the device_config that
+ *			    contains it.
+ * @entry: The list entry to convert.
+ *
+ * If non-NULL, the list must not be empty.
+ *
+ * Return: The device_config wrapping the list entry.
+ */ +static inline struct device_config *vdo_as_device_config(struct list_head *entry) +{ + if (entry == NULL) { + return NULL; + } + return list_entry(entry, struct device_config, config_list); +} + +int __must_check vdo_parse_device_config(int argc, + char **argv, + struct dm_target *ti, + struct device_config **config_ptr); + +void vdo_free_device_config(struct device_config *config); + +void vdo_set_device_config(struct device_config *config, struct vdo *vdo); + +int __must_check +vdo_validate_new_device_config(struct device_config *to_validate, + struct device_config *config, + bool may_grow, + char **error_ptr); + +#endif /* DEVICE_CONFIG_H */ diff --git a/vdo/device-registry.c b/vdo/device-registry.c new file mode 100644 index 00000000..2fde3bda --- /dev/null +++ b/vdo/device-registry.c @@ -0,0 +1,126 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright Red Hat + */ + +#include "device-registry.h" + +#include +#include +#include + +#include "memory-alloc.h" +#include "permassert.h" + +#include "kernel-types.h" +#include "status-codes.h" +#include "types.h" +#include "vdo.h" + +/* + * We don't expect this set to ever get really large, so a linked list + * is adequate. We can use a pointer_map if we need to later. + */ +struct device_registry { + struct list_head links; + /* + * XXX: (Some) Kernel docs say rwlocks are being deprecated in favor of + * RCU, please don't add more. Should we switch? + */ + rwlock_t lock; +}; + +static struct device_registry registry; + +/** + * vdo_initialize_device_registry_once() - Initialize the necessary + * structures for the device registry. + */ +void vdo_initialize_device_registry_once(void) +{ + INIT_LIST_HEAD(®istry.links); + rwlock_init(®istry.lock); +} + +/** + * vdo_is_equal() - Implements vdo_filter_t. + */ +static bool vdo_is_equal(struct vdo *vdo, void *context) +{ + return ((void *) vdo == context); +} + +/** + * filter_vdos_locked() - Find a vdo in the registry if it exists there. + * @filter: The filter function to apply to devices. + * @context: A bit of context to provide the filter. + * + * Context: Must be called holding the lock. + * + * Return: the vdo object found, if any. + */ +static struct vdo * __must_check +filter_vdos_locked(vdo_filter_t *filter, void *context) +{ + struct vdo *vdo; + + list_for_each_entry(vdo, ®istry.links, registration) { + if (filter(vdo, context)) { + return vdo; + } + } + + return NULL; +} + +/** + * vdo_register() - Register a VDO; it must not already be registered. + * @vdo: The vdo to register. + * + * Return: VDO_SUCCESS or an error. + */ +int vdo_register(struct vdo *vdo) +{ + int result; + + write_lock(®istry.lock); + result = ASSERT(filter_vdos_locked(vdo_is_equal, vdo) == NULL, + "VDO not already registered"); + if (result == VDO_SUCCESS) { + INIT_LIST_HEAD(&vdo->registration); + list_add_tail(&vdo->registration, ®istry.links); + } + write_unlock(®istry.lock); + + return result; +} + +/** + * vdo_unregister() - Remove a vdo from the device registry. + * @vdo: The vdo to remove. + */ +void vdo_unregister(struct vdo *vdo) +{ + write_lock(®istry.lock); + if (filter_vdos_locked(vdo_is_equal, vdo) == vdo) { + list_del_init(&vdo->registration); + } + + write_unlock(®istry.lock); +} + +/** + * vdo_find_matching() - Find and return the first (if any) vdo matching a + * given filter function. + * @filter: The filter function to apply to vdos. + * @context: A bit of context to provide the filter. 
+ */ +struct vdo *vdo_find_matching(vdo_filter_t *filter, void *context) +{ + struct vdo *vdo; + + read_lock(®istry.lock); + vdo = filter_vdos_locked(filter, context); + read_unlock(®istry.lock); + return vdo; +} diff --git a/vdo/device-registry.h b/vdo/device-registry.h new file mode 100644 index 00000000..d0b55c21 --- /dev/null +++ b/vdo/device-registry.h @@ -0,0 +1,28 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright Red Hat + */ + +#ifndef DEVICE_REGISTRY_H +#define DEVICE_REGISTRY_H + +#include "kernel-types.h" +#include "types.h" + +/** + * typedef vdo_filter_t - Method type for vdo matching methods. + * + * A filter function returns false if the vdo doesn't match. + */ +typedef bool vdo_filter_t(struct vdo *vdo, void *context); + +void vdo_initialize_device_registry_once(void); + +int __must_check vdo_register(struct vdo *vdo); + +void vdo_unregister(struct vdo *vdo); + +struct vdo * __must_check +vdo_find_matching(vdo_filter_t *filter, void *context); + +#endif /* DEVICE_REGISTRY_H */ diff --git a/vdo/deviceConfig.c b/vdo/deviceConfig.c deleted file mode 100644 index 999c4d4c..00000000 --- a/vdo/deviceConfig.c +++ /dev/null @@ -1,720 +0,0 @@ -/** - * Copyright Red Hat - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/sulfur/src/c++/vdo/kernel/deviceConfig.c#27 $ - */ - -#include "deviceConfig.h" - -#include - -#include "logger.h" -#include "memoryAlloc.h" -#include "stringUtils.h" - -#include "types.h" -#include "vdoInternal.h" - -#include "vdoStringUtils.h" - -#include "constants.h" - -enum { - // If we bump this, update the arrays below - TABLE_VERSION = 4, - // Limits used when parsing thread-count config spec strings - BIO_ROTATION_INTERVAL_LIMIT = 1024, - LOGICAL_THREAD_COUNT_LIMIT = 60, - PHYSICAL_THREAD_COUNT_LIMIT = 16, - THREAD_COUNT_LIMIT = 100, - // XXX The bio-submission queue configuration defaults are temporarily - // still being defined here until the new runtime-based thread - // configuration has been fully implemented for managed VDO devices. - - // How many bio submission work queues to use - DEFAULT_NUM_BIO_SUBMIT_QUEUES = 4, - // How often to rotate between bio submission work queues - DEFAULT_BIO_SUBMIT_QUEUE_ROTATE_INTERVAL = 64, -}; - -// arrays for handling different table versions -static const uint8_t REQUIRED_ARGC[] = { 10, 12, 9, 7, 6 }; -// pool name no longer used. only here for verification of older versions -static const uint8_t POOL_NAME_ARG_INDEX[] = { 8, 10, 8 }; - -/** - * Decide the version number from argv. 
- * - * @param [in] argc The number of table values - * @param [in] argv The array of table values - * @param [out] error_ptr A pointer to return a error string in - * @param [out] version_ptr A pointer to return the version - * - * @return VDO_SUCCESS or an error code - **/ -static int get_version_number(int argc, - char **argv, - char **error_ptr, - unsigned int *version_ptr) -{ - // version, if it exists, is in a form of V - if (sscanf(argv[0], "V%u", version_ptr) == 1) { - if (*version_ptr < 1 || *version_ptr > TABLE_VERSION) { - *error_ptr = "Unknown version number detected"; - return VDO_BAD_CONFIGURATION; - } - } else { - // V0 actually has no version number in the table string - *version_ptr = 0; - } - - // V0 and V1 have no optional parameters. There will always be - // a parameter for thread config, even if it's a "." to show - // it's an empty list. - if (*version_ptr <= 1) { - if (argc != REQUIRED_ARGC[*version_ptr]) { - *error_ptr = - "Incorrect number of arguments for version"; - return VDO_BAD_CONFIGURATION; - } - } else if (argc < REQUIRED_ARGC[*version_ptr]) { - *error_ptr = "Incorrect number of arguments for version"; - return VDO_BAD_CONFIGURATION; - } - - if (*version_ptr != TABLE_VERSION) { - uds_log_warning("Detected version mismatch between kernel module and tools kernel: %d, tool: %d", - TABLE_VERSION, - *version_ptr); - uds_log_warning("Please consider upgrading management tools to match kernel."); - } - return VDO_SUCCESS; -} - -/** - * Parse a two-valued option into a bool. - * - * @param [in] bool_str The string value to convert to a bool - * @param [in] true_str The string value which should be converted to true - * @param [in] false_str The string value which should be converted to false - * @param [out] bool_ptr A pointer to return the bool value in - * - * @return VDO_SUCCESS or an error if bool_str is neither true_str - * nor false_str - **/ -static inline int __must_check -parse_bool(const char *bool_str, - const char *true_str, - const char *false_str, - bool *bool_ptr) -{ - bool value = false; - - if (strcmp(bool_str, true_str) == 0) { - value = true; - } else if (strcmp(bool_str, false_str) == 0) { - value = false; - } else { - return VDO_BAD_CONFIGURATION; - } - - *bool_ptr = value; - return VDO_SUCCESS; -} - -/** - * Process one component of a thread parameter configuration string and - * update the configuration data structure. - * - * If the thread count requested is invalid, a message is logged and - * -EINVAL returned. If the thread name is unknown, a message is logged - * but no error is returned. 
- * - * @param thread_param_type The type of thread specified - * @param count The thread count requested - * @param config The configuration data structure to update - * - * @return VDO_SUCCESS or -EINVAL - **/ -static int process_one_thread_config_spec(const char *thread_param_type, - unsigned int count, - struct thread_count_config *config) -{ - // Handle limited thread parameters - if (strcmp(thread_param_type, "bioRotationInterval") == 0) { - if (count == 0) { - uds_log_error("thread config string error: 'bioRotationInterval' of at least 1 is required"); - return -EINVAL; - } else if (count > BIO_ROTATION_INTERVAL_LIMIT) { - uds_log_error("thread config string error: 'bioRotationInterval' cannot be higher than %d", - BIO_ROTATION_INTERVAL_LIMIT); - return -EINVAL; - } - config->bio_rotation_interval = count; - return VDO_SUCCESS; - } else if (strcmp(thread_param_type, "logical") == 0) { - if (count > LOGICAL_THREAD_COUNT_LIMIT) { - uds_log_error("thread config string error: at most %d 'logical' threads are allowed", - LOGICAL_THREAD_COUNT_LIMIT); - return -EINVAL; - } - config->logical_zones = count; - return VDO_SUCCESS; - } else if (strcmp(thread_param_type, "physical") == 0) { - if (count > PHYSICAL_THREAD_COUNT_LIMIT) { - uds_log_error("thread config string error: at most %d 'physical' threads are allowed", - PHYSICAL_THREAD_COUNT_LIMIT); - return -EINVAL; - } - config->physical_zones = count; - return VDO_SUCCESS; - } else { - // Handle other thread count parameters - if (count > THREAD_COUNT_LIMIT) { - uds_log_error("thread config string error: at most %d '%s' threads are allowed", - THREAD_COUNT_LIMIT, - thread_param_type); - return -EINVAL; - } - - if (strcmp(thread_param_type, "hash") == 0) { - config->hash_zones = count; - return VDO_SUCCESS; - } else if (strcmp(thread_param_type, "cpu") == 0) { - if (count == 0) { - uds_log_error("thread config string error: at least one 'cpu' thread required"); - return -EINVAL; - } - config->cpu_threads = count; - return VDO_SUCCESS; - } else if (strcmp(thread_param_type, "ack") == 0) { - config->bio_ack_threads = count; - return VDO_SUCCESS; - } else if (strcmp(thread_param_type, "bio") == 0) { - if (count == 0) { - uds_log_error("thread config string error: at least one 'bio' thread required"); - return -EINVAL; - } - config->bio_threads = count; - return VDO_SUCCESS; - } - } - - // Don't fail, just log. This will handle version mismatches between - // user mode tools and kernel. - uds_log_info("unknown thread parameter type \"%s\"", - thread_param_type); - return VDO_SUCCESS; -} - -/** - * Parse one component of a thread parameter configuration string and - * update the configuration data structure. 
- * - * @param spec The thread parameter specification string - * @param config The configuration data to be updated - **/ -static int parse_one_thread_config_spec(const char *spec, - struct thread_count_config *config) -{ - unsigned int count; - char **fields; - int result = vdo_split_string(spec, '=', &fields); - - if (result != UDS_SUCCESS) { - return result; - } - if ((fields[0] == NULL) || (fields[1] == NULL) || (fields[2] != NULL)) { - uds_log_error("thread config string error: expected thread parameter assignment, saw \"%s\"", - spec); - vdo_free_string_array(fields); - return -EINVAL; - } - - result = vdo_string_to_uint(fields[1], &count); - if (result != UDS_SUCCESS) { - uds_log_error("thread config string error: integer value needed, found \"%s\"", - fields[1]); - vdo_free_string_array(fields); - return result; - } - - result = process_one_thread_config_spec(fields[0], count, config); - vdo_free_string_array(fields); - return result; -} - -/** - * Parse the configuration string passed and update the specified - * counts and other parameters of various types of threads to be created. - * - * The configuration string should contain one or more comma-separated specs - * of the form "typename=number"; the supported type names are "cpu", "ack", - * "bio", "bioRotationInterval", "logical", "physical", and "hash". - * - * If an error occurs during parsing of a single key/value pair, we deem - * it serious enough to stop further parsing. - * - * This function can't set the "reason" value the caller wants to pass - * back, because we'd want to format it to say which field was - * invalid, and we can't allocate the "reason" strings dynamically. So - * if an error occurs, we'll log the details and pass back an error. - * - * @param string Thread parameter configuration string - * @param config The thread configuration data to update - * - * @return VDO_SUCCESS or -EINVAL or -ENOMEM - **/ -static int parse_thread_config_string(const char *string, - struct thread_count_config *config) -{ - int result = VDO_SUCCESS; - - char **specs; - - if (strcmp(".", string) != 0) { - unsigned int i; - result = vdo_split_string(string, ',', &specs); - if (result != UDS_SUCCESS) { - return result; - } - - for (i = 0; specs[i] != NULL; i++) { - result = parse_one_thread_config_spec(specs[i], config); - if (result != VDO_SUCCESS) { - break; - } - } - vdo_free_string_array(specs); - } - return result; -} - -/** - * Process one component of an optional parameter string and update - * the configuration data structure. - * - * If the value requested is invalid, a message is logged and -EINVAL - * returned. If the key is unknown, a message is logged but no error - * is returned. 
- * - * @param key The optional parameter key name - * @param value The optional parameter value - * @param config The configuration data structure to update - * - * @return VDO_SUCCESS or -EINVAL - **/ -static int process_one_key_value_pair(const char *key, - unsigned int value, - struct device_config *config) -{ - // Non thread optional parameters - if (strcmp(key, "maxDiscard") == 0) { - if (value == 0) { - uds_log_error("optional parameter error: at least one max discard block required"); - return -EINVAL; - } - // Max discard sectors in blkdev_issue_discard is UINT_MAX >> 9 - if (value > (UINT_MAX / VDO_BLOCK_SIZE)) { - uds_log_error("optional parameter error: at most %d max discard blocks are allowed", - UINT_MAX / VDO_BLOCK_SIZE); - return -EINVAL; - } - config->max_discard_blocks = value; - return VDO_SUCCESS; - } - // Handles unknown key names - return process_one_thread_config_spec(key, value, - &config->thread_counts); -} - -/** - * Parse one key/value pair and update the configuration - * data structure. - * - * @param key The optional key name - * @param value The optional value - * @param config The configuration data to be updated - * - * @return VDO_SUCCESS or error - **/ -static int parse_one_key_value_pair(const char *key, - const char *value, - struct device_config *config) -{ - unsigned int count; - int result; - - if (strcmp(key, "deduplication") == 0) { - return parse_bool(value, "on", "off", &config->deduplication); - } - - if (strcmp(key, "compression") == 0) { - return parse_bool(value, "on", "off", &config->compression); - } - - // The remaining arguments must have integral values. - result = vdo_string_to_uint(value, &count); - if (result != UDS_SUCCESS) { - uds_log_error("optional config string error: integer value needed, found \"%s\"", - value); - return result; - } - return process_one_key_value_pair(key, count, config); -} - -/** - * Parse all key/value pairs from a list of arguments. - * - * If an error occurs during parsing of a single key/value pair, we deem - * it serious enough to stop further parsing. - * - * This function can't set the "reason" value the caller wants to pass - * back, because we'd want to format it to say which field was - * invalid, and we can't allocate the "reason" strings dynamically. So - * if an error occurs, we'll log the details and return the error. - * - * @param argc The total number of arguments in list - * @param argv The list of key/value pairs - * @param config The device configuration data to update - * - * @return VDO_SUCCESS or error - **/ -static int parse_key_value_pairs(int argc, - char **argv, - struct device_config *config) -{ - int result = VDO_SUCCESS; - - while (argc) { - result = parse_one_key_value_pair(argv[0], argv[1], config); - if (result != VDO_SUCCESS) { - break; - } - - argc -= 2; - argv += 2; - } - - return result; -} - -/** - * Parse the configuration string passed in for optional arguments. - * - * For V0/V1 configurations, there will only be one optional parameter; - * the thread configuration. The configuration string should contain - * one or more comma-separated specs of the form "typename=number"; the - * supported type names are "cpu", "ack", "bio", "bioRotationInterval", - * "logical", "physical", and "hash". - * - * For V2 configurations and beyond, there could be any number of - * arguments. They should contain one or more key/value pairs - * separated by a space. 
- * - * @param arg_set The structure holding the arguments to parse - * @param error_ptr Pointer to a buffer to hold the error string - * @param config Pointer to device configuration data to update - * - * @return VDO_SUCCESS or error - */ -int parse_optional_arguments(struct dm_arg_set *arg_set, - char **error_ptr, - struct device_config *config) -{ - int result = VDO_SUCCESS; - - if (config->version == 0 || config->version == 1) { - result = parse_thread_config_string(arg_set->argv[0], - &config->thread_counts); - if (result != VDO_SUCCESS) { - *error_ptr = "Invalid thread-count configuration"; - return VDO_BAD_CONFIGURATION; - } - } else { - if ((arg_set->argc % 2) != 0) { - *error_ptr = "Odd number of optional arguments given but they should be pairs"; - return VDO_BAD_CONFIGURATION; - } - result = parse_key_value_pairs(arg_set->argc, - arg_set->argv, - config); - if (result != VDO_SUCCESS) { - *error_ptr = "Invalid optional argument configuration"; - return VDO_BAD_CONFIGURATION; - } - } - return result; -} - -/** - * Handle a parsing error. - * - * @param config The config to free - * @param error_ptr A place to store a constant string about the error - * @param error_str A constant string to store in error_ptr - **/ -static void handle_parse_error(struct device_config *config, - char **error_ptr, - char *error_str) -{ - free_vdo_device_config(config); - *error_ptr = error_str; -} - -/**********************************************************************/ -int parse_vdo_device_config(int argc, - char **argv, - struct dm_target *ti, - struct device_config **config_ptr) -{ - bool enable_512e; - struct dm_arg_set arg_set; - - char **error_ptr = &ti->error; - struct device_config *config = NULL; - int result = - UDS_ALLOCATE(1, struct device_config, "device_config", &config); - if (result != VDO_SUCCESS) { - handle_parse_error(config, - error_ptr, - "Could not allocate config structure"); - return VDO_BAD_CONFIGURATION; - } - - config->owning_target = ti; - INIT_LIST_HEAD(&config->config_list); - - // Save the original string. - result = vdo_join_strings(argv, argc, ' ', &config->original_string); - if (result != VDO_SUCCESS) { - handle_parse_error(config, - error_ptr, - "Could not populate string"); - return VDO_BAD_CONFIGURATION; - } - - // Set defaults. - // - // XXX Defaults for bio_threads and bio_rotation_interval are currently - // defined using the old configuration scheme of constants. These - // values are relied upon for performance testing on MGH machines - // currently. This should be replaced with the normally used testing - // defaults being defined in the file-based thread-configuration - // settings. The values used as defaults internally should really be - // those needed for VDO in its default shipped-product state. - config->thread_counts = (struct thread_count_config) { - .bio_ack_threads = 1, - .bio_threads = DEFAULT_NUM_BIO_SUBMIT_QUEUES, - .bio_rotation_interval = - DEFAULT_BIO_SUBMIT_QUEUE_ROTATE_INTERVAL, - .cpu_threads = 1, - .logical_zones = 0, - .physical_zones = 0, - .hash_zones = 0, - }; - config->max_discard_blocks = 1; - config->deduplication = true; - config->compression = false; - - arg_set.argc = argc; - arg_set.argv = argv; - - result = get_version_number(argc, argv, error_ptr, &config->version); - if (result != VDO_SUCCESS) { - // get_version_number sets error_ptr itself. - handle_parse_error(config, error_ptr, *error_ptr); - return result; - } - // Move the arg pointer forward only if the argument was there. 
- if (config->version >= 1) { - dm_shift_arg(&arg_set); - } - - result = uds_duplicate_string(dm_shift_arg(&arg_set), - "parent device name", - &config->parent_device_name); - if (result != VDO_SUCCESS) { - handle_parse_error(config, - error_ptr, - "Could not copy parent device name"); - return VDO_BAD_CONFIGURATION; - } - - // Get the physical blocks, if known. - if (config->version >= 1) { - result = kstrtoull(dm_shift_arg(&arg_set), - 10, - &config->physical_blocks); - if (result != VDO_SUCCESS) { - handle_parse_error(config, - error_ptr, - "Invalid physical block count"); - return VDO_BAD_CONFIGURATION; - } - } - - // Get the logical block size and validate - result = parse_bool(dm_shift_arg(&arg_set), - "512", - "4096", - &enable_512e); - if (result != VDO_SUCCESS) { - handle_parse_error(config, - error_ptr, - "Invalid logical block size"); - return VDO_BAD_CONFIGURATION; - } - config->logical_block_size = (enable_512e ? 512 : 4096); - - // Skip past the two no longer used read cache options. - if (config->version <= 1) { - dm_consume_args(&arg_set, 2); - } - - // Get the page cache size. - result = vdo_string_to_uint(dm_shift_arg(&arg_set), &config->cache_size); - if (result != VDO_SUCCESS) { - handle_parse_error(config, - error_ptr, - "Invalid block map page cache size"); - return VDO_BAD_CONFIGURATION; - } - - // Get the block map era length. - result = vdo_string_to_uint(dm_shift_arg(&arg_set), - &config->block_map_maximum_age); - if (result != VDO_SUCCESS) { - handle_parse_error(config, - error_ptr, - "Invalid block map maximum age"); - return VDO_BAD_CONFIGURATION; - } - - // Skip past the no longer used MD RAID5 optimization mode - if (config->version <= 2) { - dm_consume_args(&arg_set, 1); - } - - // Skip past the no longer used write policy setting - if (config->version <= 3) { - dm_consume_args(&arg_set, 1); - } - - // Skip past the no longer used pool name for older table lines - if (config->version <= 2) { - // Make sure the enum to get the pool name from argv directly - // is still in sync with the parsing of the table line. - if (&arg_set.argv[0] != - &argv[POOL_NAME_ARG_INDEX[config->version]]) { - handle_parse_error(config, - error_ptr, - "Pool name not in expected location"); - return VDO_BAD_CONFIGURATION; - } - dm_shift_arg(&arg_set); - } - - // Get the optional arguments and validate. - result = parse_optional_arguments(&arg_set, error_ptr, config); - if (result != VDO_SUCCESS) { - // parse_optional_arguments sets error_ptr itself. - handle_parse_error(config, error_ptr, *error_ptr); - return result; - } - - /* - * Logical, physical, and hash zone counts can all be zero; then we get - * one thread doing everything, our older configuration. If any zone - * count is non-zero, the others must be as well. 
- */ - if (((config->thread_counts.logical_zones == 0) != - (config->thread_counts.physical_zones == 0)) || - ((config->thread_counts.physical_zones == 0) != - (config->thread_counts.hash_zones == 0))) { - handle_parse_error(config, - error_ptr, - "Logical, physical, and hash zones counts must all be zero or all non-zero"); - return VDO_BAD_CONFIGURATION; - } - - if (config->cache_size < - (2 * MAXIMUM_VDO_USER_VIOS * config->thread_counts.logical_zones)) { - handle_parse_error(config, - error_ptr, - "Insufficient block map cache for logical zones"); - return VDO_BAD_CONFIGURATION; - } - - result = dm_get_device(ti, - config->parent_device_name, - dm_table_get_mode(ti->table), - &config->owned_device); - if (result != 0) { - uds_log_error("couldn't open device \"%s\": error %d", - config->parent_device_name, - result); - handle_parse_error(config, - error_ptr, - "Unable to open storage device"); - return VDO_BAD_CONFIGURATION; - } - - if (config->version == 0) { - uint64_t device_size = - i_size_read(config->owned_device->bdev->bd_inode); - - config->physical_blocks = device_size / VDO_BLOCK_SIZE; - } - - *config_ptr = config; - return result; -} - -/**********************************************************************/ -void free_vdo_device_config(struct device_config *config) -{ - if (config == NULL) { - return; - } - - if (config->owned_device != NULL) { - dm_put_device(config->owning_target, config->owned_device); - } - - UDS_FREE(config->parent_device_name); - UDS_FREE(config->index_name); - UDS_FREE(config->original_string); - - // Reduce the chance a use-after-free (as in BZ 1669960) happens to work. - memset(config, 0, sizeof(*config)); - - UDS_FREE(config); -} - -/**********************************************************************/ -void set_device_config_vdo(struct device_config *config, struct vdo *vdo) -{ - list_del_init(&config->config_list); - if (vdo != NULL) { - list_add_tail(&config->config_list, &vdo->device_config_list); - - } - - config->vdo = vdo; -} diff --git a/vdo/deviceConfig.h b/vdo/deviceConfig.h deleted file mode 100644 index 59139948..00000000 --- a/vdo/deviceConfig.h +++ /dev/null @@ -1,111 +0,0 @@ -/* - * Copyright Red Hat - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/sulfur/src/c++/vdo/kernel/deviceConfig.h#15 $ - */ -#ifndef DEVICE_CONFIG_H -#define DEVICE_CONFIG_H - -#include -#include - -#include "types.h" - -#include "kernelTypes.h" - -/* - * This structure is memcmp'd for equality. Keep it packed and don't add any - * fields that are not properly set in both extant and parsed configs. 
- */ -struct thread_count_config { - int bio_ack_threads; - int bio_threads; - int bio_rotation_interval; - int cpu_threads; - int logical_zones; - int physical_zones; - int hash_zones; -} __packed; - -struct device_config { - struct dm_target *owning_target; - struct dm_dev *owned_device; - struct vdo *vdo; - /** All configs referencing a layer are kept on a list in the layer */ - struct list_head config_list; - char *original_string; - unsigned int version; - char *parent_device_name; - char *index_name; - block_count_t physical_blocks; - unsigned int logical_block_size; - unsigned int cache_size; - unsigned int block_map_maximum_age; - bool deduplication; - bool compression; - struct thread_count_config thread_counts; - block_count_t max_discard_blocks; -}; - -/** - * Convert a list entry to the device_config that contains it. If non-NULL, - * the list must not be empty. - * - * @param entry The list entry to convert - * - * @return The device_config wrapping the list entry - **/ -static inline struct device_config *as_vdo_device_config(struct list_head *entry) -{ - if (entry == NULL) { - return NULL; - } - return list_entry(entry, struct device_config, config_list); -} - -/** - * Convert the dmsetup table into a struct device_config. - * - * @param [in] argc The number of table values - * @param [in] argv The array of table values - * @param [in] ti The target structure for this table - * @param [out] config_ptr A pointer to return the allocated config - * - * @return VDO_SUCCESS or an error code - **/ -int __must_check parse_vdo_device_config(int argc, - char **argv, - struct dm_target *ti, - struct device_config **config_ptr); - -/** - * Free a device config created by parse_vdo_device_config(). - * - * @param config The config to free - **/ -void free_vdo_device_config(struct device_config *config); - -/** - * Acquire or release a reference from the config to a vdo. - * - * @param config The config in question - * @param vdo The vdo in question - **/ -void set_device_config_vdo(struct device_config *config, struct vdo *vdo); - -#endif // DEVICE_CONFIG_H diff --git a/vdo/deviceRegistry.c b/vdo/deviceRegistry.c deleted file mode 100644 index 110ad0be..00000000 --- a/vdo/deviceRegistry.c +++ /dev/null @@ -1,122 +0,0 @@ -/* - * Copyright Red Hat - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/sulfur/src/c++/vdo/base/deviceRegistry.c#12 $ - */ - -#include "deviceRegistry.h" - -#include -#include -#include - -#include "memoryAlloc.h" -#include "permassert.h" - -#include "statusCodes.h" -#include "types.h" -#include "vdoInternal.h" - -/* - * We don't expect this set to ever get really large, so a linked list - * is adequate. We can use a pointer_map if we need to later. 
- */ -struct device_registry { - struct list_head links; - // XXX: (Some) Kernel docs say rwlocks are being deprecated in favor of - // RCU, please don't add more. Should we switch? - rwlock_t lock; -}; - -static struct device_registry registry; - -/**********************************************************************/ -void initialize_vdo_device_registry_once(void) -{ - INIT_LIST_HEAD(®istry.links); - rwlock_init(®istry.lock); -} - -/** - * Implements vdo_filter_t. - **/ -static bool vdo_is_equal(struct vdo *vdo, void *context) -{ - return ((void *) vdo == context); -} - -/** - * Find a vdo in the registry if it exists there. Must be called holding - * the lock. - * - * @param filter The filter function to apply to devices - * @param context A bit of context to provide the filter. - * - * @return the layer object found, if any - **/ -static struct vdo * __must_check -filter_vdos_locked(vdo_filter_t *filter, void *context) -{ - struct vdo *vdo; - - list_for_each_entry(vdo, ®istry.links, registration) { - if (filter(vdo, context)) { - return vdo; - } - } - - return NULL; -} - -/**********************************************************************/ -int register_vdo(struct vdo *vdo) -{ - int result; - - write_lock(®istry.lock); - result = ASSERT(filter_vdos_locked(vdo_is_equal, vdo) == NULL, - "VDO not already registered"); - if (result == VDO_SUCCESS) { - INIT_LIST_HEAD(&vdo->registration); - list_add_tail(&vdo->registration, ®istry.links); - } - write_unlock(®istry.lock); - - return result; -} - -/**********************************************************************/ -void unregister_vdo(struct vdo *vdo) -{ - write_lock(®istry.lock); - if (filter_vdos_locked(vdo_is_equal, vdo) == vdo) { - list_del_init(&vdo->registration); - } - - write_unlock(®istry.lock); -} - -/**********************************************************************/ -struct vdo *find_vdo_matching(vdo_filter_t *filter, void *context) -{ - struct vdo *vdo; - read_lock(®istry.lock); - vdo = filter_vdos_locked(filter, context); - read_unlock(®istry.lock); - return vdo; -} diff --git a/vdo/deviceRegistry.h b/vdo/deviceRegistry.h deleted file mode 100644 index dab0fbf4..00000000 --- a/vdo/deviceRegistry.h +++ /dev/null @@ -1,64 +0,0 @@ -/* - * Copyright Red Hat - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/sulfur/src/c++/vdo/base/deviceRegistry.h#8 $ - */ - -#ifndef DEVICE_REGISTRY_H -#define DEVICE_REGISTRY_H - -#include "types.h" - -/** - * Method type for vdo matching methods. - * - * A filter function returns false if the vdo doesn't match. - **/ -typedef bool vdo_filter_t(struct vdo *vdo, void *context); - -/** - * Initialize the necessary structures for the device registry. - **/ -void initialize_vdo_device_registry_once(void); - -/** - * Register a VDO; it must not already be registered. 
- * - * @param vdo The vdo to register - * - * @return VDO_SUCCESS or an error - **/ -int __must_check register_vdo(struct vdo *vdo); - -/** - * Remove a vdo from the device registry. - * - * @param vdo The vdo to remove - **/ -void unregister_vdo(struct vdo *vdo); - -/** - * Find and return the first (if any) vdo matching a given filter function. - * - * @param filter The filter function to apply to vdos - * @param context A bit of context to provide the filter - **/ -struct vdo * __must_check -find_vdo_matching(vdo_filter_t *filter, void *context); - -#endif // DEVICE_REGISTRY_H diff --git a/vdo/dirtyLists.c b/vdo/dirty-lists.c similarity index 61% rename from vdo/dirtyLists.c rename to vdo/dirty-lists.c index 64015089..f4ddb614 100644 --- a/vdo/dirtyLists.c +++ b/vdo/dirty-lists.c @@ -1,31 +1,15 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * Copyright Red Hat - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/sulfur/src/c++/vdo/base/dirtyLists.c#10 $ */ -#include "dirtyLists.h" +#include "dirty-lists.h" #include "logger.h" -#include "memoryAlloc.h" +#include "memory-alloc.h" #include "permassert.h" -#include "statusCodes.h" +#include "status-codes.h" #include "types.h" struct dirty_lists { @@ -47,8 +31,16 @@ struct dirty_lists { struct list_head lists[]; }; -/**********************************************************************/ -int make_vdo_dirty_lists(block_count_t maximum_age, +/** + * vdo_make_dirty_lists() - Construct a new set of dirty lists. + * @maximum_age: The age at which an element will be expired. + * @callback: The function to call when a set of elements have expired. + * @context: The context for the callback. + * @dirty_lists_ptr: A pointer to hold the new dirty_lists structure. + * + * Return: VDO_SUCCESS or an error. + */ +int vdo_make_dirty_lists(block_count_t maximum_age, vdo_dirty_callback *callback, void *context, struct dirty_lists **dirty_lists_ptr) @@ -75,8 +67,14 @@ int make_vdo_dirty_lists(block_count_t maximum_age, return VDO_SUCCESS; } -/**********************************************************************/ -void set_vdo_dirty_lists_current_period(struct dirty_lists *dirty_lists, +/** + * vdo_set_dirty_lists_current_period() - Set the current period. + * @dirty_lists: The dirty_lists. + * @period: The current period. + * + * This function should only be called once. + */ +void vdo_set_dirty_lists_current_period(struct dirty_lists *dirty_lists, sequence_number_t period) { ASSERT_LOG_ONLY(dirty_lists->next_period == 0, "current period not set"); @@ -86,10 +84,9 @@ void set_vdo_dirty_lists_current_period(struct dirty_lists *dirty_lists, } /** - * Expire the oldest list. - * - * @param dirty_lists The dirty_lists to expire - **/ + * expire_oldest_list() - Expire the oldest list. + * @dirty_lists: The dirty_lists to expire. 
+ */ static void expire_oldest_list(struct dirty_lists *dirty_lists) { struct list_head *dirty_list = @@ -107,11 +104,10 @@ static void expire_oldest_list(struct dirty_lists *dirty_lists) } /** - * Update the period if necessary. - * - * @param dirty_lists The dirty_lists structure - * @param period The new period - **/ + * update_period() - Update the period if necessary. + * @dirty_lists: The dirty_lists structure. + * @period: The new period. + */ static void update_period(struct dirty_lists *dirty_lists, sequence_number_t period) { @@ -125,10 +121,9 @@ static void update_period(struct dirty_lists *dirty_lists, } /** - * Write out the expired list. - * - * @param dirty_lists The dirty_lists - **/ + * write_expired_elements() - Write out the expired list. + * @dirty_lists: The dirty_lists. + */ static void write_expired_elements(struct dirty_lists *dirty_lists) { if (list_empty(&dirty_lists->expired)) { @@ -140,8 +135,16 @@ static void write_expired_elements(struct dirty_lists *dirty_lists) "no expired elements remain"); } -/**********************************************************************/ -void add_to_vdo_dirty_lists(struct dirty_lists *dirty_lists, +/** + * vdo_add_to_dirty_lists() - Add an element to the dirty lists. + * @dirty_lists: The dirty_lists structure receiving the element. + * @entry: The list entry of the element to add. + * @old_period: The period in which the element was previous dirtied, + * or 0 if it was not dirty. + * @new_period: The period in which the element has now been dirtied, + * or 0 if it does not hold a lock. + */ +void vdo_add_to_dirty_lists(struct dirty_lists *dirty_lists, struct list_head *entry, sequence_number_t old_period, sequence_number_t new_period) @@ -163,16 +166,28 @@ void add_to_vdo_dirty_lists(struct dirty_lists *dirty_lists, write_expired_elements(dirty_lists); } -/**********************************************************************/ -void advance_vdo_dirty_lists_period(struct dirty_lists *dirty_lists, +/** + * vdo_advance_dirty_lists_period() - Advance the current period. + * @dirty_lists: The dirty_lists to advance. + * @period: The new current period. + * + * If the current period is greater than the number of lists, expire + * the oldest lists. + */ +void vdo_advance_dirty_lists_period(struct dirty_lists *dirty_lists, sequence_number_t period) { update_period(dirty_lists, period); write_expired_elements(dirty_lists); } -/**********************************************************************/ -void flush_vdo_dirty_lists(struct dirty_lists *dirty_lists) +/** + * vdo_flush_dirty_lists() - Flush all dirty lists. + * @dirty_lists: The dirty_lists to flush. + * + * This will cause the period to be advanced past the current period. + */ +void vdo_flush_dirty_lists(struct dirty_lists *dirty_lists) { while (dirty_lists->oldest_period < dirty_lists->next_period) { expire_oldest_list(dirty_lists); diff --git a/vdo/dirty-lists.h b/vdo/dirty-lists.h new file mode 100644 index 00000000..fd3caf13 --- /dev/null +++ b/vdo/dirty-lists.h @@ -0,0 +1,53 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright Red Hat + */ + +#ifndef DIRTY_LISTS_H +#define DIRTY_LISTS_H + +#include + +#include "types.h" + +/** + * struct dirty_lists - Lists of dirty elements. + * + * A collection of lists of dirty elements ordered by age. An element is always + * placed on the oldest list in which it was dirtied (moving between lists or + * removing altogether is cheap). 
Whenever the current period is advanced, any + * elements older than the maxium age are expired. If an element is to be added + * with a dirty age older than the maximum age, it is expired immediately. + */ +struct dirty_lists; + +/** + * typedef vdo_dirty_callback - Callback for processing dirty elements. + * @expired: The list of expired elements. + * @context: The context for the callback. + * + * A function which will be called with a ring of dirty elements which have + * been expired. All of the expired elements must be removed from the ring + * before this function returns. + */ +typedef void vdo_dirty_callback(struct list_head *expired, void *context); + +int __must_check vdo_make_dirty_lists(block_count_t maximum_age, + vdo_dirty_callback *callback, + void *context, + struct dirty_lists **dirty_lists_ptr); + +void vdo_set_dirty_lists_current_period(struct dirty_lists *dirty_lists, + sequence_number_t period); + +void vdo_add_to_dirty_lists(struct dirty_lists *dirty_lists, + struct list_head *entry, + sequence_number_t old_period, + sequence_number_t new_period); + +void vdo_advance_dirty_lists_period(struct dirty_lists *dirty_lists, + sequence_number_t period); + +void vdo_flush_dirty_lists(struct dirty_lists *dirty_lists); + +#endif /* DIRTY_LISTS_H */ diff --git a/vdo/dirtyLists.h b/vdo/dirtyLists.h deleted file mode 100644 index 268d4304..00000000 --- a/vdo/dirtyLists.h +++ /dev/null @@ -1,106 +0,0 @@ -/* - * Copyright Red Hat - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/sulfur/src/c++/vdo/base/dirtyLists.h#8 $ - */ - -#ifndef DIRTY_LISTS_H -#define DIRTY_LISTS_H - -#include - -#include "types.h" - -/** - * A collection of lists of dirty elements ordered by age. An element is always - * placed on the oldest list in which it was dirtied (moving between lists or - * removing altogether is cheap). Whenever the current period is advanced, any - * elements older than the maxium age are expired. If an element is to be added - * with a dirty age older than the maximum age, it is expired immediately. - **/ -struct dirty_lists; - -/** - * A function which will be called with a ring of dirty elements which have - * been expired. All of the expired elements must be removed from the ring - * before this function returns. - * - * @param expired The list of expired elements - * @param context The context for the callback - **/ -typedef void vdo_dirty_callback(struct list_head *expired, void *context); - -/** - * Construct a new set of dirty lists. 
- * - * @param [in] maximum_age The age at which an element will be expired - * @param [in] callback The function to call when a set of elements - * have expired - * @param [in] context The context for the callback - * @param [out] dirty_lists_ptr A pointer to hold the new dirty_lists structure - * - * @return VDO_SUCCESS or an error - **/ -int __must_check make_vdo_dirty_lists(block_count_t maximum_age, - vdo_dirty_callback *callback, - void *context, - struct dirty_lists **dirty_lists_ptr); - -/** - * Set the current period. This function should only be called once. - * - * @param dirty_lists The dirty_lists - * @param period The current period - **/ -void set_vdo_dirty_lists_current_period(struct dirty_lists *dirty_lists, - sequence_number_t period); - -/** - * Add an element to the dirty lists. - * - * @param dirty_lists The dirty_lists structure receiving the element - * @param entry The list entry of the element to add - * @param old_period The period in which the element was previous dirtied, - * or 0 if it was not dirty - * @param new_period The period in which the element has now been dirtied, - * or 0 if it does not hold a lock - **/ -void add_to_vdo_dirty_lists(struct dirty_lists *dirty_lists, - struct list_head *entry, - sequence_number_t old_period, - sequence_number_t new_period); - -/** - * Advance the current period. If the current period is greater than the number - * of lists, expire the oldest lists. - * - * @param dirty_lists The dirty_lists to advance - * @param period The new current period - **/ -void advance_vdo_dirty_lists_period(struct dirty_lists *dirty_lists, - sequence_number_t period); - -/** - * Flush all dirty lists. This will cause the period to be advanced past the - * current period. - * - * @param dirty_lists The dirty_lists to flush - **/ -void flush_vdo_dirty_lists(struct dirty_lists *dirty_lists); - -#endif // DIRTY_LISTS_H diff --git a/vdo/dmvdo.c b/vdo/dm-vdo-target.c similarity index 51% rename from vdo/dmvdo.c rename to vdo/dm-vdo-target.c index 284ebad0..045d3934 100644 --- a/vdo/dmvdo.c +++ b/vdo/dm-vdo-target.c @@ -1,111 +1,67 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * Copyright Red Hat - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. 
- * - * $Id: //eng/vdo-releases/sulfur/src/c++/vdo/kernel/dmvdo.c#66 $ */ -#include "dmvdo.h" - -#include #include -#include "logger.h" -#include "memoryAlloc.h" -#include "threadDevice.h" -#include "threadRegistry.h" - +#include "bio.h" #include "constants.h" -#include "threadConfig.h" -#include "vdo.h" - -#include "dedupeIndex.h" -#include "deviceRegistry.h" +#include "data-vio-pool.h" +#include "dedupe-index.h" +#include "device-registry.h" #include "dump.h" -#include "instanceNumber.h" -#include "ioSubmitter.h" -#include "kernelLayer.h" +#include "flush.h" +#include "instance-number.h" +#include "io-submitter.h" +#include "logger.h" +#include "memory-alloc.h" #include "messageStats.h" -#include "stringUtils.h" +#include "string-utils.h" +#include "thread-config.h" +#include "thread-device.h" +#include "thread-registry.h" +#include "uds-sysfs.h" #include "vdo.h" -#include "vdoInit.h" -#include "vdoLoad.h" +#include "vdo-load.h" +#include "vdo-resume.h" +#include "vdo-suspend.h" -enum vdo_module_status vdo_module_status; - -/**********************************************************************/ - -/** - * Get the vdo associated with a dm target structure. - * - * @param ti The dm target structure - * - * @return The vdo NULL. - **/ static struct vdo *get_vdo_for_target(struct dm_target *ti) { return ((struct device_config *) ti->private)->vdo; } -/** - * Get the kernel layer associated with a dm target structure. - * - * @param ti The dm target structure - * - * @return The kernel layer, or NULL. - **/ -static struct kernel_layer *get_kernel_layer_for_target(struct dm_target *ti) -{ - return vdo_as_kernel_layer(get_vdo_for_target(ti)); -} - -/** - * Begin VDO processing of a bio. This is called by the device mapper - * through the "map" function, and has resulted from a bio being - * submitted. - * - * @param ti The dm_target. We only need the "private" member to access - * the vdo - * @param bio The bio. - * - * @return One of these values: - * - * negative A negative value is an error code. - * Usually -EIO. - * - * DM_MAPIO_SUBMITTED VDO will take care of this I/O, either - * processing it completely and calling - * bio_endio, or forwarding it onward by - * submitting it to the next layer. - * - * DM_MAPIO_REMAPPED VDO has modified the bio and the device - * mapper will immediately forward the bio - * onward by submitting it to the next layer. - * - * DM_MAPIO_REQUEUE We do not use this. It is used by device - * mapper devices to defer an I/O request - * during suspend/resume processing. - **/ static int vdo_map_bio(struct dm_target *ti, struct bio *bio) { - return vdo_launch_bio(get_vdo_for_target(ti), bio); + struct vdo *vdo = get_vdo_for_target(ti); + struct vdo_work_queue *current_work_queue; + const struct admin_state_code *code + = vdo_get_admin_state_code(&vdo->admin_state); + + ASSERT_LOG_ONLY(code->normal, + "vdo should not receive bios while in state %s", + code->name); + + /* Count all incoming bios. */ + vdo_count_bios(&vdo->stats.bios_in, bio); + + + /* Handle empty bios. Empty flush bios are not associated with a vio. 
*/ + if ((bio_op(bio) == REQ_OP_FLUSH) || + ((bio->bi_opf & REQ_PREFLUSH) != 0)) { + vdo_launch_flush(vdo, bio); + return DM_MAPIO_SUBMITTED; + } + + /* This could deadlock, */ + current_work_queue = get_current_work_queue(); + BUG_ON((current_work_queue != NULL) + && (vdo == get_work_queue_owner(current_work_queue)->vdo)); + vdo_launch_bio(vdo->data_vio_pool, bio); + return DM_MAPIO_SUBMITTED; } -/**********************************************************************/ static void vdo_io_hints(struct dm_target *ti, struct queue_limits *limits) { struct vdo *vdo = get_vdo_for_target(ti); @@ -113,9 +69,9 @@ static void vdo_io_hints(struct dm_target *ti, struct queue_limits *limits) limits->logical_block_size = vdo->device_config->logical_block_size; limits->physical_block_size = VDO_BLOCK_SIZE; - // The minimum io size for random io + /* The minimum io size for random io */ blk_limits_io_min(limits, VDO_BLOCK_SIZE); - // The optimal io size for streamed/sequential io + /* The optimal io size for streamed/sequential io */ blk_limits_io_opt(limits, VDO_BLOCK_SIZE); /* @@ -138,20 +94,24 @@ static void vdo_io_hints(struct dm_target *ti, struct queue_limits *limits) limits->max_discard_sectors = (vdo->device_config->max_discard_blocks * VDO_SECTORS_PER_BLOCK); - // Force discards to not begin or end with a partial block by stating - // the granularity is 4k. + /* + * Force discards to not begin or end with a partial block by stating + * the granularity is 4k. + */ limits->discard_granularity = VDO_BLOCK_SIZE; } -/**********************************************************************/ static int vdo_iterate_devices(struct dm_target *ti, iterate_devices_callout_fn fn, void *data) { - struct vdo *vdo = get_vdo_for_target(ti); - sector_t len = block_to_sector(vdo->device_config->physical_blocks); + struct device_config *config = get_vdo_for_target(ti)->device_config; - return fn(ti, vdo->device_config->owned_device, 0, len, data); + return fn(ti, + config->owned_device, + 0, + config->physical_blocks * VDO_SECTORS_PER_BLOCK, + data); } /* @@ -160,7 +120,6 @@ static int vdo_iterate_devices(struct dm_target *ti, * */ -/**********************************************************************/ static void vdo_status(struct dm_target *ti, status_type_t status_type, unsigned int status_flags, @@ -171,84 +130,66 @@ static void vdo_status(struct dm_target *ti, struct vdo_statistics *stats; struct device_config *device_config; char name_buffer[BDEVNAME_SIZE]; - // N.B.: The DMEMIT macro uses the variables named "sz", "result", - // "maxlen". + /* + * N.B.: The DMEMIT macro uses the variables named "sz", "result", + * "maxlen". + */ int sz = 0; switch (status_type) { case STATUSTYPE_INFO: - // Report info for dmsetup status + /* Report info for dmsetup status */ mutex_lock(&vdo->stats_mutex); - fetch_vdo_statistics(vdo, &vdo->stats_buffer); + vdo_fetch_statistics(vdo, &vdo->stats_buffer); stats = &vdo->stats_buffer; DMEMIT("/dev/%s %s %s %s %s %llu %llu", - bdevname(get_vdo_backing_device(vdo), name_buffer), + bdevname(vdo_get_backing_device(vdo), name_buffer), stats->mode, stats->in_recovery_mode ? "recovering" : "-", - get_vdo_dedupe_index_state_name(vdo->dedupe_index), - get_vdo_compressing(vdo) ? "online" : "offline", + vdo_get_dedupe_index_state_name(vdo->dedupe_index), + vdo_get_compressing(vdo) ? 
"online" : "offline", stats->data_blocks_used + stats->overhead_blocks_used, stats->physical_blocks); mutex_unlock(&vdo->stats_mutex); break; case STATUSTYPE_TABLE: - // Report the string actually specified in the beginning. + /* Report the string actually specified in the beginning. */ device_config = (struct device_config *) ti->private; DMEMIT("%s", device_config->original_string); break; - // XXX We ought to print more detailed output here, but this is what - // thin does. + /* + * FIXME: We ought to print more detailed output here, but this is what + * thin does. + */ case STATUSTYPE_IMA: *result = '\0'; break; } } - -/** - * Get the size of a vdo's underlying device, in blocks. - * - * @param vdo The vdo - * - * @return The size in blocks - **/ static block_count_t __must_check get_underlying_device_block_count(const struct vdo *vdo) { - return (i_size_read(get_vdo_backing_device(vdo)->bd_inode) + return (i_size_read(vdo_get_backing_device(vdo)->bd_inode) / VDO_BLOCK_SIZE); } -/** - * Process a dmsetup message now that we know no other message is being - * processed. - * - * @param vdo The vdo to which the message was sent - * @param argc The argument count of the message - * @param argv The arguments to the message - * - * @return -EINVAL if the message is unrecognized or the result of processing - * the message - **/ static int __must_check process_vdo_message_locked(struct vdo *vdo, unsigned int argc, char **argv) { - // Messages with fixed numbers of arguments. - switch (argc) { - - case 2: + if (argc == 2) { if (strcasecmp(argv[0], "compression") == 0) { if (strcasecmp(argv[1], "on") == 0) { - set_vdo_compressing(vdo, true); + vdo_set_compressing(vdo, true); return 0; } if (strcasecmp(argv[1], "off") == 0) { - set_vdo_compressing(vdo, false); + vdo_set_compressing(vdo, false); return 0; } @@ -256,29 +197,17 @@ process_vdo_message_locked(struct vdo *vdo, argv[1]); return -EINVAL; } - - break; - - - default: - break; } uds_log_warning("unrecognized dmsetup message '%s' received", argv[0]); return -EINVAL; } -/** - * Process a dmsetup message. If the message is a dump, just do it. Otherwise, - * check that no other message is being processed, and only proceed if so. - * - * @param vdo The vdo to which the message was sent - * @param argc The argument count of the message - * @param argv The arguments to the message - * - * @return -EBUSY if another message is being processed or the result of - * processsing the message - **/ +/* + * If the message is a dump, just do it. Otherwise, check that no other message + * is being processed, and only proceed if so. + * Returns -EBUSY if another message is being processed + */ static int __must_check process_vdo_message(struct vdo *vdo, unsigned int argc, char **argv) { @@ -291,7 +220,7 @@ process_vdo_message(struct vdo *vdo, unsigned int argc, char **argv) * process_vdo_message_locked(). 
*/ - // Dump messages should always be processed + /* Dump messages should always be processed */ if (strcasecmp(argv[0], "dump") == 0) { return vdo_dump(vdo, argc, argv, "dmsetup message"); } @@ -302,12 +231,12 @@ process_vdo_message(struct vdo *vdo, unsigned int argc, char **argv) return 0; } - // Index messages should always be processed + /* Index messages should always be processed */ if ((strcasecmp(argv[0], "index-close") == 0) || (strcasecmp(argv[0], "index-create") == 0) || (strcasecmp(argv[0], "index-disable") == 0) || (strcasecmp(argv[0], "index-enable") == 0)) { - return message_vdo_dedupe_index(vdo->dedupe_index, + return vdo_message_dedupe_index(vdo->dedupe_index, argv[0]); } } @@ -323,7 +252,6 @@ process_vdo_message(struct vdo *vdo, unsigned int argc, char **argv) return result; } -/**********************************************************************/ static int vdo_message(struct dm_target *ti, unsigned int argc, char **argv, @@ -343,16 +271,18 @@ static int vdo_message(struct dm_target *ti, uds_register_allocating_thread(&allocating_thread, NULL); uds_register_thread_device_id(&instance_thread, &vdo->instance); - // Must be done here so we don't map return codes. The code in - // dm-ioctl expects a 1 for a return code to look at the buffer - // and see if it is full or not. + /* + * Must be done here so we don't map return codes. The code in + * dm-ioctl expects a 1 for a return code to look at the buffer + * and see if it is full or not. + */ if ((argc == 1) && (strcasecmp(argv[0], "stats") == 0)) { - write_vdo_stats(vdo, result_buffer, maxlen); + vdo_write_stats(vdo, result_buffer, maxlen); result = 1; } else { - result = map_to_system_error(process_vdo_message(vdo, - argc, - argv)); + result = vdo_map_to_system_error(process_vdo_message(vdo, + argc, + argv)); } uds_unregister_thread_device_id(); @@ -360,11 +290,6 @@ static int vdo_message(struct dm_target *ti, return result; } -/** - * Configure the dm_target with our capabilities. - * - * @param ti The device mapper target representing our device - **/ static void configure_target_capabilities(struct dm_target *ti) { ti->discards_supported = 1; @@ -372,45 +297,36 @@ static void configure_target_capabilities(struct dm_target *ti) ti->num_discard_bios = 1; ti->num_flush_bios = 1; - // If this value changes, please make sure to update the - // value for max_discard_sectors accordingly. + /* + * If this value changes, please make sure to update the + * value for max_discard_sectors accordingly. + */ BUG_ON(dm_set_target_max_io_len(ti, VDO_SECTORS_PER_BLOCK) != 0); } -/** +/* * Implements vdo_filter_t. 
- **/ + */ static bool vdo_uses_device(struct vdo *vdo, void *context) { struct device_config *config = context; - return (get_vdo_backing_device(vdo)->bd_dev + + return (vdo_get_backing_device(vdo)->bd_dev == config->owned_device->bdev->bd_dev); } -/** - * Initializes a single VDO instance and loads the data from disk - * - * @param ti The device mapper target representing our device - * @param instance The device instantiation counter - * @param config The parsed config for the instance - * - * @return VDO_SUCCESS or an error code - * - **/ static int vdo_initialize(struct dm_target *ti, unsigned int instance, struct device_config *config) { - char *failure_reason; struct vdo *vdo; - struct kernel_layer *layer; int result; uint64_t block_size = VDO_BLOCK_SIZE; uint64_t logical_size = to_bytes(ti->len); block_count_t logical_blocks = logical_size / block_size; - uds_log_info("loading device '%s'", get_vdo_device_name(ti)); + uds_log_info("loading device '%s'", vdo_get_device_name(ti)); uds_log_debug("Logical block size = %llu", (uint64_t) config->logical_block_size); uds_log_debug("Logical blocks = %llu", logical_blocks); @@ -425,28 +341,25 @@ static int vdo_initialize(struct dm_target *ti, (config->compression ? "on" : "off")); - vdo = find_vdo_matching(vdo_uses_device, config); + vdo = vdo_find_matching(vdo_uses_device, config); if (vdo != NULL) { uds_log_error("Existing vdo already uses device %s", vdo->device_config->parent_device_name); - release_vdo_instance(instance); + vdo_release_instance(instance); ti->error = "Cannot share storage device with already-running VDO"; return VDO_BAD_CONFIGURATION; } - result = make_kernel_layer(instance, - config, - &failure_reason, - &layer); + result = vdo_make(instance, config, &ti->error, &vdo); if (result != VDO_SUCCESS) { uds_log_error("Could not create VDO device. (VDO error %d, message %s)", result, - failure_reason); - ti->error = failure_reason; + ti->error); + vdo_destroy(vdo); return result; } - result = prepare_to_load_vdo(&layer->vdo); + result = vdo_prepare_to_load(vdo); if (result != VDO_SUCCESS) { ti->error = ((result == VDO_INVALID_ADMIN_STATE) ? "Pre-load is only valid immediately after initialization" @@ -454,28 +367,28 @@ static int vdo_initialize(struct dm_target *ti, uds_log_error("Could not start VDO device. (VDO error %d, message %s)", result, ti->error); - free_kernel_layer(layer); + vdo_destroy(vdo); return result; } - set_device_config_vdo(config, &layer->vdo); - set_vdo_active_config(&layer->vdo, config); + vdo_set_device_config(config, vdo); + vdo->device_config = config; ti->private = config; configure_target_capabilities(ti); return VDO_SUCCESS; } -/** +/* * Implements vdo_filter_t. 
- **/ + */ static bool __must_check vdo_is_named(struct vdo *vdo, void *context) { struct dm_target *ti = vdo->device_config->owning_target; - const char *device_name = get_vdo_device_name(ti); + const char *device_name = vdo_get_device_name(ti); + return (strcmp(device_name, (const char *) context) == 0); } -/**********************************************************************/ static int vdo_ctr(struct dm_target *ti, unsigned int argc, char **argv) { int result = VDO_SUCCESS; @@ -487,10 +400,10 @@ static int vdo_ctr(struct dm_target *ti, unsigned int argc, char **argv) uds_register_allocating_thread(&allocating_thread, NULL); - device_name = get_vdo_device_name(ti); - old_vdo = find_vdo_matching(vdo_is_named, (void *) device_name); + device_name = vdo_get_device_name(ti); + old_vdo = vdo_find_matching(vdo_is_named, (void *) device_name); if (old_vdo == NULL) { - result = allocate_vdo_instance(&instance); + result = vdo_allocate_instance(&instance); if (result != VDO_SUCCESS) { uds_unregister_allocating_thread(); return -ENOMEM; @@ -500,35 +413,31 @@ static int vdo_ctr(struct dm_target *ti, unsigned int argc, char **argv) } uds_register_thread_device_id(&instance_thread, &instance); - result = parse_vdo_device_config(argc, argv, ti, &config); + result = vdo_parse_device_config(argc, argv, ti, &config); if (result != VDO_SUCCESS) { uds_unregister_thread_device_id(); uds_unregister_allocating_thread(); if (old_vdo == NULL) { - release_vdo_instance(instance); + vdo_release_instance(instance); } return -EINVAL; } - // Is there already a device of this name? + /* Is there already a device of this name? */ if (old_vdo != NULL) { - struct kernel_layer *layer = vdo_as_kernel_layer(old_vdo); - /* - * To preserve backward compatibility with old VDO Managers, we - * need to allow this to happen when either suspended or not. - * We could assert that if the config is version 0, we are - * suspended, and if not, we are not, but we can't do that - * until new VDO Manager does the right order. 
- */ + bool may_grow = (vdo_get_admin_state(old_vdo) + != VDO_ADMIN_STATE_PRE_LOADED); + uds_log_info("preparing to modify device '%s'", device_name); - result = prepare_to_modify_kernel_layer(layer, - config, - &ti->error); + result = vdo_prepare_to_modify(old_vdo, + config, + may_grow, + &ti->error); if (result != VDO_SUCCESS) { - result = map_to_system_error(result); - free_vdo_device_config(UDS_FORGET(config)); + result = vdo_map_to_system_error(result); + vdo_free_device_config(UDS_FORGET(config)); } else { - set_device_config_vdo(config, old_vdo); + vdo_set_device_config(config, old_vdo); ti->private = config; configure_target_capabilities(ti); } @@ -539,9 +448,9 @@ static int vdo_ctr(struct dm_target *ti, unsigned int argc, char **argv) result = vdo_initialize(ti, instance, config); if (result != VDO_SUCCESS) { - // vdo_initialize calls into various VDO routines, so map error - result = map_to_system_error(result); - free_vdo_device_config(config); + /* vdo_initialize calls into various VDO routines, so map error */ + result = vdo_map_to_system_error(result); + vdo_free_device_config(config); } uds_unregister_thread_device_id(); @@ -549,90 +458,66 @@ static int vdo_ctr(struct dm_target *ti, unsigned int argc, char **argv) return result; } -/**********************************************************************/ static void vdo_dtr(struct dm_target *ti) { struct device_config *config = ti->private; struct vdo *vdo = config->vdo; - set_device_config_vdo(config, NULL); + vdo_set_device_config(config, NULL); if (list_empty(&vdo->device_config_list)) { const char *device_name; - // This was the last config referencing the VDO. Free it. + /* This was the last config referencing the VDO. Free it. */ unsigned int instance = vdo->instance; struct registered_thread allocating_thread, instance_thread; uds_register_thread_device_id(&instance_thread, &instance); uds_register_allocating_thread(&allocating_thread, NULL); - device_name = get_vdo_device_name(ti); + device_name = vdo_get_device_name(ti); uds_log_info("stopping device '%s'", device_name); if (vdo->dump_on_shutdown) { vdo_dump_all(vdo, "device shutdown"); } - free_kernel_layer(vdo_as_kernel_layer(vdo)); + vdo_destroy(UDS_FORGET(vdo)); uds_log_info("device '%s' stopped", device_name); uds_unregister_thread_device_id(); uds_unregister_allocating_thread(); } else if (config == vdo->device_config) { - // The VDO still references this config. Give it a reference - // to a config that isn't being destroyed. + /* + * The VDO still references this config. Give it a reference + * to a config that isn't being destroyed. + */ vdo->device_config = - as_vdo_device_config(vdo->device_config_list.next); + vdo_as_device_config(vdo->device_config_list.next); } - free_vdo_device_config(config); + vdo_free_device_config(config); ti->private = NULL; } -/**********************************************************************/ static void vdo_presuspend(struct dm_target *ti) { - struct vdo *vdo = get_vdo_for_target(ti); - struct registered_thread instance_thread; - - uds_register_thread_device_id(&instance_thread, &vdo->instance); - if (dm_noflush_suspending(ti)) { - vdo->no_flush_suspend = true; - } - uds_unregister_thread_device_id(); + get_vdo_for_target(ti)->suspend_type + = (dm_noflush_suspending(ti) + ? 
VDO_ADMIN_STATE_SUSPENDING + : VDO_ADMIN_STATE_SAVING); } -/**********************************************************************/ static void vdo_postsuspend(struct dm_target *ti) { - struct kernel_layer *layer = get_kernel_layer_for_target(ti); + struct vdo *vdo = get_vdo_for_target(ti); struct registered_thread instance_thread; - const char *device_name; - int result; - - uds_register_thread_device_id(&instance_thread, &layer->vdo.instance); - device_name = get_vdo_device_name(ti); - uds_log_info("suspending device '%s'", device_name); - result = suspend_kernel_layer(layer); - - // Treat VDO_READ_ONLY as a success since a read-only suspension still - // leaves the VDO suspended. - if ((result == VDO_SUCCESS) || (result == VDO_READ_ONLY)) { - uds_log_info("device '%s' suspended", device_name); - } else { - uds_log_error("suspend of device '%s' failed with error: %d", - device_name, - result); - } - - layer->vdo.no_flush_suspend = false; + uds_register_thread_device_id(&instance_thread, &vdo->instance); + vdo_suspend(vdo); uds_unregister_thread_device_id(); } -/**********************************************************************/ static int vdo_preresume(struct dm_target *ti) { struct vdo *vdo = get_vdo_for_target(ti); - struct kernel_layer *layer = vdo_as_kernel_layer(vdo); struct device_config *config = ti->private; struct registered_thread instance_thread; const char *device_name; @@ -640,10 +525,11 @@ static int vdo_preresume(struct dm_target *ti) int result; uds_register_thread_device_id(&instance_thread, &vdo->instance); - device_name = get_vdo_device_name(ti); + device_name = vdo_get_device_name(ti); backing_blocks = get_underlying_device_block_count(vdo); if (backing_blocks < config->physical_blocks) { + /* FIXME: can this still happen? */ uds_log_error("resume of device '%s' failed: backing device has %llu blocks but VDO physical size is %llu blocks", device_name, (unsigned long long) backing_blocks, @@ -652,61 +538,33 @@ static int vdo_preresume(struct dm_target *ti) return -EINVAL; } - if (get_vdo_admin_state(vdo) == VDO_ADMIN_STATE_PRE_LOADED) { - char *failure_reason; - int result; - - // This is the first time this device has been resumed, so run - // it. - uds_log_info("starting device '%s'", device_name); - result = start_kernel_layer(layer, &failure_reason); - + if (vdo_get_admin_state(vdo) == VDO_ADMIN_STATE_PRE_LOADED) { + result = vdo_load(vdo); if (result != VDO_SUCCESS) { - uds_log_error("Could not start VDO device. (VDO error %d, message %s)", - result, - failure_reason); - vdo_enter_read_only_mode(vdo->read_only_notifier, - result); uds_unregister_thread_device_id(); - return map_to_system_error(result); + return vdo_map_to_system_error(result); } - uds_log_info("device '%s' started", device_name); } uds_log_info("resuming device '%s'", device_name); - - // This is a noop if nothing has changed, and by calling it every time - // we capture old-style growPhysicals, which change the config in place. 
- result = modify_kernel_layer(layer, config); - - if (result != VDO_SUCCESS) { - uds_log_error_strerror(result, - "Commit of modifications to device '%s' failed", - device_name); - set_vdo_active_config(vdo, config); - vdo_enter_read_only_mode(vdo->read_only_notifier, result); - } else { - set_vdo_active_config(vdo, config); - result = resume_kernel_layer(layer); - if (result != VDO_SUCCESS) { - uds_log_error("resume of device '%s' failed with error: %d", - device_name, result); - } + result = vdo_preresume_internal(vdo, config, device_name); + if ((result == VDO_PARAMETER_MISMATCH) + || (result == VDO_INVALID_ADMIN_STATE)) { + result = -EINVAL; } uds_unregister_thread_device_id(); - return map_to_system_error(result); + return vdo_map_to_system_error(result); } -/**********************************************************************/ static void vdo_resume(struct dm_target *ti) { - struct kernel_layer *layer = get_kernel_layer_for_target(ti); struct registered_thread instance_thread; - uds_register_thread_device_id(&instance_thread, &layer->vdo.instance); - uds_log_info("device '%s' resumed", get_vdo_device_name(ti)); + uds_register_thread_device_id(&instance_thread, + &get_vdo_for_target(ti)->instance); + uds_log_info("device '%s' resumed", vdo_get_device_name(ti)); uds_unregister_thread_device_id(); } @@ -719,7 +577,7 @@ static void vdo_resume(struct dm_target *ti) static struct target_type vdo_target_bio = { .features = DM_TARGET_SINGLETON, .name = "vdo", - .version = { 8, 1, 0 }, + .version = { 8, 2, 0 }, .module = THIS_MODULE, .ctr = vdo_ctr, .dtr = vdo_dtr, @@ -736,60 +594,64 @@ static struct target_type vdo_target_bio = { static bool dm_registered; -/**********************************************************************/ -static void vdo_destroy(void) +static void vdo_module_destroy(void) { uds_log_debug("in %s", __func__); - vdo_module_status = VDO_MODULE_SHUTTING_DOWN; - - vdo_module_status = VDO_MODULE_UNINITIALIZED; - if (dm_registered) { dm_unregister_target(&vdo_target_bio); } - clean_up_vdo_instance_number_tracking(); + vdo_clean_up_instance_number_tracking(); uds_log_info("unloaded version %s", CURRENT_VERSION); } -/**********************************************************************/ static int __init vdo_init(void) { int result = 0; - initialize_vdo_device_registry_once(); + /* + * UDS module level initialization must be done first, as VDO + * initialization depends on it + */ + uds_initialize_thread_device_registry(); + uds_memory_init(); + uds_init_sysfs(); + + vdo_initialize_device_registry_once(); uds_log_info("loaded version %s", CURRENT_VERSION); - // Add VDO errors to the already existing set of errors in UDS. - result = register_vdo_status_codes(); + /* Add VDO errors to the already existing set of errors in UDS. 
*/ + result = vdo_register_status_codes(); if (result != UDS_SUCCESS) { - uds_log_error("register_vdo_status_codes failed %d", result); - vdo_destroy(); + uds_log_error("vdo_register_status_codes failed %d", result); + vdo_module_destroy(); return result; } result = dm_register_target(&vdo_target_bio); if (result < 0) { uds_log_error("dm_register_target failed %d", result); - vdo_destroy(); + vdo_module_destroy(); return result; } dm_registered = true; - vdo_module_status = VDO_MODULE_UNINITIALIZED; - - initialize_vdo_instance_number_tracking(); + vdo_initialize_instance_number_tracking(); - vdo_module_status = VDO_MODULE_READY; return result; } -/**********************************************************************/ static void __exit vdo_exit(void) { - vdo_destroy(); + vdo_module_destroy(); + /* + * UDS module level exit processing must be done after all VDO + * module exit processing is complete. + */ + uds_put_sysfs(); + uds_memory_exit(); } module_init(vdo_init); diff --git a/vdo/dmvdo.h b/vdo/dmvdo.h deleted file mode 100644 index 06c9c45c..00000000 --- a/vdo/dmvdo.h +++ /dev/null @@ -1,39 +0,0 @@ -/* - * Copyright Red Hat - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/sulfur/src/c++/vdo/kernel/dmvdo.h#3 $ - */ - -#ifndef DMVDO_H -#define DMVDO_H - -#include -#include -#include - -#include "kernelLayer.h" - -enum vdo_module_status { - VDO_MODULE_UNINITIALIZED = 0, - VDO_MODULE_READY, - VDO_MODULE_SHUTTING_DOWN, -}; - -extern enum vdo_module_status vdo_module_status; - -#endif /* DMVDO_H */ diff --git a/vdo/dump.c b/vdo/dump.c index c1698f0a..0b1041bd 100644 --- a/vdo/dump.c +++ b/vdo/dump.c @@ -1,88 +1,60 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * Copyright Red Hat - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. 
- * - * $Id: //eng/vdo-releases/sulfur/src/c++/vdo/kernel/dump.c#30 $ */ #include "dump.h" #include -#include "memoryAlloc.h" -#include "typeDefs.h" +#include "memory-alloc.h" +#include "type-defs.h" #include "constants.h" -#include "vdo.h" - -#include "dedupeIndex.h" -#include "histogram.h" -#include "ioSubmitter.h" +#include "data-vio.h" +#include "dedupe-index.h" +#include "io-submitter.h" +#include "kernel-types.h" #include "logger.h" -#include "vdoInit.h" +#include "types.h" +#include "vdo.h" enum dump_options { - // Work queues - SHOW_BIO_ACK_QUEUE, - SHOW_BIO_QUEUE, - SHOW_CPU_QUEUES, - SHOW_INDEX_QUEUE, - SHOW_REQUEST_QUEUE, - // Memory pools + /* Work queues */ + SHOW_QUEUES, + /* Memory pools */ SHOW_VIO_POOL, - // Others + /* Others */ SHOW_VDO_STATUS, - // This one means an option overrides the "default" choices, instead - // of altering them. + /* + * This one means an option overrides the "default" choices, instead + * of altering them. + */ SKIP_DEFAULT }; enum dump_option_flags { - // Work queues - FLAG_SHOW_BIO_ACK_QUEUE = (1 << SHOW_BIO_ACK_QUEUE), - FLAG_SHOW_BIO_QUEUE = (1 << SHOW_BIO_QUEUE), - FLAG_SHOW_CPU_QUEUES = (1 << SHOW_CPU_QUEUES), - FLAG_SHOW_INDEX_QUEUE = (1 << SHOW_INDEX_QUEUE), - FLAG_SHOW_REQUEST_QUEUE = (1 << SHOW_REQUEST_QUEUE), - // Memory pools + /* Work queues */ + FLAG_SHOW_QUEUES = (1 << SHOW_QUEUES), + /* Memory pools */ FLAG_SHOW_VIO_POOL = (1 << SHOW_VIO_POOL), - // Others + /* Others */ FLAG_SHOW_VDO_STATUS = (1 << SHOW_VDO_STATUS), - // Special + /* Special */ FLAG_SKIP_DEFAULT = (1 << SKIP_DEFAULT) }; enum { FLAGS_ALL_POOLS = (FLAG_SHOW_VIO_POOL), - FLAGS_ALL_QUEUES = (FLAG_SHOW_REQUEST_QUEUE | FLAG_SHOW_INDEX_QUEUE | - FLAG_SHOW_BIO_ACK_QUEUE | FLAG_SHOW_BIO_QUEUE | - FLAG_SHOW_CPU_QUEUES), - FLAGS_ALL_THREADS = (FLAGS_ALL_QUEUES), - DEFAULT_DUMP_FLAGS = (FLAGS_ALL_THREADS | FLAG_SHOW_VDO_STATUS) + DEFAULT_DUMP_FLAGS = (FLAG_SHOW_QUEUES | FLAG_SHOW_VDO_STATUS) }; -/**********************************************************************/ static inline bool is_arg_string(const char *arg, const char *this_option) { - // device-mapper convention seems to be case-independent options + /* convention seems to be case-independent options */ return strncasecmp(arg, this_option, strlen(this_option)) == 0; } -/**********************************************************************/ static void do_dump(struct vdo *vdo, unsigned int dump_options_requested, const char *why) @@ -90,52 +62,36 @@ static void do_dump(struct vdo *vdo, uint32_t active, maximum; int64_t outstanding; - uds_log_info("%s dump triggered via %s", THIS_MODULE->name, why); - // XXX Add in number of outstanding requests being processed by vdo - - active = READ_ONCE(vdo->request_limiter.active); - maximum = READ_ONCE(vdo->request_limiter.maximum); - + uds_log_info("%s dump triggered via %s", UDS_LOGGING_MODULE_NAME, why); + active = get_data_vio_pool_active_requests(vdo->data_vio_pool); + maximum = get_data_vio_pool_maximum_requests(vdo->data_vio_pool); outstanding = (atomic64_read(&vdo->stats.bios_submitted) - atomic64_read(&vdo->stats.bios_completed)); uds_log_info("%u device requests outstanding (max %u), %lld bio requests outstanding, device '%s'", active, maximum, outstanding, - get_vdo_device_name(vdo->device_config->owning_target)); - if ((dump_options_requested & FLAG_SHOW_REQUEST_QUEUE) != 0) { - dump_vdo_work_queue(vdo); - } - - if ((dump_options_requested & FLAG_SHOW_BIO_QUEUE) != 0) { - vdo_dump_bio_work_queue(vdo->io_submitter); - } + 
vdo_get_device_name(vdo->device_config->owning_target)); + if (((dump_options_requested & FLAG_SHOW_QUEUES) != 0) + && (vdo->threads != NULL)) { + thread_id_t id; - if (use_bio_ack_queue(vdo) && - ((dump_options_requested & FLAG_SHOW_BIO_ACK_QUEUE) != 0)) { - dump_work_queue(vdo->bio_ack_queue); - } - - if ((dump_options_requested & FLAG_SHOW_CPU_QUEUES) != 0) { - dump_work_queue(vdo->cpu_queue); + for (id = 0; id < vdo->thread_config->thread_count; id++) { + dump_work_queue(vdo->threads[id].queue); + } } - dump_vdo_dedupe_index(vdo->dedupe_index, - (dump_options_requested & FLAG_SHOW_INDEX_QUEUE) != - 0); - dump_buffer_pool(vdo->data_vio_pool, - (dump_options_requested & FLAG_SHOW_VIO_POOL) != 0); + vdo_dump_dedupe_index(vdo->dedupe_index); + dump_data_vio_pool(vdo->data_vio_pool, + (dump_options_requested & FLAG_SHOW_VIO_POOL) != 0); if ((dump_options_requested & FLAG_SHOW_VDO_STATUS) != 0) { - // Options should become more fine-grained when we have more to - // display here. - dump_vdo_status(vdo); + vdo_dump_status(vdo); } report_uds_memory_usage(); - uds_log_info("end of %s dump", THIS_MODULE->name); + uds_log_info("end of %s dump", UDS_LOGGING_MODULE_NAME); } -/**********************************************************************/ static int parse_dump_options(unsigned int argc, char *const *argv, unsigned int *dump_options_requested_ptr) @@ -146,29 +102,11 @@ static int parse_dump_options(unsigned int argc, const char *name; unsigned int flags; } option_names[] = { - { "bioack", FLAG_SKIP_DEFAULT | FLAG_SHOW_BIO_ACK_QUEUE }, - { "kvdobioackq", FLAG_SKIP_DEFAULT | FLAG_SHOW_BIO_ACK_QUEUE }, - { "bioackq", FLAG_SKIP_DEFAULT | FLAG_SHOW_BIO_ACK_QUEUE }, - { "bio", FLAG_SKIP_DEFAULT | FLAG_SHOW_BIO_QUEUE }, - { "kvdobioq", FLAG_SKIP_DEFAULT | FLAG_SHOW_BIO_QUEUE }, - { "bioq", FLAG_SKIP_DEFAULT | FLAG_SHOW_BIO_QUEUE }, - { "cpu", FLAG_SKIP_DEFAULT | FLAG_SHOW_CPU_QUEUES }, - { "kvdocpuq", FLAG_SKIP_DEFAULT | FLAG_SHOW_CPU_QUEUES }, - { "cpuq", FLAG_SKIP_DEFAULT | FLAG_SHOW_CPU_QUEUES }, - // Should "index" mean sending queue + receiving thread + - // outstanding? 
- { "dedupe", FLAG_SKIP_DEFAULT | FLAG_SHOW_INDEX_QUEUE }, - { "dedupeq", FLAG_SKIP_DEFAULT | FLAG_SHOW_INDEX_QUEUE }, - { "kvdodedupeq", FLAG_SKIP_DEFAULT | FLAG_SHOW_INDEX_QUEUE }, - { "request", FLAG_SKIP_DEFAULT | FLAG_SHOW_REQUEST_QUEUE }, - { "kvdoreqq", FLAG_SKIP_DEFAULT | FLAG_SHOW_REQUEST_QUEUE }, - { "reqq", FLAG_SKIP_DEFAULT | FLAG_SHOW_REQUEST_QUEUE }, { "viopool", FLAG_SKIP_DEFAULT | FLAG_SHOW_VIO_POOL }, { "vdo", FLAG_SKIP_DEFAULT | FLAG_SHOW_VDO_STATUS }, - { "pools", FLAG_SKIP_DEFAULT | FLAGS_ALL_POOLS }, - { "queues", FLAG_SKIP_DEFAULT | FLAGS_ALL_QUEUES }, - { "threads", FLAG_SKIP_DEFAULT | FLAGS_ALL_THREADS }, + { "queues", FLAG_SKIP_DEFAULT | FLAG_SHOW_QUEUES }, + { "threads", FLAG_SKIP_DEFAULT | FLAG_SHOW_QUEUES }, { "default", FLAG_SKIP_DEFAULT | DEFAULT_DUMP_FLAGS }, { "all", ~0 }, }; @@ -179,14 +117,14 @@ static int parse_dump_options(unsigned int argc, for (i = 1; i < argc; i++) { int j; - for (j = 0; j < COUNT_OF(option_names); j++) { + for (j = 0; j < ARRAY_SIZE(option_names); j++) { if (is_arg_string(argv[i], option_names[j].name)) { dump_options_requested |= option_names[j].flags; break; } } - if (j == COUNT_OF(option_names)) { + if (j == ARRAY_SIZE(option_names)) { uds_log_warning("dump option name '%s' unknown", argv[i]); options_okay = false; @@ -202,7 +140,9 @@ static int parse_dump_options(unsigned int argc, return 0; } -/**********************************************************************/ +/* + * Dump as specified by zero or more string arguments. + */ int vdo_dump(struct vdo *vdo, unsigned int argc, char *const *argv, @@ -219,8 +159,151 @@ int vdo_dump(struct vdo *vdo, return 0; } -/**********************************************************************/ +/* + * Dump everything we know how to dump + */ void vdo_dump_all(struct vdo *vdo, const char *why) { do_dump(vdo, ~0, why); } + +/* + * Dump out the data_vio waiters on a wait queue. + * wait_on should be the label to print for queue (e.g. logical or physical) + */ +static void dump_vio_waiters(struct wait_queue *queue, char *wait_on) +{ + struct waiter *waiter, *first = get_first_waiter(queue); + struct data_vio *data_vio; + + if (first == NULL) { + return; + } + + data_vio = waiter_as_data_vio(first); + + uds_log_info(" %s is locked. Waited on by: vio %px pbn %llu lbn %llu d-pbn %llu lastOp %s", + wait_on, data_vio, get_data_vio_allocation(data_vio), + data_vio->logical.lbn, data_vio->duplicate.pbn, + get_data_vio_operation_name(data_vio)); + + + for (waiter = first->next_waiter; waiter != first; + waiter = waiter->next_waiter) { + data_vio = waiter_as_data_vio(waiter); + uds_log_info(" ... and : vio %px pbn %llu lbn %llu d-pbn %llu lastOp %s", + data_vio, get_data_vio_allocation(data_vio), + data_vio->logical.lbn, data_vio->duplicate.pbn, + get_data_vio_operation_name(data_vio)); + } +} + +/* + * Encode various attributes of a data_vio as a string of one-character flags. + * This encoding is for logging brevity: + * + * R => vio completion result not VDO_SUCCESS + * W => vio is on a wait queue + * D => vio is a duplicate + * + * The common case of no flags set will result in an empty, null-terminated + * buffer. If any flags are encoded, the first character in the string will be + * a space character. 
+ */ +static void encode_vio_dump_flags(struct data_vio *data_vio, char buffer[8]) +{ + char *p_flag = buffer; + *p_flag++ = ' '; + if (data_vio_as_completion(data_vio)->result != VDO_SUCCESS) { + *p_flag++ = 'R'; + } + if (data_vio->waiter.next_waiter != NULL) { + *p_flag++ = 'W'; + } + if (data_vio->is_duplicate) { + *p_flag++ = 'D'; + } + if (p_flag == &buffer[1]) { + /* No flags, so remove the blank space. */ + p_flag = buffer; + } + *p_flag = '\0'; +} + +/* + * Implements buffer_dump_function. + */ +void dump_data_vio(void *data) +{ + struct data_vio *data_vio = (struct data_vio *) data; + + /* + * This just needs to be big enough to hold a queue (thread) name + * and a function name (plus a separator character and NUL). The + * latter is limited only by taste. + * + * In making this static, we're assuming only one "dump" will run at + * a time. If more than one does run, the log output will be garbled + * anyway. + */ + static char vio_completion_dump_buffer[100 + MAX_VDO_WORK_QUEUE_NAME_LEN]; + /* + * Another static buffer... + * log10(256) = 2.408+, round up: + */ + enum { DIGITS_PER_UINT64_T = (int) (1 + 2.41 * sizeof(uint64_t)) }; + static char vio_block_number_dump_buffer[sizeof("P L D") + + 3 * DIGITS_PER_UINT64_T]; + static char vio_flush_generation_buffer[sizeof(" FG") + + DIGITS_PER_UINT64_T] = ""; + static char flags_dump_buffer[8]; + + /* + * We're likely to be logging a couple thousand of these lines, and + * in some circumstances syslogd may have trouble keeping up, so + * keep it BRIEF rather than user-friendly. + */ + dump_completion_to_buffer(data_vio_as_completion(data_vio), + vio_completion_dump_buffer, + sizeof(vio_completion_dump_buffer)); + if (data_vio->is_duplicate) { + snprintf(vio_block_number_dump_buffer, + sizeof(vio_block_number_dump_buffer), + "P%llu L%llu D%llu", + get_data_vio_allocation(data_vio), + data_vio->logical.lbn, + data_vio->duplicate.pbn); + } else if (data_vio_has_allocation(data_vio)) { + snprintf(vio_block_number_dump_buffer, + sizeof(vio_block_number_dump_buffer), + "P%llu L%llu", + get_data_vio_allocation(data_vio), + data_vio->logical.lbn); + } else { + snprintf(vio_block_number_dump_buffer, + sizeof(vio_block_number_dump_buffer), "L%llu", + data_vio->logical.lbn); + } + + if (data_vio->flush_generation != 0) { + snprintf(vio_flush_generation_buffer, + sizeof(vio_flush_generation_buffer), " FG%llu", + data_vio->flush_generation); + } + + encode_vio_dump_flags(data_vio, flags_dump_buffer); + + uds_log_info(" vio %px %s%s %s %s%s", data_vio, + vio_block_number_dump_buffer, vio_flush_generation_buffer, + get_data_vio_operation_name(data_vio), + vio_completion_dump_buffer, + flags_dump_buffer); + /* + * might want info on: wantUDSAnswer / operation / status + * might want info on: bio / bios_merged + */ + + dump_vio_waiters(&data_vio->logical.waiters, "lbn"); + + /* might want to dump more info from vio here */ +} diff --git a/vdo/dump.h b/vdo/dump.h index aedbaf02..9816dff1 100644 --- a/vdo/dump.h +++ b/vdo/dump.h @@ -1,50 +1,20 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ /* * Copyright Red Hat - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. 
- * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/sulfur/src/c++/vdo/kernel/dump.h#9 $ */ #ifndef DUMP_H #define DUMP_H -#include "kernelLayer.h" +#include "kernel-types.h" -/** - * Dump internal state and/or statistics to the kernel log, as specified by - * zero or more string arguments. - * - * @param vdo The vdo - * @param argc Number of arguments - * @param argv The argument list - * @param why Reason for doing the dump - **/ int vdo_dump(struct vdo *vdo, unsigned int argc, char *const *argv, const char *why); -/** - * Dump lots of internal state and statistics to the kernel log. Identical to - * "dump all", without each caller needing to set up the argument list. - * - * @param vdo The vdo - * @param why Reason for doing the dump - **/ void vdo_dump_all(struct vdo *vdo, const char *why); -#endif // DUMP_H +void dump_data_vio(void *data); + +#endif /* DUMP_H */ diff --git a/uds/errors.c b/vdo/errors.c similarity index 60% rename from uds/errors.c rename to vdo/errors.c index 023f6762..a7b308fa 100644 --- a/uds/errors.c +++ b/vdo/errors.c @@ -1,29 +1,13 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * Copyright Red Hat - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. 
- * - * $Id: //eng/uds-releases/krusty/src/uds/errors.c#27 $ */ #include "errors.h" #include "common.h" #include "permassert.h" -#include "stringUtils.h" +#include "string-utils.h" #include @@ -78,17 +62,15 @@ static const struct error_info error_list[] = { { "UDS_QUEUED", "Request queued" }, { "UDS_BUFFER_ERROR", "Buffer error" }, { "UDS_NO_DIRECTORY", "Expected directory is missing" }, - { "UDS_CHECKPOINT_INCOMPLETE", "Checkpoint not completed" }, { "UDS_ALREADY_REGISTERED", "Error range already registered" }, { "UDS_BAD_IO_DIRECTION", "Bad I/O direction" }, { "UDS_INCORRECT_ALIGNMENT", "Offset not at block alignment" }, { "UDS_OUT_OF_RANGE", "Cannot access data outside specified limits" }, { "UDS_EMODULE_LOAD", "Could not load modules" }, { "UDS_DISABLED", "UDS library context is disabled" }, - { "UDS_CORRUPT_COMPONENT", "Corrupt saved component" }, { "UDS_UNKNOWN_ERROR", "Unknown error" }, { "UDS_UNSUPPORTED_VERSION", "Unsupported version" }, - { "UDS_CORRUPT_DATA", "Index data in memory is corrupt" }, + { "UDS_CORRUPT_DATA", "Some index structure is corrupt" }, { "UDS_SHORT_READ", "Could not read requested number of bytes" }, { "UDS_RESOURCE_LIMIT_EXCEEDED", "Internal resource limits exceeded" }, { "UDS_VOLUME_OVERFLOW", "Memory overflow due to storage failure" }, @@ -106,10 +88,10 @@ struct error_block { }; enum { - MAX_ERROR_BLOCKS = 6 // needed for testing + MAX_ERROR_BLOCKS = 6, }; -static struct error_information { +static struct { int allocated; int count; struct error_block blocks[MAX_ERROR_BLOCKS]; @@ -117,32 +99,25 @@ static struct error_information { .allocated = MAX_ERROR_BLOCKS, .count = 1, .blocks = { { - .name = "UDS Error", - .base = UDS_ERROR_CODE_BASE, - .last = UDS_ERROR_CODE_LAST, - .max = UDS_ERROR_CODE_BLOCK_END, - .infos = error_list, - } } + .name = "UDS Error", + .base = UDS_ERROR_CODE_BASE, + .last = UDS_ERROR_CODE_LAST, + .max = UDS_ERROR_CODE_BLOCK_END, + .infos = error_list, + } }, }; -/** - * Fetch the error info (if any) for the error number. - * - * @param errnum the error number - * @param info_ptr the place to store the info for this error (if known), - * otherwise set to NULL - * - * @return the name of the error block (if known), NULL othersise - **/ +/* + * Get the error info for an error number. Also returns the name of the error + * block, if known. + */ static const char *get_error_info(int errnum, const struct error_info **info_ptr) { struct error_block *block; if (errnum == UDS_SUCCESS) { - if (info_ptr != NULL) { - *info_ptr = &successful; - } + *info_ptr = &successful; return NULL; } @@ -150,39 +125,24 @@ static const char *get_error_info(int errnum, block < registered_errors.blocks + registered_errors.count; ++block) { if ((errnum >= block->base) && (errnum < block->last)) { - if (info_ptr != NULL) { - *info_ptr = - block->infos + (errnum - block->base); - } + *info_ptr = block->infos + (errnum - block->base); return block->name; } else if ((errnum >= block->last) && (errnum < block->max)) { - if (info_ptr != NULL) { - *info_ptr = NULL; - } + *info_ptr = NULL; return block->name; } } - if (info_ptr != NULL) { - *info_ptr = NULL; - } + return NULL; } -/** - * Return string describing a system error message - * - * @param errnum System error number - * @param buf Buffer that can be used to contain the return value - * @param buflen Length of the buffer - * - * @return The error string, which may be a string constant or may be - * returned in the buf argument - **/ +/* Return a string describing a system error message. 
*/ static const char *system_string_error(int errnum, char *buf, size_t buflen) { size_t len; const char *error_string = NULL; - if ((errnum > 0) && (errnum < COUNT_OF(message_table))) { + + if ((errnum > 0) && (errnum < ARRAY_SIZE(message_table))) { error_string = message_table[errnum]; } @@ -197,8 +157,8 @@ static const char *system_string_error(int errnum, char *buf, size_t buflen) return "System error"; } -/**********************************************************************/ -const char *string_error(int errnum, char *buf, size_t buflen) +/* Convert an error code to a descriptive string. */ +const char *uds_string_error(int errnum, char *buf, size_t buflen) { char *buffer = buf; char *buf_end = buf + buflen; @@ -214,7 +174,6 @@ const char *string_error(int errnum, char *buf, size_t buflen) } block_name = get_error_info(errnum, &info); - if (block_name != NULL) { if (info != NULL) { buffer = uds_append_to_buffer(buffer, @@ -230,13 +189,17 @@ const char *string_error(int errnum, char *buf, size_t buflen) errnum); } } else if (info != NULL) { - buffer = uds_append_to_buffer(buffer, buf_end, "%s", + buffer = uds_append_to_buffer(buffer, + buf_end, + "%s", info->message); } else { const char *tmp = system_string_error(errnum, buffer, buf_end - buffer); if (tmp != buffer) { - buffer = uds_append_to_buffer(buffer, buf_end, "%s", + buffer = uds_append_to_buffer(buffer, + buf_end, + "%s", tmp); } else { buffer += strlen(tmp); @@ -245,10 +208,9 @@ const char *string_error(int errnum, char *buf, size_t buflen) return buf; } -/**********************************************************************/ -const char *string_error_name(int errnum, char *buf, size_t buflen) +/* Convert an error code to its name. */ +const char *uds_string_error_name(int errnum, char *buf, size_t buflen) { - char *buffer = buf; char *buf_end = buf + buflen; const struct error_info *info = NULL; @@ -257,23 +219,34 @@ const char *string_error_name(int errnum, char *buf, size_t buflen) if (errnum < 0) { errnum = -errnum; } + block_name = get_error_info(errnum, &info); if (block_name != NULL) { if (info != NULL) { - buffer = uds_append_to_buffer(buffer, buf_end, "%s", + buffer = uds_append_to_buffer(buffer, + buf_end, + "%s", info->name); } else { - buffer = uds_append_to_buffer(buffer, buf_end, "%s %d", - block_name, errnum); + buffer = uds_append_to_buffer(buffer, + buf_end, + "%s %d", + block_name, + errnum); } } else if (info != NULL) { - buffer = uds_append_to_buffer(buffer, buf_end, "%s", + buffer = uds_append_to_buffer(buffer, + buf_end, + "%s", info->name); } else { - const char *tmp = - system_string_error(errnum, buffer, buf_end - buffer); + const char *tmp; + + tmp = system_string_error(errnum, buffer, buf_end - buffer); if (tmp != buffer) { - buffer = uds_append_to_buffer(buffer, buf_end, "%s", + buffer = uds_append_to_buffer(buffer, + buf_end, + "%s", tmp); } else { buffer += strlen(tmp); @@ -282,47 +255,53 @@ const char *string_error_name(int errnum, char *buf, size_t buflen) return buf; } -/**********************************************************************/ +/* + * Translate an error code into a value acceptable to the kernel. The input + * error code may be a system-generated value (such as -EIO), or an internal + * UDS status code. The result will be a negative errno value. 
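+ *
+ * For example (illustrative only): a value of -EIO is returned unchanged,
+ * UDS_NO_INDEX and UDS_CORRUPT_DATA both map to -ENOENT, and an unexpected
+ * internal code is logged and mapped to -EIO.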
+ */ int uds_map_to_system_error(int error) { - char error_name[80], error_message[ERRBUF_SIZE]; + char error_name[UDS_MAX_ERROR_NAME_SIZE]; + char error_message[UDS_MAX_ERROR_MESSAGE_SIZE]; - // 0 is success, negative a system error code + /* 0 is success, and negative values are already system error codes. */ if (likely(error <= 0)) { return error; } if (error < 1024) { - // probably an errno from userspace, just negate it. + /* This is probably an errno from userspace. */ return -error; } - // UDS error + /* Internal UDS errors */ switch (error) { case UDS_NO_INDEX: - case UDS_CORRUPT_COMPONENT: - // The index doesn't exist or can't be recovered. + case UDS_CORRUPT_DATA: + /* The index doesn't exist or can't be recovered. */ return -ENOENT; case UDS_INDEX_NOT_SAVED_CLEANLY: case UDS_UNSUPPORTED_VERSION: - // The index exists, but can't be loaded. Tell the client it - // exists so they don't destroy it inadvertently. + /* + * The index exists, but can't be loaded. Tell the client it + * exists so they don't destroy it inadvertently. + */ return -EEXIST; case UDS_DISABLED: - // The session is unusable; only returned by requests. + /* The session is unusable; only returned by requests. */ return -EIO; default: - // No other UDS error code is expected here, so log what we - // got and convert to something reasonable. + /* Translate an unexpected error into something generic. */ uds_log_info("%s: mapping status code %d (%s: %s) to -EIO", __func__, error, - string_error_name(error, - error_name, - sizeof(error_name)), + uds_string_error_name(error, + error_name, + sizeof(error_name)), uds_string_error(error, error_message, sizeof(error_message))); @@ -330,22 +309,39 @@ int uds_map_to_system_error(int error) } } -/**********************************************************************/ +/* + * Register a block of error codes. + * + * @param block_name the name of the block of error codes + * @param first_error the first error code in the block + * @param next_free_error one past the highest possible error in the block + * @param infos a pointer to the error info array for the block + * @param info_size the size of the error info array + */ int register_error_block(const char *block_name, int first_error, - int last_reserved_error, + int next_free_error, const struct error_info *infos, size_t info_size) { + int result; struct error_block *block; - int result = ASSERT(first_error < last_reserved_error, - "bad error block range"); + struct error_block new_block = { + .name = block_name, + .base = first_error, + .last = first_error + (info_size / sizeof(struct error_info)), + .max = next_free_error, + .infos = infos, + }; + + result = ASSERT(first_error < next_free_error, + "well-defined error block range"); if (result != UDS_SUCCESS) { return result; } if (registered_errors.count == registered_errors.allocated) { - // could reallocate and grow, but should never happen + /* This should never happen. */ return UDS_OVERFLOW; } @@ -355,21 +351,14 @@ int register_error_block(const char *block_name, if (strcmp(block_name, block->name) == 0) { return UDS_DUPLICATE_NAME; } - // check for overlap in error ranges + + /* Ensure error ranges do not overlap. 
*/ if ((first_error < block->max) && - (last_reserved_error > block->base)) { + (next_free_error > block->base)) { return UDS_ALREADY_REGISTERED; } } - registered_errors.blocks[registered_errors.count++] = - (struct error_block){ .name = block_name, - .base = first_error, - .last = first_error + - (info_size / - sizeof(struct error_info)), - .max = last_reserved_error, - .infos = infos }; - + registered_errors.blocks[registered_errors.count++] = new_block; return UDS_SUCCESS; } diff --git a/vdo/errors.h b/vdo/errors.h new file mode 100644 index 00000000..d3807cbc --- /dev/null +++ b/vdo/errors.h @@ -0,0 +1,97 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright Red Hat + */ + +#ifndef ERRORS_H +#define ERRORS_H + +#include "compiler.h" +#include "type-defs.h" + +/* Valid status codes for internal UDS functions. */ +enum uds_status_codes { + /* Successful return */ + UDS_SUCCESS = 0, + + /* Used as a base value for reporting internal errors */ + UDS_ERROR_CODE_BASE = 1024, + /* Index overflow */ + UDS_OVERFLOW = UDS_ERROR_CODE_BASE + 0, + /* Invalid argument passed to internal routine */ + UDS_INVALID_ARGUMENT = UDS_ERROR_CODE_BASE + 1, + /* UDS data structures are in an invalid state */ + UDS_BAD_STATE = UDS_ERROR_CODE_BASE + 2, + /* Attempt to enter the same name into an internal structure twice */ + UDS_DUPLICATE_NAME = UDS_ERROR_CODE_BASE + 3, + /* An internal protocol violation between system components */ + UDS_UNEXPECTED_RESULT = UDS_ERROR_CODE_BASE + 4, + /* An assertion failed */ + UDS_ASSERTION_FAILED = UDS_ERROR_CODE_BASE + 5, + /* A request has been queued for later processing (not an error) */ + UDS_QUEUED = UDS_ERROR_CODE_BASE + 6, + /* A problem has occured with a buffer */ + UDS_BUFFER_ERROR = UDS_ERROR_CODE_BASE + 7, + /* No directory was found where one was expected */ + UDS_NO_DIRECTORY = UDS_ERROR_CODE_BASE + 8, + /* This error range has already been registered */ + UDS_ALREADY_REGISTERED = UDS_ERROR_CODE_BASE + 9, + /* Either read-only or write-only */ + UDS_BAD_IO_DIRECTION = UDS_ERROR_CODE_BASE + 10, + /* Cannot do I/O at this offset */ + UDS_INCORRECT_ALIGNMENT = UDS_ERROR_CODE_BASE + 11, + /* Attempt to read or write data outside the valid range */ + UDS_OUT_OF_RANGE = UDS_ERROR_CODE_BASE + 12, + /* Could not load modules */ + UDS_EMODULE_LOAD = UDS_ERROR_CODE_BASE + 13, + /* The index session is disabled */ + UDS_DISABLED = UDS_ERROR_CODE_BASE + 14, + /* Unknown error */ + UDS_UNKNOWN_ERROR = UDS_ERROR_CODE_BASE + 15, + /* The index configuration or volume format is no longer supported */ + UDS_UNSUPPORTED_VERSION = UDS_ERROR_CODE_BASE + 16, + /* Some index structure is corrupt */ + UDS_CORRUPT_DATA = UDS_ERROR_CODE_BASE + 17, + /* Short read due to truncated file */ + UDS_SHORT_READ = UDS_ERROR_CODE_BASE + 18, + /* Internal resource limits exceeded */ + UDS_RESOURCE_LIMIT_EXCEEDED = UDS_ERROR_CODE_BASE + 19, + /* Memory overflow due to storage failure */ + UDS_VOLUME_OVERFLOW = UDS_ERROR_CODE_BASE + 20, + /* No index state found */ + UDS_NO_INDEX = UDS_ERROR_CODE_BASE + 21, + /* Premature end of file in scanned file */ + UDS_END_OF_FILE = UDS_ERROR_CODE_BASE + 22, + /* Attempt to access incomplete index save data */ + UDS_INDEX_NOT_SAVED_CLEANLY = UDS_ERROR_CODE_BASE + 23, + /* One more than the last UDS_INTERNAL error code */ + UDS_ERROR_CODE_LAST, + /* One more than the last error this block will ever use */ + UDS_ERROR_CODE_BLOCK_END = UDS_ERROR_CODE_BASE + 440, +}; + +enum { + UDS_MAX_ERROR_NAME_SIZE = 80, + UDS_MAX_ERROR_MESSAGE_SIZE = 128, +}; + 
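+/*
+ * Illustrative sketch only (not part of this header): registering a
+ * hypothetical block of error codes. MY_ERR_BASE, MY_ERR_BLOCK_END, and
+ * my_error_list are assumptions for the example; VDO's own codes are added
+ * by vdo_register_status_codes().
+ *
+ *	static const struct error_info my_error_list[] = {
+ *		{ "MY_ERR_FOO", "Example failure foo" },
+ *		{ "MY_ERR_BAR", "Example failure bar" },
+ *	};
+ *
+ *	int result = register_error_block("my errors",
+ *					  MY_ERR_BASE,
+ *					  MY_ERR_BLOCK_END,
+ *					  my_error_list,
+ *					  sizeof(my_error_list));
+ *
+ * Once registered, these codes can be rendered with uds_string_error() or
+ * uds_string_error_name() and converted with uds_map_to_system_error().
+ */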
+struct error_info { + const char *name; + const char *message; +}; + +const char * __must_check uds_string_error(int errnum, + char *buf, + size_t buflen); + +const char *uds_string_error_name(int errnum, char *buf, size_t buflen); + +int uds_map_to_system_error(int error); + +int register_error_block(const char *block_name, + int first_error, + int last_reserved_error, + const struct error_info *infos, + size_t info_size); + +#endif /* ERRORS_H */ diff --git a/vdo/event-count.c b/vdo/event-count.c new file mode 100644 index 00000000..67b03422 --- /dev/null +++ b/vdo/event-count.c @@ -0,0 +1,374 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright Red Hat + */ + +/** + * An event count is a lock-free equivalent of a condition variable. + * + * Using an event count, a lock-free producer/consumer can wait for a state + * change (adding an item to an empty queue, for example) without spinning or + * falling back on the use of mutex-based locks. Signalling is cheap when + * there are no waiters (a memory fence), and preparing to wait is also + * inexpensive (an atomic add instruction). + * + * A lock-free producer should call event_count_broadcast() after any mutation + * to the lock-free data structure that a consumer might be waiting on. The + * consumers should poll for work like this: + * + * for (;;) { + * // Fast path--no additional cost to consumer. + * if (lockfree_dequeue(&item)) { + * return item; + * } + * // Two-step wait: get current token and poll state, either cancelling + * // the wait or waiting for the token to be signalled. + * event_token_t token = event_count_prepare(event_count); + * if (lockfree_dequeue(&item)) { + * event_count_cancel(event_count, token); + * return item; + * } + * event_count_wait(event_count, token, NULL); + * // State has changed, but must check condition again, so loop. + * } + * + * Once event_count_prepare() is called, the caller should neither sleep nor + * perform long-running or blocking actions before passing the token to + * event_count_cancel() or event_count_wait(). The implementation is optimized + * for a short polling window, and will not perform well if there are + * outstanding tokens that have been signalled but not waited upon. + * + * This event count implementation uses a posix semaphore for portability, + * although a futex would be slightly superior to use and easy to substitute. + * It is designed to make signalling as cheap as possible, since that is the + * code path likely triggered on most updates to a lock-free data structure. + * Waiters are likely going to sleep, so optimizing for that case isn't + * necessary. + * + * The critical field is the state, which is really two fields that can be + * atomically updated in unison: an event counter and a waiter count. Every + * call to event_count_prepare() issues a wait token by atomically incrementing + * the waiter count. The key invariant is a strict accounting of the number of + * tokens issued. Every token returned by event_count_prepare() is a contract + * that the caller will call uds_acquire_semaphore() and a signaller will call + * uds_release_semaphore(), each exactly once. Atomic updates to the state + * field ensure that each token is counted once and that tokens are not lost. + * Cancelling a token attempts to take a fast-path by simply decrementing the + * waiters field, but if the token has already been claimed by a signaller, the + * canceller must still wait on the semaphore to consume the transferred token. 
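+ *
+ * For illustration only (not part of this file), the matching producer side,
+ * assuming a hypothetical lockfree_enqueue():
+ *
+ *   lockfree_enqueue(&queue, item);
+ *   // Broadcast after the mutation so that any consumer which prepared a
+ *   // wait token before seeing the new item is woken to re-check.
+ *   event_count_broadcast(event_count);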
+ * + * The state field is 64 bits, partitioned into a 16-bit waiter field and a + * 48-bit counter. We are unlikely to have 2^16 threads, much less 2^16 threads + * waiting on any single event transition. 2^48 microseconds is several years, + * so a token holder would have to wait that long for the counter to wrap + * around, and then call event_count_wait() at the exact right time to see the + * re-used counter, in order to lose a wakeup due to counter wrap-around. Using + * a 32-bit state field would greatly increase that chance, but if forced to do + * so, the implementation could likely tolerate it since callers are supposed + * to hold tokens for miniscule periods of time. Fortunately, x64 has 64-bit + * compare-and-swap, and the performance of interlocked 64-bit operations + * appears to be about the same as for 32-bit ones, so being paranoid and using + * 64 bits costs us nothing. + * + * Here are some sequences of calls and state transitions: + * + * action postcondition + * counter waiters semaphore + * initialized 0 0 0 + * prepare 0 1 0 + * wait (blocks) 0 1 0 + * signal 1 0 1 + * wait (unblocks) 1 0 0 + * + * signal (fast-path) 1 0 0 + * signal (fast-path) 1 0 0 + * + * prepare A 1 1 0 + * prepare B 1 2 0 + * signal 2 0 2 + * wait B (fast-path) 2 0 1 + * wait A (fast-path) 2 0 0 + * + * prepare 2 1 0 + * cancel (fast-path) 2 0 0 + * + * prepare 2 1 0 + * signal 3 0 1 + * cancel (must wait) 3 0 0 + * + * The event count structure is aligned, sized, and allocated to cache line + * boundaries to avoid any false sharing between the event count and other + * shared state. The state field and semaphore should fit on a single cache + * line. The instrumentation counters increase the size of the structure so it + * rounds up to use two (64-byte x86) cache lines. + **/ + +#include "event-count.h" + +#include + +#include "common.h" +#include "compiler.h" +#include "cpu.h" +#include "logger.h" +#include "memory-alloc.h" +#include "uds-threads.h" + +enum { + /* value used to increment the waiters field */ + ONE_WAITER = 1, + /* value used to increment the event counter */ + ONE_EVENT = (1 << 16), + /* bit mask to access the waiters field */ + WAITERS_MASK = (ONE_EVENT - 1), + /* bit mask to access the event counter */ + EVENTS_MASK = ~WAITERS_MASK, +}; + +struct event_count { + /* + * Atomically mutable state: + * low 16 bits: the number of wait tokens not posted to the semaphore + * high 48 bits: current event counter + */ + atomic64_t state; + + /* Semaphore used to block threads when waiting is required. */ + struct semaphore semaphore; + + /* Declare alignment so we don't share a cache line. */ +} __attribute__((aligned(CACHE_LINE_BYTES))); + +static INLINE bool same_event(event_token_t token1, event_token_t token2) +{ + return (token1 & EVENTS_MASK) == (token2 & EVENTS_MASK); +} + +/* Wake all threads that are waiting for the next event. */ +void event_count_broadcast(struct event_count *count) +{ + uint64_t waiters; + uint64_t state; + uint64_t old_state; + + /* Even if there are no waiters (yet), we will need a memory barrier. */ + smp_mb(); + + state = old_state = atomic64_read(&count->state); + do { + event_token_t new_state; + + /* + * Check if there are any tokens that have not yet been been + * transferred to the semaphore. This is the fast no-waiters + * path. + */ + waiters = (state & WAITERS_MASK); + if (waiters == 0) { + /* + * Fast path first time through -- no need to signal or + * post if there are no observers. 
+ */ + return; + } + + /* + * Attempt to atomically claim all the wait tokens and bump the + * event count using an atomic compare-and-swap. This operation + * contains a memory barrier. + */ + new_state = ((state & ~WAITERS_MASK) + ONE_EVENT); + old_state = state; + state = atomic64_cmpxchg(&count->state, old_state, new_state); + /* + * The cmpxchg fails when we lose a race with a new waiter or + * another signaller, so try again. + */ + } while (unlikely(state != old_state)); + + /* + * Wake the waiters by posting to the semaphore. This effectively + * transfers the wait tokens to the semaphore. There's sadly no bulk + * post for posix semaphores, so we've got to loop to do them all. + */ + while (waiters-- > 0) { + uds_release_semaphore(&count->semaphore); + } +} + +/* + * Attempt to cancel a prepared wait token by decrementing the number of + * waiters in the current state. This can only be done safely if the event + * count hasn't been incremented. Returns true if the wait was successfully + * cancelled. + */ +static INLINE bool fast_cancel(struct event_count *count, event_token_t token) +{ + event_token_t current_token = atomic64_read(&count->state); + event_token_t new_token; + + while (same_event(current_token, token)) { + /* + * Try to decrement the waiter count via compare-and-swap as if + * we had never prepared to wait. + */ + new_token = atomic64_cmpxchg(&count->state, + current_token, + current_token - 1); + if (new_token == current_token) { + return true; + } + + current_token = new_token; + } + + return false; +} + +/* + * Consume a token from the semaphore, waiting (with an optional timeout) if + * one is not currently available. Returns true if a token was consumed. + */ +static bool consume_wait_token(struct event_count *count, + const ktime_t *timeout) +{ + /* Try to grab a token without waiting. */ + if (uds_attempt_semaphore(&count->semaphore, 0)) { + return true; + } + + if (timeout == NULL) { + uds_acquire_semaphore(&count->semaphore); + } else if (!uds_attempt_semaphore(&count->semaphore, *timeout)) { + return false; + } + + return true; +} + +int make_event_count(struct event_count **count_ptr) +{ + /* + * The event count will be allocated on a cache line boundary so there + * will not be false sharing of the line with any other data structure. + */ + int result; + struct event_count *count = NULL; + + result = UDS_ALLOCATE(1, struct event_count, "event count", &count); + if (result != UDS_SUCCESS) { + return result; + } + + atomic64_set(&count->state, 0); + result = uds_initialize_semaphore(&count->semaphore, 0); + if (result != UDS_SUCCESS) { + UDS_FREE(count); + return result; + } + + *count_ptr = count; + return UDS_SUCCESS; +} + +/* Free a struct event_count. It must no longer be in use. */ +void free_event_count(struct event_count *count) +{ + if (count == NULL) { + return; + } + + uds_destroy_semaphore(&count->semaphore); + UDS_FREE(count); +} + +/* + * Prepare to wait for the event count to change by capturing a token of its + * current state. The caller MUST eventually either call event_count_wait() or + * event_count_cancel() exactly once for each token obtained. + */ +event_token_t event_count_prepare(struct event_count *count) +{ + return atomic64_add_return(ONE_WAITER, &count->state); +} + +/* + * Cancel a wait token that has been prepared but not waited upon. This must + * be called after event_count_prepare() when event_count_wait() is not going to + * be invoked on the token. 
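+ *
+ * As a rough usage sketch only (found_work() is a hypothetical caller-side
+ * predicate, not something defined in this file), the prepare/cancel/wait
+ * contract looks like:
+ *
+ *     event_token_t token = event_count_prepare(count);
+ *     if (found_work()) {
+ *             event_count_cancel(count, token);
+ *     } else {
+ *             event_count_wait(count, token, NULL);
+ *     }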
+ */
+void event_count_cancel(struct event_count *count, event_token_t token)
+{
+	/* Decrement the waiter count if the event hasn't been signalled. */
+	if (fast_cancel(count, token)) {
+		return;
+	}
+
+	/*
+	 * A signaller has already transferred (or promised to transfer) our
+	 * token to the semaphore, so we must consume it from the semaphore by
+	 * waiting.
+	 */
+	event_count_wait(count, token, NULL);
+}
+
+/*
+ * Check if the current event count state corresponds to the provided token,
+ * and if it does, wait for a signal that the state has changed. If a timeout
+ * is provided, the wait will terminate after the timeout has elapsed. Timing
+ * out automatically cancels the wait token, so callers must not attempt to
+ * cancel the token in this case. The timeout is measured in nanoseconds. This
+ * function returns true if the state changed, or false if it timed out.
+ */
+bool event_count_wait(struct event_count *count,
+		      event_token_t token,
+		      const ktime_t *timeout)
+{
+	for (;;) {
+		/*
+		 * Wait for a signaller to transfer our wait token to the
+		 * semaphore.
+		 */
+		if (!consume_wait_token(count, timeout)) {
+			/*
+			 * The wait timed out, so we must cancel the token
+			 * instead. Try to decrement the waiter count if the
+			 * event hasn't been signalled.
+			 */
+			if (fast_cancel(count, token)) {
+				return false;
+			}
+			/*
+			 * We timed out, but a signaller came in before we
+			 * could cancel the wait. We have no choice but to wait
+			 * for the semaphore to be posted. Since the signaller
+			 * has promised to do it, the wait should be short. The
+			 * timeout and the signal happened at about the same
+			 * time, so either outcome could be returned. It's
+			 * simpler to ignore the timeout.
+			 */
+			timeout = NULL;
+			continue;
+		}
+
+		/* A wait token has now been consumed from the semaphore. */
+
+		/*
+		 * Stop waiting if the count has changed since the token was
+		 * acquired.
+		 */
+		if (!same_event(token, atomic64_read(&count->state))) {
+			return true;
+		}
+
+		/*
+		 * We consumed someone else's wait token. Put it back in the
+		 * semaphore, which will wake another waiter, hopefully one who
+		 * can stop waiting.
+		 */
+		uds_release_semaphore(&count->semaphore);
+
+		/* Attempt to give an earlier waiter a shot at the semaphore.
*/ + uds_yield_scheduler(); + } +} diff --git a/vdo/event-count.h b/vdo/event-count.h new file mode 100644 index 00000000..0749cefe --- /dev/null +++ b/vdo/event-count.h @@ -0,0 +1,30 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright Red Hat + */ + +#ifndef EVENT_COUNT_H +#define EVENT_COUNT_H + +#include "time-utils.h" +#include "type-defs.h" + +struct event_count; + +typedef unsigned int event_token_t; + +int __must_check make_event_count(struct event_count **count_ptr); + +void free_event_count(struct event_count *count); + +void event_count_broadcast(struct event_count *count); + +event_token_t __must_check event_count_prepare(struct event_count *count); + +void event_count_cancel(struct event_count *count, event_token_t token); + +bool event_count_wait(struct event_count *count, + event_token_t token, + const ktime_t *timeout); + +#endif /* EVENT_COUNT_H */ diff --git a/vdo/extent.c b/vdo/extent.c deleted file mode 100644 index c578c470..00000000 --- a/vdo/extent.c +++ /dev/null @@ -1,153 +0,0 @@ -/* - * Copyright Red Hat - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/sulfur/src/c++/vdo/base/extent.c#15 $ - */ - -#include "extent.h" - -#include "memoryAlloc.h" -#include "permassert.h" - -#include "completion.h" -#include "constants.h" -#include "logger.h" -#include "types.h" -#include "vdo.h" -#include "vioRead.h" -#include "vioWrite.h" - -/**********************************************************************/ -int create_vdo_extent(struct vdo *vdo, - enum vio_type vio_type, - enum vio_priority priority, - block_count_t block_count, - char *data, - struct vdo_extent **extent_ptr) -{ - struct vdo_extent *extent; - int result = ASSERT(is_vdo_metadata_vio_type(vio_type), - "create_vdo_extent() called for metadata"); - if (result != VDO_SUCCESS) { - return result; - } - - result = UDS_ALLOCATE_EXTENDED(struct vdo_extent, block_count, - struct vio *, __func__, &extent); - if (result != VDO_SUCCESS) { - return result; - } - - initialize_vdo_completion(&extent->completion, vdo, - VDO_EXTENT_COMPLETION); - - for (; extent->count < block_count; extent->count++) { - result = create_metadata_vio(vdo, - vio_type, - priority, - &extent->completion, - data, - &extent->vios[extent->count]); - if (result != VDO_SUCCESS) { - free_vdo_extent(UDS_FORGET(extent)); - return result; - } - - data += VDO_BLOCK_SIZE; - } - - *extent_ptr = extent; - return VDO_SUCCESS; -} - -/**********************************************************************/ -void free_vdo_extent(struct vdo_extent *extent) -{ - block_count_t i; - if (extent == NULL) { - return; - } - - for (i = 0; i < extent->count; i++) { - free_vio(UDS_FORGET(extent->vios[i])); - } - - UDS_FREE(UDS_FORGET(extent)); -} - -/** - * Launch a metadata extent. 
- * - * @param extent The extent - * @param start_block The absolute physical block at which the extent should - * begin its I/O - * @param count The number of blocks to write - * @param operation The operation to perform on the extent - **/ -static void launch_metadata_extent(struct vdo_extent *extent, - physical_block_number_t start_block, - block_count_t count, - enum vio_operation operation) -{ - block_count_t i; - - reset_vdo_completion(&extent->completion); - if (count > extent->count) { - finish_vdo_completion(&extent->completion, VDO_OUT_OF_RANGE); - return; - } - - extent->complete_count = extent->count - count; - for (i = 0; i < count; i++) { - struct vio *vio = extent->vios[i]; - vio->completion.callback_thread_id = - extent->completion.callback_thread_id; - launch_metadata_vio(vio, start_block++, handle_vio_completion, - handle_vio_completion, operation); - } -} - -/**********************************************************************/ -void read_partial_vdo_metadata_extent(struct vdo_extent *extent, - physical_block_number_t start_block, - block_count_t count) -{ - launch_metadata_extent(extent, start_block, count, VIO_READ); -} - -/**********************************************************************/ -void write_partial_vdo_metadata_extent(struct vdo_extent *extent, - physical_block_number_t start_block, - block_count_t count) -{ - launch_metadata_extent(extent, start_block, count, VIO_WRITE); -} - -/**********************************************************************/ -void handle_vio_completion(struct vdo_completion *completion) -{ - struct vdo_extent *extent = vdo_completion_as_extent(completion->parent); - if (++extent->complete_count != extent->count) { - set_vdo_completion_result(vdo_extent_as_completion(extent), - completion->result); - return; - } - - finish_vdo_completion(vdo_extent_as_completion(extent), - completion->result); -} diff --git a/vdo/extent.h b/vdo/extent.h deleted file mode 100644 index d9dd09e0..00000000 --- a/vdo/extent.h +++ /dev/null @@ -1,160 +0,0 @@ -/* - * Copyright Red Hat - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/sulfur/src/c++/vdo/base/extent.h#9 $ - */ - -#ifndef EXTENT_H -#define EXTENT_H - -#include "completion.h" -#include "types.h" -#include "vio.h" - -/** - * A chain of vios which are part of the same request. An extent contains - * a chain of at least 'count' vios. The 'next' pointer of the last vio - * in the extent (as indicated by the count) may not be NULL, but it is not - * part of the extent. A vio may belong to a single extent. 
- **/ -struct vdo_extent { - // The completion for asynchronous extent processing - struct vdo_completion completion; - // The number of vios in the extent - block_count_t count; - // The number of completed vios in the extent - block_count_t complete_count; - // The vios in the extent - struct vio *vios[]; -}; - -/** - * Convert a generic vdo_completion to a vdo_extent. - * - * @param completion The completion to convert - * - * @return The completion as an extent - **/ -static inline struct vdo_extent * -vdo_completion_as_extent(struct vdo_completion *completion) -{ - assert_vdo_completion_type(completion->type, VDO_EXTENT_COMPLETION); - return container_of(completion, struct vdo_extent, completion); -} - -/** - * Convert a vdo_extent to a vdo_completion. - * - * @param extent The extent to convert - * - * @return The extent as a vdo_completion - **/ -static inline struct vdo_completion * -vdo_extent_as_completion(struct vdo_extent *extent) -{ - return &extent->completion; -} - -/** - * Create vdo_extent. - * - * @param [in] vdo The VDO - * @param [in] vio_type The usage type to assign to the vios in the extent - * (data / block map / journal) - * @param [in] priority The relative priority to assign to the vios - * @param [in] block_count The number of blocks in the buffer - * @param [in] data The buffer - * @param [out] extent_ptr A pointer to hold the new extent - * - * @return VDO_SUCCESS or an error - **/ -int __must_check create_vdo_extent(struct vdo *vdo, - enum vio_type vio_type, - enum vio_priority priority, - block_count_t block_count, - char *data, - struct vdo_extent **extent_ptr); - -/** - * Free an extent. - * - * @param extent The extent to free - **/ -void free_vdo_extent(struct vdo_extent *extent); - -/** - * Read metadata from the underlying storage. - * - * @param extent The extent to read - * @param start_block The physical block number of the first block - * in the extent - * @param count The number of blocks to read (must be less than or - * equal to the length of the extent) - **/ -void read_partial_vdo_metadata_extent(struct vdo_extent *extent, - physical_block_number_t start_block, - block_count_t count); - -/** - * Read metadata from the underlying storage. - * - * @param extent The extent to read - * @param start_block The physical block number of the first block - * in the extent - **/ -static inline void read_vdo_metadata_extent(struct vdo_extent *extent, - physical_block_number_t start_block) -{ - read_partial_vdo_metadata_extent(extent, start_block, extent->count); -} - -/** - * Write metadata to the underlying storage. - * - * @param extent The extent to write - * @param start_block The physical block number of the first block in the - * extent - * @param count The number of blocks to read (must be less than or - * equal to the length of the extent) - **/ -void write_partial_vdo_metadata_extent(struct vdo_extent *extent, - physical_block_number_t start_block, - block_count_t count); -/** - * Write metadata to the underlying storage. - * - * @param extent The extent to write - * @param start_block The physical block number of the first block in the - * extent - **/ -static inline void write_vdo_metadata_extent(struct vdo_extent *extent, - physical_block_number_t start_block) -{ - write_partial_vdo_metadata_extent(extent, start_block, extent->count); -} - -/** - * Notify an extent that one of its vios has completed. If the signaling vio - * is the last of the extent's vios to complete, the extent will finish. 
This - * function is set as the vio callback in launch_metadata_extent(). - * - * @param completion The completion of the vio which has just finished - **/ -void handle_vio_completion(struct vdo_completion *completion); - -#endif /* EXTENT_H */ diff --git a/vdo/fixedLayout.c b/vdo/fixedLayout.c deleted file mode 100644 index b3047166..00000000 --- a/vdo/fixedLayout.c +++ /dev/null @@ -1,634 +0,0 @@ -/* - * Copyright Red Hat - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/sulfur/src/c++/vdo/base/fixedLayout.c#18 $ - */ - -#include "fixedLayout.h" - -#include "buffer.h" -#include "logger.h" -#include "memoryAlloc.h" -#include "permassert.h" - -#include "header.h" -#include "statusCodes.h" - -const block_count_t VDO_ALL_FREE_BLOCKS = (uint64_t) -1; - -struct fixed_layout { - physical_block_number_t first_free; - physical_block_number_t last_free; - size_t num_partitions; - struct partition *head; -}; - -struct partition { - enum partition_id id; // The id of this partition - struct fixed_layout *layout; // The layout to which this partition - // belongs - physical_block_number_t offset; // The offset into the layout of this - // partition - physical_block_number_t base; // The untranslated number of the first block - block_count_t count; // The number of blocks in the partition - struct partition *next; // A pointer to the next partition in the layout -}; - -struct layout_3_0 { - physical_block_number_t first_free; - physical_block_number_t last_free; - byte partition_count; -} __packed; - -struct partition_3_0 { - enum partition_id id; - physical_block_number_t offset; - physical_block_number_t base; - block_count_t count; -} __packed; - -static const struct header LAYOUT_HEADER_3_0 = { - .id = VDO_FIXED_LAYOUT, - .version = { - .major_version = 3, - .minor_version = 0, - }, - .size = sizeof(struct layout_3_0), // Minimum size - // (contains no partitions) -}; - -/**********************************************************************/ -int make_vdo_fixed_layout(block_count_t total_blocks, - physical_block_number_t start_offset, - struct fixed_layout **layout_ptr) -{ - struct fixed_layout *layout; - int result = UDS_ALLOCATE(1, struct fixed_layout, "fixed layout", &layout); - if (result != UDS_SUCCESS) { - return result; - } - - layout->first_free = start_offset; - layout->last_free = start_offset + total_blocks; - layout->num_partitions = 0; - layout->head = NULL; - - *layout_ptr = layout; - return VDO_SUCCESS; -} - -/**********************************************************************/ -void free_vdo_fixed_layout(struct fixed_layout *layout) -{ - if (layout == NULL) { - return; - } - - while (layout->head != NULL) { - struct partition *part = layout->head; - layout->head = part->next; - UDS_FREE(part); - } - - UDS_FREE(layout); -} - 
-/**********************************************************************/ -block_count_t get_total_vdo_fixed_layout_size(const struct fixed_layout *layout) -{ - block_count_t size = get_vdo_fixed_layout_blocks_available(layout); - struct partition *partition; - for (partition = layout->head; partition != NULL; - partition = partition->next) { - size += partition->count; - } - - return size; -} - -/**********************************************************************/ -int vdo_get_partition(struct fixed_layout *layout, - enum partition_id id, - struct partition **partition_ptr) -{ - struct partition *partition; - for (partition = layout->head; partition != NULL; - partition = partition->next) { - if (partition->id == id) { - if (partition_ptr != NULL) { - *partition_ptr = partition; - } - return VDO_SUCCESS; - } - } - - return VDO_UNKNOWN_PARTITION; -} - -/**********************************************************************/ -int vdo_translate_to_pbn(const struct partition *partition, - physical_block_number_t partition_block_number, - physical_block_number_t *layer_block_number) -{ - physical_block_number_t offset_from_base; - if (partition == NULL) { - *layer_block_number = partition_block_number; - return VDO_SUCCESS; - } - - if (partition_block_number < partition->base) { - return VDO_OUT_OF_RANGE; - } - - offset_from_base = partition_block_number - partition->base; - if (offset_from_base >= partition->count) { - return VDO_OUT_OF_RANGE; - } - - *layer_block_number = partition->offset + offset_from_base; - return VDO_SUCCESS; -} - -/**********************************************************************/ -int vdo_translate_from_pbn(const struct partition *partition, - physical_block_number_t layer_block_number, - physical_block_number_t *partition_block_number_ptr) -{ - physical_block_number_t partition_block_number; - - if (partition == NULL) { - *partition_block_number_ptr = layer_block_number; - return VDO_SUCCESS; - } - - if (layer_block_number < partition->offset) { - return VDO_OUT_OF_RANGE; - } - - partition_block_number = layer_block_number - partition->offset; - if (partition_block_number >= partition->count) { - return VDO_OUT_OF_RANGE; - } - - *partition_block_number_ptr = partition_block_number + partition->base; - return VDO_SUCCESS; -} - -/**********************************************************************/ -block_count_t -get_vdo_fixed_layout_blocks_available(const struct fixed_layout *layout) -{ - return layout->last_free - layout->first_free; -} - -/** - * Allocate a partition. The partition will be attached to the partition - * list in the layout. 
- * - * @param layout The layout containing the partition - * @param id The id of the partition - * @param offset The offset into the layout at which the partition begins - * @param base The number of the first block for users of the partition - * @param block_count The number of blocks in the partition - * - * @return VDO_SUCCESS or an error - **/ -static int allocate_partition(struct fixed_layout *layout, - byte id, - physical_block_number_t offset, - physical_block_number_t base, - block_count_t block_count) -{ - struct partition *partition; - int result = UDS_ALLOCATE(1, struct partition, - "fixed layout partition", &partition); - if (result != UDS_SUCCESS) { - return result; - } - - partition->id = id; - partition->layout = layout; - partition->offset = offset; - partition->base = base; - partition->count = block_count; - partition->next = layout->head; - layout->head = partition; - - return VDO_SUCCESS; -} - -/**********************************************************************/ -int make_vdo_fixed_layout_partition(struct fixed_layout *layout, - enum partition_id id, - block_count_t block_count, - enum partition_direction direction, - physical_block_number_t base) -{ - int result; - physical_block_number_t offset; - - block_count_t free_blocks = layout->last_free - layout->first_free; - if (block_count == VDO_ALL_FREE_BLOCKS) { - if (free_blocks == 0) { - return VDO_NO_SPACE; - } else { - block_count = free_blocks; - } - } else if (block_count > free_blocks) { - return VDO_NO_SPACE; - } - - result = vdo_get_partition(layout, id, NULL); - if (result != VDO_UNKNOWN_PARTITION) { - return VDO_PARTITION_EXISTS; - } - - offset = ((direction == VDO_PARTITION_FROM_END) ? - (layout->last_free - block_count) : layout->first_free); - result = allocate_partition(layout, id, offset, base, block_count); - if (result != VDO_SUCCESS) { - return result; - } - - layout->num_partitions++; - if (direction == VDO_PARTITION_FROM_END) { - layout->last_free = layout->last_free - block_count; - } else { - layout->first_free += block_count; - } - - return VDO_SUCCESS; -} - -/**********************************************************************/ -block_count_t -get_vdo_fixed_layout_partition_size(const struct partition *partition) -{ - return partition->count; -} - -/**********************************************************************/ -physical_block_number_t -get_vdo_fixed_layout_partition_offset(const struct partition *partition) -{ - return partition->offset; -} - -/**********************************************************************/ -physical_block_number_t -get_vdo_fixed_layout_partition_base(const struct partition *partition) -{ - return partition->base; -} - -/**********************************************************************/ -static inline size_t get_encoded_size(const struct fixed_layout *layout) -{ - return sizeof(struct layout_3_0) + - (sizeof(struct partition_3_0) * layout->num_partitions); -} - -/**********************************************************************/ -size_t get_vdo_fixed_layout_encoded_size(const struct fixed_layout *layout) -{ - return VDO_ENCODED_HEADER_SIZE + get_encoded_size(layout); -} - -/** - * Encode a null-terminated list of fixed layout partitions into a buffer - * using partition format 3.0. 
- * - * @param layout The layout containing the list of partitions to encode - * @param buffer A buffer positioned at the start of the encoding - * - * @return UDS_SUCCESS or an error code - **/ -static int encode_partitions_3_0(const struct fixed_layout *layout, - struct buffer *buffer) -{ - const struct partition *partition; - for (partition = layout->head; - partition != NULL; - partition = partition->next) { - int result; - STATIC_ASSERT_SIZEOF(enum partition_id, sizeof(byte)); - result = put_byte(buffer, partition->id); - if (result != UDS_SUCCESS) { - return result; - } - - result = put_uint64_le_into_buffer(buffer, partition->offset); - if (result != UDS_SUCCESS) { - return result; - } - - result = put_uint64_le_into_buffer(buffer, partition->base); - if (result != UDS_SUCCESS) { - return result; - } - - result = put_uint64_le_into_buffer(buffer, partition->count); - if (result != UDS_SUCCESS) { - return result; - } - } - - return UDS_SUCCESS; -} - -/** - * Encode the header fields of a fixed layout into a buffer using layout - * format 3.0. - * - * @param layout The layout to encode - * @param buffer A buffer positioned at the start of the encoding - * - * @return UDS_SUCCESS or an error code - **/ -static int encode_layout_3_0(const struct fixed_layout *layout, - struct buffer *buffer) -{ - int result = ASSERT(layout->num_partitions <= UINT8_MAX, - "fixed layout partition count must fit in a byte"); - if (result != UDS_SUCCESS) { - return result; - } - - result = put_uint64_le_into_buffer(buffer, layout->first_free); - if (result != UDS_SUCCESS) { - return result; - } - - result = put_uint64_le_into_buffer(buffer, layout->last_free); - if (result != UDS_SUCCESS) { - return result; - } - - return put_byte(buffer, layout->num_partitions); -} - -/**********************************************************************/ -int encode_vdo_fixed_layout(const struct fixed_layout *layout, - struct buffer *buffer) -{ - size_t initial_length, encoded_size; - int result; - - struct header header = LAYOUT_HEADER_3_0; - - if (!ensure_available_space(buffer, - get_vdo_fixed_layout_encoded_size(layout))) { - return UDS_BUFFER_ERROR; - } - - header.size = get_encoded_size(layout); - result = encode_vdo_header(&header, buffer); - if (result != UDS_SUCCESS) { - return result; - } - - initial_length = content_length(buffer); - - result = encode_layout_3_0(layout, buffer); - if (result != UDS_SUCCESS) { - return result; - } - - encoded_size = content_length(buffer) - initial_length; - result = ASSERT(encoded_size == sizeof(struct layout_3_0), - "encoded size of fixed layout header must match structure"); - if (result != UDS_SUCCESS) { - return result; - } - - result = encode_partitions_3_0(layout, buffer); - if (result != UDS_SUCCESS) { - return result; - } - - encoded_size = content_length(buffer) - initial_length; - return ASSERT(encoded_size == header.size, - "encoded size of fixed layout must match header size"); -} - -/** - * Decode a sequence of fixed layout partitions from a buffer - * using partition format 3.0. 
- * - * @param buffer A buffer positioned at the start of the encoding - * @param layout The layout in which to allocate the decoded partitions - * - * @return UDS_SUCCESS or an error code - **/ -static int decode_partitions_3_0(struct buffer *buffer, - struct fixed_layout *layout) -{ - size_t i; - for (i = 0; i < layout->num_partitions; i++) { - byte id; - uint64_t offset, base, count; - int result = get_byte(buffer, &id); - if (result != UDS_SUCCESS) { - return result; - } - - result = get_uint64_le_from_buffer(buffer, &offset); - if (result != UDS_SUCCESS) { - return result; - } - - result = get_uint64_le_from_buffer(buffer, &base); - if (result != UDS_SUCCESS) { - return result; - } - - result = get_uint64_le_from_buffer(buffer, &count); - if (result != UDS_SUCCESS) { - return result; - } - - result = allocate_partition(layout, id, offset, base, count); - if (result != VDO_SUCCESS) { - return result; - } - } - - return UDS_SUCCESS; -} - -/** - * Decode the header fields of a fixed layout from a buffer using layout - * format 3.0. - * - * @param buffer A buffer positioned at the start of the encoding - * @param layout The structure to receive the decoded fields - * - * @return UDS_SUCCESS or an error code - **/ -static int decode_layout_3_0(struct buffer *buffer, struct layout_3_0 *layout) -{ - size_t decoded_size, initial_length = content_length(buffer); - physical_block_number_t first_free, last_free; - byte partition_count; - - int result = get_uint64_le_from_buffer(buffer, &first_free); - if (result != UDS_SUCCESS) { - return result; - } - - result = get_uint64_le_from_buffer(buffer, &last_free); - if (result != UDS_SUCCESS) { - return result; - } - - result = get_byte(buffer, &partition_count); - if (result != UDS_SUCCESS) { - return result; - } - - *layout = (struct layout_3_0) { - .first_free = first_free, - .last_free = last_free, - .partition_count = partition_count, - }; - - decoded_size = initial_length - content_length(buffer); - return ASSERT(decoded_size == sizeof(struct layout_3_0), - "decoded size of fixed layout header must match structure"); -} - -/**********************************************************************/ -int decode_vdo_fixed_layout(struct buffer *buffer, - struct fixed_layout **layout_ptr) -{ - struct header header; - struct layout_3_0 layout_header; - struct fixed_layout *layout; - - int result = decode_vdo_header(buffer, &header); - if (result != UDS_SUCCESS) { - return result; - } - - // Layout is variable size, so only do a minimum size check here. 
- result = validate_vdo_header(&LAYOUT_HEADER_3_0, &header, false, __func__); - if (result != VDO_SUCCESS) { - return result; - } - - result = decode_layout_3_0(buffer, &layout_header); - if (result != UDS_SUCCESS) { - return result; - } - - if (content_length(buffer) < - (sizeof(struct partition_3_0) * layout_header.partition_count)) { - return VDO_UNSUPPORTED_VERSION; - } - - result = UDS_ALLOCATE(1, struct fixed_layout, "fixed layout", &layout); - if (result != UDS_SUCCESS) { - return result; - } - - layout->first_free = layout_header.first_free; - layout->last_free = layout_header.last_free; - layout->num_partitions = layout_header.partition_count; - - result = decode_partitions_3_0(buffer, layout); - if (result != VDO_SUCCESS) { - free_vdo_fixed_layout(layout); - return result; - } - - *layout_ptr = layout; - return VDO_SUCCESS; -} - -/**********************************************************************/ -int make_partitioned_vdo_fixed_layout(block_count_t physical_blocks, - physical_block_number_t starting_offset, - block_count_t block_map_blocks, - block_count_t journal_blocks, - block_count_t summary_blocks, - struct fixed_layout **layout_ptr) -{ - struct fixed_layout *layout; - int result; - - block_count_t necessary_size = (starting_offset + block_map_blocks + - journal_blocks + summary_blocks); - if (necessary_size > physical_blocks) { - return uds_log_error_strerror(VDO_NO_SPACE, - "Not enough space to make a VDO"); - } - - result = make_vdo_fixed_layout(physical_blocks - starting_offset, - starting_offset, - &layout); - if (result != VDO_SUCCESS) { - return result; - } - - result = make_vdo_fixed_layout_partition(layout, - BLOCK_MAP_PARTITION, - block_map_blocks, - VDO_PARTITION_FROM_BEGINNING, - 0); - if (result != VDO_SUCCESS) { - free_vdo_fixed_layout(layout); - return result; - } - - result = make_vdo_fixed_layout_partition(layout, SLAB_SUMMARY_PARTITION, - summary_blocks, - VDO_PARTITION_FROM_END, 0); - if (result != VDO_SUCCESS) { - free_vdo_fixed_layout(layout); - return result; - } - - result = make_vdo_fixed_layout_partition(layout, - RECOVERY_JOURNAL_PARTITION, - journal_blocks, - VDO_PARTITION_FROM_END, 0); - if (result != VDO_SUCCESS) { - free_vdo_fixed_layout(layout); - return result; - } - - /* - * The block allocator no longer traffics in relative PBNs so the offset - * doesn't matter. We need to keep this partition around both for - * upgraded systems, and because we decided that all of the usable space - * in the volume, other than the super block, should be part of some - * partition. - */ - result = make_vdo_fixed_layout_partition(layout, - BLOCK_ALLOCATOR_PARTITION, - VDO_ALL_FREE_BLOCKS, - VDO_PARTITION_FROM_BEGINNING, - block_map_blocks); - if (result != VDO_SUCCESS) { - free_vdo_fixed_layout(layout); - return result; - } - - *layout_ptr = layout; - return VDO_SUCCESS; -} diff --git a/vdo/fixedLayout.h b/vdo/fixedLayout.h deleted file mode 100644 index 9d45e02f..00000000 --- a/vdo/fixedLayout.h +++ /dev/null @@ -1,235 +0,0 @@ -/* - * Copyright Red Hat - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/sulfur/src/c++/vdo/base/fixedLayout.h#11 $ - */ - -#ifndef FIXED_LAYOUT_H -#define FIXED_LAYOUT_H - -#include "buffer.h" - -#include "types.h" - -enum partition_direction { - VDO_PARTITION_FROM_BEGINNING, - VDO_PARTITION_FROM_END, -}; - -extern const block_count_t VDO_ALL_FREE_BLOCKS; - -/** - * A fixed layout is like a traditional disk partitioning scheme. In the - * beginning there is one large unused area, of which parts are carved off. - * Each carved off section has its own internal offset and size. - **/ -struct fixed_layout; -struct partition; - -/** - * Make an unpartitioned fixed layout. - * - * @param [in] total_blocks The total size of the layout, in blocks - * @param [in] start_offset The block offset in the underlying layer at which - * the fixed layout begins - * @param [out] layout_ptr The pointer to hold the resulting layout - * - * @return a success or error code - **/ -int __must_check make_vdo_fixed_layout(block_count_t total_blocks, - physical_block_number_t start_offset, - struct fixed_layout **layout_ptr); - -/** - * Free a fixed layout. - * - * @param layout The layout to free - * - * @note all partitions created by this layout become invalid pointers - **/ -void free_vdo_fixed_layout(struct fixed_layout *layout); - -/** - * Get the total size of the layout in blocks. - * - * @param layout The layout - * - * @return The size of the layout - **/ -block_count_t __must_check -get_total_vdo_fixed_layout_size(const struct fixed_layout *layout); - -/** - * Get a partition by id. - * - * @param layout The layout from which to get a partition - * @param id The id of the partition - * @param partition_ptr A pointer to hold the partition - * - * @return VDO_SUCCESS or an error - **/ -int __must_check vdo_get_partition(struct fixed_layout *layout, - enum partition_id id, - struct partition **partition_ptr); - -/** - * Translate a block number from the partition's view to the layer's - * - * @param partition The partition to use for translation - * @param partition_block_number The block number relative to the partition - * @param layer_block_number The block number relative to the layer - * - * @return VDO_SUCCESS or an error code - **/ -int __must_check -vdo_translate_to_pbn(const struct partition *partition, - physical_block_number_t partition_block_number, - physical_block_number_t *layer_block_number); - -/** - * Translate a block number from the layer's view to the partition's. - * This is the inverse of vdo_translate_to_pbn(). - * - * @param partition The partition to use for translation - * @param layer_block_number The block number relative to the layer - * @param partition_block_number The block number relative to the partition - * - * @return VDO_SUCCESS or an error code - **/ -int __must_check -vdo_translate_from_pbn(const struct partition *partition, - physical_block_number_t layer_block_number, - physical_block_number_t *partition_block_number); - -/** - * Return the number of unallocated blocks available. 
- * - * @param layout the fixed layout - * - * @return the number of blocks yet unallocated to partitions - **/ -block_count_t __must_check -get_vdo_fixed_layout_blocks_available(const struct fixed_layout *layout); - -/** - * Create a new partition from the beginning or end of the unused space - * within a fixed layout. - * - * @param layout the fixed layout - * @param id the id of the partition to make - * @param block_count the number of blocks to carve out, if set - * to VDO_ALL_FREE_BLOCKS, all remaining blocks will - * be used - * @param direction whether to carve out from beginning or end - * @param base the number of the first block in the partition - * from the point of view of its users - * - * @return a success or error code, particularly - * VDO_NO_SPACE if there are less than block_count blocks remaining - **/ -int __must_check -make_vdo_fixed_layout_partition(struct fixed_layout *layout, - enum partition_id id, - block_count_t block_count, - enum partition_direction direction, - physical_block_number_t base); - -/** - * Return the size in blocks of a partition. - * - * @param partition a partition of the fixed_layout - * - * @return the size of the partition in blocks - **/ -block_count_t __must_check -get_vdo_fixed_layout_partition_size(const struct partition *partition); - -/** - * Get the first block of the partition in the layout. - * - * @param partition a partition of the fixed_layout - * - * @return the partition's offset in blocks - **/ -physical_block_number_t __must_check -get_vdo_fixed_layout_partition_offset(const struct partition *partition); - -/** - * Get the number of the first block in the partition from the partition users - * point of view. - * - * @param partition a partition of the fixed_layout - * - * @return the number of the first block in the partition - **/ -physical_block_number_t __must_check -get_vdo_fixed_layout_partition_base(const struct partition *partition); - -/** - * Get the size of an encoded layout - * - * @param layout The layout - * - * @return The encoded size of the layout - **/ -size_t __must_check -get_vdo_fixed_layout_encoded_size(const struct fixed_layout *layout); - -/** - * Encode a layout into a buffer. - * - * @param layout The layout to encode - * @param buffer The buffer to encode into - * - * @return UDS_SUCCESS or an error - **/ -int __must_check -encode_vdo_fixed_layout(const struct fixed_layout *layout, struct buffer *buffer); - -/** - * Decode a fixed layout from a buffer. - * - * @param [in] buffer The buffer from which to decode - * @param [out] layout_ptr A pointer to hold the layout - * - * @return VDO_SUCCESS or an error - **/ -int __must_check -decode_vdo_fixed_layout(struct buffer *buffer, struct fixed_layout **layout_ptr); - -/** - * Make a partitioned fixed layout for a VDO. 
- * - * @param [in] physical_blocks The number of physical blocks in the VDO - * @param [in] starting_offset The starting offset of the layout - * @param [in] block_map_blocks The size of the block map partition - * @param [in] journal_blocks The size of the journal partition - * @param [in] summary_blocks The size of the slab summary partition - * @param [out] layout_ptr A pointer to hold the new fixed_layout - * - * @return VDO_SUCCESS or an error - **/ -int __must_check -make_partitioned_vdo_fixed_layout(block_count_t physical_blocks, - physical_block_number_t starting_offset, - block_count_t block_map_blocks, - block_count_t journal_blocks, - block_count_t summary_blocks, - struct fixed_layout **layout_ptr); - -#endif // FIXED_LAYOUT_H diff --git a/vdo/flush.c b/vdo/flush.c index ebc08d28..a2547da3 100644 --- a/vdo/flush.c +++ b/vdo/flush.c @@ -1,42 +1,28 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * Copyright Red Hat - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/sulfur/src/c++/vdo/base/flush.c#43 $ */ #include "flush.h" +#include + #include "logger.h" -#include "memoryAlloc.h" +#include "memory-alloc.h" #include "permassert.h" -#include "ioSubmitter.h" -#include "kernelLayer.h" -#include "kernelVDO.h" - -#include "adminState.h" -#include "blockAllocator.h" +#include "admin-state.h" +#include "block-allocator.h" #include "completion.h" -#include "logicalZone.h" -#include "numUtils.h" -#include "readOnlyNotifier.h" -#include "slabDepot.h" -#include "vdoInternal.h" +#include "io-submitter.h" +#include "kernel-types.h" +#include "logical-zone.h" +#include "num-utils.h" +#include "read-only-notifier.h" +#include "slab-depot.h" +#include "thread-config.h" +#include "types.h" +#include "vdo.h" struct flusher { struct vdo_completion completion; @@ -64,16 +50,17 @@ struct flusher { struct bio_list waiting_flush_bios; /** The lock to protect the previous two fields */ spinlock_t lock; - /** When the longest waiting flush bio arrived */ - uint64_t flush_arrival_jiffies; + /** The rotor for selecting the bio queue for submitting flush bios */ + zone_count_t bio_queue_rotor; + /** The number of flushes submitted to the current bio queue */ + int flush_count; }; /** - * Check that we are on the flusher thread. - * - * @param flusher The flusher - * @param caller The function which is asserting - **/ + * assert_on_flusher_thread() - Check that we are on the flusher thread. + * @flusher: The flusher. + * @caller: The function which is asserting. + */ static inline void assert_on_flusher_thread(struct flusher *flusher, const char *caller) { @@ -83,55 +70,76 @@ static inline void assert_on_flusher_thread(struct flusher *flusher, } /** - * Convert a generic vdo_completion to a flusher. + * as_flusher() - Convert a generic vdo_completion to a flusher. 
+ * @completion: The completion to convert. * - * @param completion The completion to convert - * - * @return The completion as a flusher - **/ + * Return: The completion as a flusher. + */ static struct flusher *as_flusher(struct vdo_completion *completion) { - assert_vdo_completion_type(completion->type, + vdo_assert_completion_type(completion->type, VDO_FLUSH_NOTIFICATION_COMPLETION); return container_of(completion, struct flusher, completion); } /** - * Convert a vdo_flush's generic wait queue entry back to the vdo_flush. + * completion_as_vdo_flush() - Convert a generic vdo_completion to a + * vdo_flush. + * @completion: The completion to convert. * - * @param waiter The wait queue entry to convert + * Return: The completion as a vdo_flush. + */ +static inline struct vdo_flush * +completion_as_vdo_flush(struct vdo_completion *completion) +{ + vdo_assert_completion_type(completion->type, VDO_FLUSH_COMPLETION); + return container_of(completion, struct vdo_flush, completion); +} + +/** + * waiter_as_flush() - Convert a vdo_flush's generic wait queue entry back to + * the vdo_flush. + * @waiter: The wait queue entry to convert. * - * @return The wait queue entry as a vdo_flush - **/ + * Return: The wait queue entry as a vdo_flush. + */ static struct vdo_flush *waiter_as_flush(struct waiter *waiter) { return container_of(waiter, struct vdo_flush, waiter); } -/**********************************************************************/ -int make_vdo_flusher(struct vdo *vdo) +/** + * vdo_make_flusher() - Make a flusher for a vdo. + * @vdo: The vdo which owns the flusher. + * + * Return: VDO_SUCCESS or an error. + */ +int vdo_make_flusher(struct vdo *vdo) { int result = UDS_ALLOCATE(1, struct flusher, __func__, &vdo->flusher); + if (result != VDO_SUCCESS) { return result; } vdo->flusher->vdo = vdo; vdo->flusher->thread_id = vdo->thread_config->packer_thread; - set_vdo_admin_state_code(&vdo->flusher->state, + vdo_set_admin_state_code(&vdo->flusher->state, VDO_ADMIN_STATE_NORMAL_OPERATION); - initialize_vdo_completion(&vdo->flusher->completion, vdo, + vdo_initialize_completion(&vdo->flusher->completion, vdo, VDO_FLUSH_NOTIFICATION_COMPLETION); spin_lock_init(&vdo->flusher->lock); bio_list_init(&vdo->flusher->waiting_flush_bios); - result = UDS_ALLOCATE(1, struct vdo_flush, __func__, - &vdo->flusher->spare_flush); - return result; + return UDS_ALLOCATE(1, struct vdo_flush, __func__, + &vdo->flusher->spare_flush); } -/**********************************************************************/ -void free_vdo_flusher(struct flusher *flusher) +/** + * vdo_free_flusher() - Free a flusher. + * @flusher: The flusher to free. + */ +void vdo_free_flusher(struct flusher *flusher) { if (flusher == NULL) { return; @@ -141,23 +149,30 @@ void free_vdo_flusher(struct flusher *flusher) UDS_FREE(flusher); } -/**********************************************************************/ -thread_id_t get_vdo_flusher_thread_id(struct flusher *flusher) +/** + * vdo_get_flusher_thread_id() - Get the ID of the thread on which flusher + * functions should be called. + * @flusher: The flusher to query. + * + * Return: The ID of the thread which handles the flusher. 
+ */ +thread_id_t vdo_get_flusher_thread_id(struct flusher *flusher) { return flusher->thread_id; } -/**********************************************************************/ static void notify_flush(struct flusher *flusher); +static void vdo_complete_flush(struct vdo_flush *flush); /** - * Finish the notification process by checking if any flushes have completed + * finish_notification() - Finish the notification process. + * @completion: The flusher completion. + * + * Finishes the notification process by checking if any flushes have completed * and then starting the notification of the next flush request if one came in * while the current notification was in progress. This callback is registered * in flush_packer_callback(). - * - * @param completion The flusher completion - **/ + */ static void finish_notification(struct vdo_completion *completion) { struct waiter *waiter; @@ -170,102 +185,111 @@ static void finish_notification(struct vdo_completion *completion) result = enqueue_waiter(&flusher->pending_flushes, waiter); if (result != VDO_SUCCESS) { struct vdo_flush *flush = waiter_as_flush(waiter); + vdo_enter_read_only_mode(flusher->vdo->read_only_notifier, result); vdo_complete_flush(flush); return; } - complete_vdo_flushes(flusher); + vdo_complete_flushes(flusher); if (has_waiters(&flusher->notifiers)) { notify_flush(flusher); } } /** - * Flush the packer now that all of the logical and physical zones have been + * flush_packer_callback() - Flush the packer. + * @completion: The flusher completion. + * + * Flushes the packer now that all of the logical and physical zones have been * notified of the new flush request. This callback is registered in * increment_generation(). - * - * @param completion The flusher completion - **/ + */ static void flush_packer_callback(struct vdo_completion *completion) { struct flusher *flusher = as_flusher(completion); - increment_vdo_packer_flush_generation(flusher->vdo->packer); - launch_vdo_completion_callback(completion, finish_notification, + + vdo_increment_packer_flush_generation(flusher->vdo->packer); + vdo_launch_completion_callback(completion, finish_notification, flusher->thread_id); } /** - * Increment the flush generation in a logical zone. If there are more logical - * zones, go on to the next one, otherwise, prepare the physical zones. This - * callback is registered both in notify_flush() and in itself. + * increment_generation() - Increment the flush generation in a logical zone. + * @completion: The flusher as a completion. * - * @param completion The flusher as a completion - **/ + * If there are more logical zones, go on to the next one, otherwise, prepare + * the physical zones. This callback is registered both in notify_flush() and + * in itself. 
+ */
 static void increment_generation(struct vdo_completion *completion)
 {
-	thread_id_t thread_id;
 	struct flusher *flusher = as_flusher(completion);
-	increment_vdo_logical_zone_flush_generation(flusher->logical_zone_to_notify,
-						    flusher->notify_generation);
-	flusher->logical_zone_to_notify =
-		get_next_vdo_logical_zone(flusher->logical_zone_to_notify);
-	if (flusher->logical_zone_to_notify == NULL) {
-		launch_vdo_completion_callback(completion,
+	struct logical_zone *zone = flusher->logical_zone_to_notify;
+
+	vdo_increment_logical_zone_flush_generation(zone,
+						    flusher->notify_generation);
+	if (zone->next == NULL) {
+		vdo_launch_completion_callback(completion,
 					       flush_packer_callback,
 					       flusher->thread_id);
 		return;
 	}
 
-	thread_id =
-		get_vdo_logical_zone_thread_id(flusher->logical_zone_to_notify);
-	launch_vdo_completion_callback(completion,
+	flusher->logical_zone_to_notify = zone->next;
+	vdo_launch_completion_callback(completion,
 				       increment_generation,
-				       thread_id);
+				       flusher->logical_zone_to_notify->thread_id);
 }
 
 /**
- * Lauch a flush notification.
- *
- * @param flusher  The flusher doing the notification
- **/
+ * notify_flush() - Launch a flush notification.
+ * @flusher: The flusher doing the notification.
+ */
 static void notify_flush(struct flusher *flusher)
 {
-	thread_id_t thread_id;
 	struct vdo_flush *flush =
 		waiter_as_flush(get_first_waiter(&flusher->notifiers));
+
 	flusher->notify_generation = flush->flush_generation;
-	flusher->logical_zone_to_notify =
-		get_vdo_logical_zone(flusher->vdo->logical_zones, 0);
+	flusher->logical_zone_to_notify
+		= &flusher->vdo->logical_zones->zones[0];
 	flusher->completion.requeue = true;
-
-	thread_id =
-		get_vdo_logical_zone_thread_id(flusher->logical_zone_to_notify);
-	launch_vdo_completion_callback(&flusher->completion,
+	vdo_launch_completion_callback(&flusher->completion,
 				       increment_generation,
-				       thread_id);
+				       flusher->logical_zone_to_notify->thread_id);
 }
 
-/**********************************************************************/
-void flush_vdo(struct vdo_work_item *item)
+/**
+ * flush_vdo() - Start processing a flush request.
+ * @completion: A flush request (as a vdo_completion)
+ *
+ * This callback is registered in launch_flush().
+ */
+static void flush_vdo(struct vdo_completion *completion)
 {
-	struct vdo_flush *flush = container_of(item,
-					       struct vdo_flush,
-					       work_item);
-	struct flusher *flusher = flush->vdo->flusher;
+	struct vdo_flush *flush = completion_as_vdo_flush(completion);
+	struct flusher *flusher = completion->vdo->flusher;
 	bool may_notify;
 	int result;
 
 	assert_on_flusher_thread(flusher, __func__);
+	result = ASSERT(vdo_is_state_normal(&flusher->state),
+			"flusher is in normal operation");
+	if (result != VDO_SUCCESS) {
+		vdo_enter_read_only_mode(flusher->vdo->read_only_notifier,
+					 result);
+		vdo_complete_flush(flush);
+		return;
+	}
 	flush->flush_generation = flusher->flush_generation++;
 	may_notify = !has_waiters(&flusher->notifiers);
 	result = enqueue_waiter(&flusher->notifiers, &flush->waiter);
 	if (result != VDO_SUCCESS) {
-		vdo_enter_read_only_mode(flush->vdo->read_only_notifier,
+		vdo_enter_read_only_mode(flusher->vdo->read_only_notifier,
 					 result);
 		vdo_complete_flush(flush);
 		return;
@@ -277,15 +301,14 @@ void flush_vdo(struct vdo_work_item *item)
 }
 
 /**
- * Check whether the flusher has drained.
- *
- * @param flusher  The flusher
- **/
+ * check_for_drain_complete() - Check whether the flusher has drained.
+ * @flusher: The flusher.
+ */ static void check_for_drain_complete(struct flusher *flusher) { - bool drained = true; + bool drained; - if (!is_vdo_state_draining(&flusher->state) + if (!vdo_is_state_draining(&flusher->state) || has_waiters(&flusher->pending_flushes)) { return; } @@ -295,24 +318,28 @@ static void check_for_drain_complete(struct flusher *flusher) spin_unlock(&flusher->lock); if (drained) { - finish_vdo_draining(&flusher->state); + vdo_finish_draining(&flusher->state); } } -/**********************************************************************/ -void complete_vdo_flushes(struct flusher *flusher) +/** + * vdo_complete_flushes() - Attempt to complete any flushes which might have + * finished. + * @flusher: The flusher. + */ +void vdo_complete_flushes(struct flusher *flusher) { sequence_number_t oldest_active_generation = UINT64_MAX; struct logical_zone *zone; assert_on_flusher_thread(flusher, __func__); - for (zone = get_vdo_logical_zone(flusher->vdo->logical_zones, 0); - zone != NULL; zone = get_next_vdo_logical_zone(zone)) { - sequence_number_t oldest_in_zone = - get_vdo_logical_zone_oldest_locked_generation(zone); + for (zone = &flusher->vdo->logical_zones->zones[0]; + zone != NULL; + zone = zone->next) { oldest_active_generation = - min(oldest_active_generation, oldest_in_zone); + min(oldest_active_generation, + READ_ONCE(zone->oldest_active_generation)); } while (has_waiters(&flusher->pending_flushes)) { @@ -335,8 +362,11 @@ void complete_vdo_flushes(struct flusher *flusher) check_for_drain_complete(flusher); } -/**********************************************************************/ -void dump_vdo_flusher(const struct flusher *flusher) +/** + * vdo_dump_flusher() - Dump the flusher, in a thread-unsafe fashion. + * @flusher: The flusher. + */ +void vdo_dump_flusher(const struct flusher *flusher) { uds_log_info("struct flusher"); uds_log_info(" flush_generation=%llu first_unacknowledged_generation=%llu", @@ -349,109 +379,132 @@ void dump_vdo_flusher(const struct flusher *flusher) /** - * Initialize a vdo_flush structure, transferring all the bios in the flusher's - * waiting_flush_bios list to it. The caller MUST already hold the lock. + * initialize_flush() - Initialize a vdo_flush structure. + * @flush: The flush to initialize. + * @vdo: The vdo being flushed. * - * @param flush The flush to initialize - * @param vdo The vdo being flushed - **/ + * Initializes a vdo_flush structure, transferring all the bios in the + * flusher's waiting_flush_bios list to it. The caller MUST already hold the + * lock. 
+ */ static void initialize_flush(struct vdo_flush *flush, struct vdo *vdo) { - flush->vdo = vdo; + vdo_initialize_completion(&flush->completion, + vdo, + VDO_FLUSH_COMPLETION); bio_list_init(&flush->bios); bio_list_merge(&flush->bios, &vdo->flusher->waiting_flush_bios); bio_list_init(&vdo->flusher->waiting_flush_bios); - flush->arrival_jiffies = vdo->flusher->flush_arrival_jiffies; } -/**********************************************************************/ -static void enqueue_flush(struct vdo_flush *flush) +static void launch_flush(struct vdo_flush *flush) { - setup_work_item(&flush->work_item, - flush_vdo, - NULL, - REQ_Q_ACTION_FLUSH); - enqueue_vdo_work(flush->vdo, - &flush->work_item, - flush->vdo->flusher->thread_id); + struct vdo_completion *completion = &flush->completion; + + vdo_prepare_completion(completion, + flush_vdo, + flush_vdo, + completion->vdo->thread_config->packer_thread, + NULL); + vdo_enqueue_completion_with_priority(completion, + VDO_DEFAULT_Q_FLUSH_PRIORITY); } -/**********************************************************************/ -void launch_vdo_flush(struct vdo *vdo, struct bio *bio) +/** + * vdo_launch_flush() - Function called to start processing a flush request. + * @vdo: The vdo. + * @bio: The bio containing an empty flush request. + * + * This is called when we receive an empty flush bio from the block layer, and + * before acknowledging a non-empty bio with the FUA flag set. + */ +void vdo_launch_flush(struct vdo *vdo, struct bio *bio) { - // Try to allocate a vdo_flush to represent the flush request. If the - // allocation fails, we'll deal with it later. - struct vdo_flush *flush = UDS_ALLOCATE_NOWAIT(struct vdo_flush, __func__); + /* + * Try to allocate a vdo_flush to represent the flush request. If the + * allocation fails, we'll deal with it later. + */ + struct vdo_flush *flush + = UDS_ALLOCATE_NOWAIT(struct vdo_flush, __func__); struct flusher *flusher = vdo->flusher; - spin_lock(&flusher->lock); + const struct admin_state_code *code = + vdo_get_admin_state_code(&flusher->state); - // We have a new bio to start. Add it to the list. If it becomes the - // only entry on the list, record the time. - if (bio_list_empty(&flusher->waiting_flush_bios)) { - flusher->flush_arrival_jiffies = jiffies; - } + ASSERT_LOG_ONLY(!code->quiescent, + "Flushing not allowed in state %s", + code->name); + + spin_lock(&flusher->lock); + /* We have a new bio to start. Add it to the list. */ bio_list_add(&flusher->waiting_flush_bios, bio); if (flush == NULL) { - // The vdo_flush allocation failed. Try to use the spare - // vdo_flush structure. + /* + * The vdo_flush allocation failed. Try to use the spare + * vdo_flush structure. + */ if (flusher->spare_flush == NULL) { - // The spare is already in use. This bio is on - // waiting_flush_bios and it will be handled by a flush - // completion or by a bio that can allocate. + /* + * The spare is already in use. This bio is on + * waiting_flush_bios and it will be handled by a flush + * completion or by a bio that can allocate. + */ spin_unlock(&flusher->lock); return; } - // Take and use the spare flush request. + /* Take and use the spare flush request. */ flush = flusher->spare_flush; flusher->spare_flush = NULL; } - // We have flushes to start. Capture them in the vdo_flush structure. + /* We have flushes to start. Capture them in the vdo_flush structure. */ initialize_flush(flush, vdo); spin_unlock(&flusher->lock); - // Finish launching the flushes. - enqueue_flush(flush); + /* Finish launching the flushes. 
*/ + launch_flush(flush); } /** - * Release a vdo_flush structure that has completed its work. If there are any - * pending flush requests whose vdo_flush allocation failed, they will be - * launched by immediately re-using the released vdo_flush. If there is no - * spare vdo_flush, the released structure will become the spare. Otherwise, - * the vdo_flush will be freed. + * release_flush() - Release a vdo_flush structure that has completed its + * work. + * @flush: The completed flush structure to re-use or free. * - * @param flush The completed flush structure to re-use or free - **/ + * If there are any pending flush requests whose vdo_flush allocation failed, + * they will be launched by immediately re-using the released vdo_flush. If + * there is no spare vdo_flush, the released structure will become the spare. + * Otherwise, the vdo_flush will be freed. + */ static void release_flush(struct vdo_flush *flush) { - struct flusher *flusher = flush->vdo->flusher; bool relaunch_flush = false; + struct flusher *flusher = flush->completion.vdo->flusher; spin_lock(&flusher->lock); if (bio_list_empty(&flusher->waiting_flush_bios)) { - // Nothing needs to be started. Save one spare flush request. + /* Nothing needs to be started. Save one spare flush request. */ if (flusher->spare_flush == NULL) { - // Make the new spare all zero, just like a newly - // allocated one. + /* + * Make the new spare all zero, just like a newly + * allocated one. + */ memset(flush, 0, sizeof(*flush)); flusher->spare_flush = flush; flush = NULL; } } else { - // We have flushes to start. Capture them in a flush request. + /* We have flushes to start. Capture them in a flush request. */ initialize_flush(flush, flusher->vdo); relaunch_flush = true; } spin_unlock(&flusher->lock); if (relaunch_flush) { - // Finish launching the flushes. - enqueue_flush(flush); + /* Finish launching the flushes. */ + launch_flush(flush); return; } @@ -461,76 +514,119 @@ static void release_flush(struct vdo_flush *flush) } /** - * Function called to complete and free a flush request - * - * @param item The flush-request work item - **/ -static void vdo_complete_flush_work(struct vdo_work_item *item) + * vdo_complete_flush_callback() - Function called to complete and free a + * flush request, registered in + * vdo_complete_flush(). + * @completion: The flush request. + */ +static void vdo_complete_flush_callback(struct vdo_completion *completion) { - struct vdo_flush *flush = container_of(item, - struct vdo_flush, - work_item); - struct vdo *vdo = flush->vdo; + struct vdo_flush *flush = completion_as_vdo_flush(completion); + struct vdo *vdo = completion->vdo; struct bio *bio; while ((bio = bio_list_pop(&flush->bios)) != NULL) { - // We're not acknowledging this bio now, but we'll never touch - // it again, so this is the last chance to account for it. + /* + * We're not acknowledging this bio now, but we'll never touch + * it again, so this is the last chance to account for it. + */ vdo_count_bios(&vdo->stats.bios_acknowledged, bio); - // Update the device, and send it on down... - bio_set_dev(bio, get_vdo_backing_device(flush->vdo)); + /* Update the device, and send it on down... */ + bio_set_dev(bio, vdo_get_backing_device(vdo)); atomic64_inc(&vdo->stats.flush_out); -#if LINUX_VERSION_CODE < KERNEL_VERSION(5,9,0) - generic_make_request(bio); -#else submit_bio_noacct(bio); -#endif } - // Release the flush structure, freeing it, re-using it as the spare, - // or using it to launch any flushes that had to wait when allocations - // failed. 
+ /* + * Release the flush structure, freeing it, re-using it as the spare, + * or using it to launch any flushes that had to wait when allocations + * failed. + */ release_flush(flush); } -/**********************************************************************/ -void vdo_complete_flush(struct vdo_flush *flush) +/** + * select_bio_queue() - Select the bio queue on which to finish a flush + * request. + * @flusher: The flusher finishing the request. + */ +static thread_id_t select_bio_queue(struct flusher *flusher) { - setup_work_item(&flush->work_item, - vdo_complete_flush_work, - NULL, - BIO_Q_ACTION_FLUSH); - vdo_enqueue_bio_work_item(flush->vdo->io_submitter, &flush->work_item); + struct vdo *vdo = flusher->vdo; + zone_count_t bio_threads + = flusher->vdo->thread_config->bio_thread_count; + int interval; + + if (bio_threads == 1) { + return vdo->thread_config->bio_threads[0]; + } + + interval = vdo->device_config->thread_counts.bio_rotation_interval; + if (flusher->flush_count == interval) { + flusher->flush_count = 1; + flusher->bio_queue_rotor = ((flusher->bio_queue_rotor + 1) + % bio_threads); + } else { + flusher->flush_count++; + } + + return vdo->thread_config->bio_threads[flusher->bio_queue_rotor]; } +/** + * vdo_complete_flush() - Complete and free a vdo flush request. + * @flush: The flush request. + */ +static void vdo_complete_flush(struct vdo_flush *flush) +{ + struct vdo_completion *completion = &flush->completion; + + vdo_prepare_completion(completion, + vdo_complete_flush_callback, + vdo_complete_flush_callback, + select_bio_queue(completion->vdo->flusher), + NULL); + vdo_enqueue_completion_with_priority(completion, BIO_Q_FLUSH_PRIORITY); +} /** - * Initiate a drain. + * initiate_drain() - Initiate a drain. * * Implements vdo_admin_initiator. - **/ + */ static void initiate_drain(struct admin_state *state) { check_for_drain_complete(container_of(state, struct flusher, state)); } -/**********************************************************************/ +/** + * vdo_drain_flusher() - Drain the flusher. + * @flusher: The flusher to drain. + * @completion: The completion to finish when the flusher has drained. + * + * Drains the flusher by preventing any more VIOs from entering the flusher + * and then flushing. The flusher will be left in the suspended state. + */ void vdo_drain_flusher(struct flusher *flusher, struct vdo_completion *completion) { assert_on_flusher_thread(flusher, __func__); - start_vdo_draining(&flusher->state, + vdo_start_draining(&flusher->state, VDO_ADMIN_STATE_SUSPENDING, completion, initiate_drain); } -/**********************************************************************/ +/** + * vdo_resume_flusher() - Resume a flusher which has been suspended. + * @flusher: The flusher to resume. + * @parent: The completion to finish when the flusher has resumed. 
+ */ void vdo_resume_flusher(struct flusher *flusher, struct vdo_completion *parent) { assert_on_flusher_thread(flusher, __func__); - finish_vdo_completion(parent, - resume_vdo_if_quiescent(&flusher->state)); + vdo_finish_completion(parent, + vdo_resume_if_quiescent(&flusher->state)); } diff --git a/vdo/flush.h b/vdo/flush.h index 02d3ebe1..0022a698 100644 --- a/vdo/flush.h +++ b/vdo/flush.h @@ -1,130 +1,47 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ /* * Copyright Red Hat - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/sulfur/src/c++/vdo/base/flush.h#13 $ */ #ifndef FLUSH_H #define FLUSH_H #include "bio.h" -#include "workQueue.h" - +#include "completion.h" +#include "kernel-types.h" #include "types.h" -#include "waitQueue.h" +#include "wait-queue.h" +#include "workQueue.h" -/** +/* * A marker for tracking which journal entries are affected by a flush request. - **/ + */ struct vdo_flush { - /** The work item for enqueueing this flush request. */ - struct vdo_work_item work_item; - /** The vdo to flush */ - struct vdo *vdo; - /** The flush bios covered by this request */ + /* The completion for enqueueing this flush request. */ + struct vdo_completion completion; + /* The flush bios covered by this request */ struct bio_list bios; - /** Time when the earlier bio arrived */ - uint64_t arrival_jiffies; - /** The wait queue entry for this flush */ + /* The wait queue entry for this flush */ struct waiter waiter; - /** Which flush this struct represents */ + /* Which flush this struct represents */ sequence_number_t flush_generation; }; -/** - * Make a flusher for a vdo. - * - * @param vdo The vdo which owns the flusher - * - * @return VDO_SUCCESS or an error - **/ -int __must_check make_vdo_flusher(struct vdo *vdo); - -/** - * Free a flusher. - * - * @param flusher The flusher to free - **/ -void free_vdo_flusher(struct flusher *flusher); - -/** - * Get the ID of the thread on which flusher functions should be called. - * - * @param flusher The flusher to query - * - * @return The ID of the thread which handles the flusher - **/ -thread_id_t __must_check get_vdo_flusher_thread_id(struct flusher *flusher); +int __must_check vdo_make_flusher(struct vdo *vdo); -/** - * Handle empty flush requests. - * - * @param item A flush request (as a work_item) - **/ -void flush_vdo(struct vdo_work_item *item); +void vdo_free_flusher(struct flusher *flusher); -/** - * Attempt to complete any flushes which might have finished. - * - * @param flusher The flusher - **/ -void complete_vdo_flushes(struct flusher *flusher); +thread_id_t __must_check vdo_get_flusher_thread_id(struct flusher *flusher); -/** - * Dump the flusher, in a thread-unsafe fashion. 
- * - * @param flusher The flusher - **/ -void dump_vdo_flusher(const struct flusher *flusher); +void vdo_complete_flushes(struct flusher *flusher); -/** - * Complete and free a vdo flush request. - * - * @param flush The flush request - **/ -void vdo_complete_flush(struct vdo_flush *flush); +void vdo_dump_flusher(const struct flusher *flusher); -/** - * Function called to start processing a flush request. It is called when we - * receive an empty flush bio from the block layer, and before acknowledging a - * non-empty bio with the FUA flag set. - * - * @param vdo The vdo - * @param bio The bio containing an empty flush request - **/ -void launch_vdo_flush(struct vdo *vdo, struct bio *bio); +void vdo_launch_flush(struct vdo *vdo, struct bio *bio); -/** - * Drain the flusher by preventing any more VIOs from entering the flusher and - * then flushing. The flusher will be left in the suspended state. - * - * @param flusher The flusher to drain - * @param completion The completion to finish when the flusher has drained - **/ void vdo_drain_flusher(struct flusher *flusher, struct vdo_completion *completion); -/** - * Resume a flusher which has been suspended. - * - * @param flusher The flusher to resume - * @param parent The completion to finish when the flusher has resumed - **/ void vdo_resume_flusher(struct flusher *flusher, struct vdo_completion *parent); diff --git a/vdo/forest.c b/vdo/forest.c index 9c5d5cd4..b86dca3e 100644 --- a/vdo/forest.c +++ b/vdo/forest.c @@ -1,45 +1,30 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * Copyright Red Hat - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/sulfur/src/c++/vdo/base/forest.c#22 $ */ #include "forest.h" +#include + #include "logger.h" -#include "memoryAlloc.h" +#include "memory-alloc.h" -#include "blockMap.h" -#include "blockMapInternals.h" -#include "blockMapPage.h" -#include "blockMapTree.h" -#include "blockMapTreeInternals.h" +#include "block-map.h" +#include "block-map-page.h" +#include "block-map-tree.h" #include "constants.h" -#include "dirtyLists.h" +#include "dirty-lists.h" #include "forest.h" -#include "numUtils.h" -#include "recoveryJournal.h" -#include "slabDepot.h" -#include "slabJournal.h" +#include "io-submitter.h" +#include "num-utils.h" +#include "recovery-journal.h" +#include "slab-depot.h" +#include "slab-journal.h" #include "types.h" -#include "vdoInternal.h" +#include "vdo.h" #include "vio.h" -#include "vioPool.h" +#include "vio-pool.h" enum { BLOCK_MAP_VIO_POOL_SIZE = 64, @@ -88,19 +73,30 @@ struct cursors { struct cursor cursors[]; }; -/**********************************************************************/ -struct tree_page *get_vdo_tree_page_by_index(struct forest *forest, +/** + * *vdo_get_tree_page_by_index() - Get the tree page for a given height and + * page index. 
+ * @forest: The forest which holds the page. + * @root_index: The index of the tree that holds the page. + * @height: The height of the desired page. + * @page_index: The index of the desired page. + * + * Return: The requested page. + */ +struct tree_page *vdo_get_tree_page_by_index(struct forest *forest, root_count_t root_index, height_t height, page_number_t page_index) { page_number_t offset = 0; size_t segment; + for (segment = 0; segment < forest->segments; segment++) { page_number_t border = forest->boundaries[segment].levels[height - 1]; if (page_index < border) { struct block_map_tree *tree = &forest->trees[root_index]; + return &(tree->segments[segment] .levels[height - 1][page_index - offset]); } @@ -110,7 +106,6 @@ struct tree_page *get_vdo_tree_page_by_index(struct forest *forest, return NULL; } -/**********************************************************************/ static int make_segment(struct forest *old_forest, block_count_t new_pages, struct boundary *new_boundary, @@ -196,14 +191,14 @@ static int make_segment(struct forest *old_forest, segment->levels[height] = page_ptr; if (height == (VDO_BLOCK_MAP_TREE_HEIGHT - 1)) { - // Record the root. + /* Record the root. */ struct block_map_page *page = - format_vdo_block_map_page(page_ptr->page_buffer, + vdo_format_block_map_page(page_ptr->page_buffer, forest->map->nonce, VDO_INVALID_PBN, true); page->entries[0] = - pack_vdo_pbn(forest->map->root_origin + root, + vdo_pack_pbn(forest->map->root_origin + root, VDO_MAPPING_STATE_UNCOMPRESSED); } page_ptr += segment_sizes[height]; @@ -213,7 +208,6 @@ static int make_segment(struct forest *old_forest, return VDO_SUCCESS; } -/**********************************************************************/ static void deforest(struct forest *forest, size_t first_page_segment) { root_count_t root; @@ -230,6 +224,7 @@ static void deforest(struct forest *forest, size_t first_page_segment) for (root = 0; root < forest->map->root_count; root++) { struct block_map_tree *tree = &(forest->trees[root]); + UDS_FREE(tree->segments); } @@ -237,8 +232,15 @@ static void deforest(struct forest *forest, size_t first_page_segment) UDS_FREE(forest); } -/**********************************************************************/ -int make_vdo_forest(struct block_map *map, block_count_t entries) +/** + * vdo_make_forest() - Make a collection of trees for a block_map, expanding + * the existing forest if there is one. + * @map: The block map. + * @entries: The number of entries the block map will hold. + * + * Return: VDO_SUCCESS or an error. + */ +int vdo_make_forest(struct block_map *map, block_count_t entries) { struct forest *forest, *old_forest = map->forest; struct boundary new_boundary, *old_boundary = NULL; @@ -276,8 +278,11 @@ int make_vdo_forest(struct block_map *map, block_count_t entries) return VDO_SUCCESS; } -/**********************************************************************/ -void free_vdo_forest(struct forest *forest) +/** + * vdo_free_forest() - Free a forest and all of the segments it contains. + * @forest: The forest to free. + */ +void vdo_free_forest(struct forest *forest) { if (forest == NULL) { return; @@ -286,10 +291,14 @@ void free_vdo_forest(struct forest *forest) deforest(forest, 0); } -/**********************************************************************/ -void abandon_vdo_forest(struct block_map *map) +/** + * vdo_abandon_forest() - Abandon the unused next forest from a block_map. + * @map: The block map. 
+ */ +void vdo_abandon_forest(struct block_map *map) { struct forest *forest = map->next_forest; + map->next_forest = NULL; if (forest != NULL) { deforest(forest, forest->segments - 1); @@ -298,8 +307,12 @@ void abandon_vdo_forest(struct block_map *map) map->next_entry_count = 0; } -/**********************************************************************/ -void replace_vdo_forest(struct block_map *map) +/** + * vdo_replace_forest() - Replace a block_map's forest with the + * already-prepared larger forest. + * @map: The block map. + */ +void vdo_replace_forest(struct block_map *map) { if (map->next_forest != NULL) { if (map->forest != NULL) { @@ -314,11 +327,10 @@ void replace_vdo_forest(struct block_map *map) } /** - * Finish the traversal of a single tree. If it was the last cursor, finish - * the traversal. - * - * @param cursor The cursor doing the traversal - **/ + * finish_cursor() - Finish the traversal of a single tree. If it was the last + * cursor, finish the traversal. + * @cursor: The cursor doing the traversal. + */ static void finish_cursor(struct cursor *cursor) { struct cursors *cursors = cursor->parent; @@ -331,29 +343,29 @@ static void finish_cursor(struct cursor *cursor) UDS_FREE(cursors); - finish_vdo_completion(parent, VDO_SUCCESS); + vdo_finish_completion(parent, VDO_SUCCESS); } -/**********************************************************************/ static void traverse(struct cursor *cursor); /** - * Continue traversing a block map tree. - * - * @param completion The VIO doing a read or write - **/ + * continue_traversal() - Continue traversing a block map tree. + * @completion: The VIO doing a read or write. + */ static void continue_traversal(struct vdo_completion *completion) { struct vio_pool_entry *pool_entry = completion->parent; struct cursor *cursor = pool_entry->parent; + + record_metadata_io_error(as_vio(completion)); traverse(cursor); } /** - * Continue traversing a block map tree now that a page has been loaded. - * - * @param completion The VIO doing the read - **/ + * finish_traversal_load() - Continue traversing a block map tree now that a + * page has been loaded. + * @completion: The VIO doing the read. + */ static void finish_traversal_load(struct vdo_completion *completion) { struct vio_pool_entry *entry = completion->parent; @@ -372,12 +384,23 @@ static void finish_traversal_load(struct vdo_completion *completion) traverse(cursor); } +static void traversal_endio(struct bio *bio) +{ + struct vio *vio = bio->bi_private; + struct vio_pool_entry *entry = vio->completion.parent; + struct cursor *cursor = entry->parent; + + continue_vio_after_io(vio, + finish_traversal_load, + cursor->parent->zone->map_zone->thread_id); +} + /** - * Traverse a single block map tree. This is the recursive heart of the - * traversal process. + * traverse() - Traverse a single block map tree. + * @cursor: The cursor doing the traversal. * - * @param cursor The cursor doing the traversal - **/ + * This is the recursive heart of the traversal process. 
+ */ static void traverse(struct cursor *cursor) { for (; cursor->height < VDO_BLOCK_MAP_TREE_HEIGHT; cursor->height++) { @@ -388,7 +411,7 @@ static void traverse(struct cursor *cursor) .levels[height][level->page_index]); struct block_map_page *page = (struct block_map_page *) tree_page->page_buffer; - if (!is_vdo_block_map_page_initialized(page)) { + if (!vdo_is_block_map_page_initialized(page)) { continue; } @@ -400,12 +423,14 @@ static void traverse(struct cursor *cursor) level->page_index) + level->slot; struct data_location location = - unpack_vdo_block_map_entry(&page->entries[level->slot]); + vdo_unpack_block_map_entry(&page->entries[level->slot]); if (!vdo_is_valid_location(&location)) { - // This entry is invalid, so remove it from the - // page. + /* + * This entry is invalid, so remove it from the + * page. + */ page->entries[level->slot] = - pack_vdo_pbn(VDO_ZERO_BLOCK, + vdo_pack_pbn(VDO_ZERO_BLOCK, VDO_MAPPING_STATE_UNMAPPED); vdo_write_tree_page(tree_page, cursor->parent->zone); @@ -416,11 +441,13 @@ static void traverse(struct cursor *cursor) continue; } - // Erase mapped entries past the end of the logical - // space. + /* + * Erase mapped entries past the end of the logical + * space. + */ if (entry_index >= cursor->boundary.levels[height]) { page->entries[level->slot] = - pack_vdo_pbn(VDO_ZERO_BLOCK, + vdo_pack_pbn(VDO_ZERO_BLOCK, VDO_MAPPING_STATE_UNMAPPED); vdo_write_tree_page(tree_page, cursor->parent->zone); @@ -433,7 +460,7 @@ static void traverse(struct cursor *cursor) cursor->parent->parent); if (result != VDO_SUCCESS) { page->entries[level->slot] = - pack_vdo_pbn(VDO_ZERO_BLOCK, + vdo_pack_pbn(VDO_ZERO_BLOCK, VDO_MAPPING_STATE_UNMAPPED); vdo_write_tree_page(tree_page, cursor->parent->zone); @@ -450,10 +477,11 @@ static void traverse(struct cursor *cursor) next_level->page_index = entry_index; next_level->slot = 0; level->slot++; - launch_read_metadata_vio(cursor->vio_pool_entry->vio, - location.pbn, - finish_traversal_load, - continue_traversal); + submit_metadata_vio(cursor->vio_pool_entry->vio, + location.pbn, + traversal_endio, + continue_traversal, + REQ_OP_READ | REQ_PRIO); return; } } @@ -462,17 +490,17 @@ static void traverse(struct cursor *cursor) } /** - * Start traversing a single block map tree now that the cursor has a VIO with - * which to load pages. - * - *

Implements waiter_callback. + * launch_cursor() - Start traversing a single block map tree now that the + * cursor has a VIO with which to load pages. + * @waiter: The cursor. + * @context: The vio_pool_entry just acquired. * - * @param waiter The cursor - * @param context The vio_pool_entry just acquired - **/ + * Implements waiter_callback. + */ static void launch_cursor(struct waiter *waiter, void *context) { struct cursor *cursor = container_of(waiter, struct cursor, waiter); + cursor->vio_pool_entry = (struct vio_pool_entry *) context; cursor->vio_pool_entry->parent = cursor; vio_as_completion(cursor->vio_pool_entry->vio)->callback_thread_id = @@ -481,13 +509,13 @@ static void launch_cursor(struct waiter *waiter, void *context) } /** - * Compute the number of pages used at each level of the given root's tree. + * compute_boundary() - Compute the number of pages used at each level of the + * given root's tree. + * @map: The block map. + * @root_index: The index of the root to measure. * - * @param map The block map - * @param root_index The index of the root to measure - * - * @return The list of page counts as a boundary structure - **/ + * Return: The list of page counts as a boundary structure. + */ static struct boundary compute_boundary(struct block_map *map, root_count_t root_index) { @@ -495,7 +523,7 @@ static struct boundary compute_boundary(struct block_map *map, height_t height; page_count_t leaf_pages = - compute_vdo_block_map_page_count(map->entry_count); + vdo_compute_block_map_page_count(map->entry_count); /* * Compute the leaf pages for this root. If the number of leaf pages @@ -504,24 +532,32 @@ static struct boundary compute_boundary(struct block_map *map, */ page_count_t last_tree_root = (leaf_pages - 1) % map->root_count; page_count_t level_pages = leaf_pages / map->root_count; + if (root_index <= last_tree_root) { level_pages++; } for (height = 0; height < VDO_BLOCK_MAP_TREE_HEIGHT - 1; height++) { boundary.levels[height] = level_pages; - level_pages = compute_bucket_count(level_pages, - VDO_BLOCK_MAP_ENTRIES_PER_PAGE); + level_pages = DIV_ROUND_UP(level_pages, + VDO_BLOCK_MAP_ENTRIES_PER_PAGE); } - // The root node always exists, even if the root is otherwise unused. + /* The root node always exists, even if the root is otherwise unused. */ boundary.levels[VDO_BLOCK_MAP_TREE_HEIGHT - 1] = 1; return boundary; } -/**********************************************************************/ -void traverse_vdo_forest(struct block_map *map, +/** + * vdo_traverse_forest() - Walk the entire forest of a block map. + * @map: The block map to traverse. + * @callback: A function to call with the pbn of each allocated node in + * the forest. + * @parent: The completion to notify on each traversed PBN, and when + * the traversal is complete. 
+ */ +void vdo_traverse_forest(struct block_map *map, vdo_entry_callback *callback, struct vdo_completion *parent) { @@ -533,12 +569,12 @@ void traverse_vdo_forest(struct block_map *map, __func__, &cursors); if (result != VDO_SUCCESS) { - finish_vdo_completion(parent, result); + vdo_finish_completion(parent, result); return; } cursors->map = map; - cursors->zone = &(vdo_get_block_map_zone(map, 0)->tree_zone); + cursors->zone = &map->zones[0].tree_zone; cursors->pool = cursors->zone->vio_pool; cursors->entry_callback = callback; cursors->parent = parent; diff --git a/vdo/forest.h b/vdo/forest.h index c2b0f637..1c24e32a 100644 --- a/vdo/forest.h +++ b/vdo/forest.h @@ -1,100 +1,41 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ /* * Copyright Red Hat - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/sulfur/src/c++/vdo/base/forest.h#7 $ */ #ifndef FOREST_H #define FOREST_H -#include "blockMapTree.h" +#include "block-map-tree.h" #include "types.h" /** - * A function to be called for each allocated PBN when traversing the forest. - * - * @param pbn A PBN of a tree node - * @param completion The parent completion of the traversal + * typedef vdo_entry_callback - A function to be called for each allocated PBN + * when traversing the forest. + * @pbn: A PBN of a tree node. + * @completion: The parent completion of the traversal. * - * @return VDO_SUCCESS or an error - **/ + * Return: VDO_SUCCESS or an error. + */ typedef int vdo_entry_callback(physical_block_number_t pbn, struct vdo_completion *completion); -/** - * Get the tree page for a given height and page index. - * - * @param forest The forest which holds the page - * @param root_index The index of the tree that holds the page - * @param height The height of the desired page - * @param page_index The index of the desired page - * - * @return The requested page - **/ struct tree_page * __must_check -get_vdo_tree_page_by_index(struct forest *forest, +vdo_get_tree_page_by_index(struct forest *forest, root_count_t root_index, height_t height, page_number_t page_index); -/** - * Make a collection of trees for a block_map, expanding the existing forest if - * there is one. - * - * @param map The block map - * @param entries The number of entries the block map will hold - * - * @return VDO_SUCCESS or an error - **/ -int __must_check make_vdo_forest(struct block_map *map, block_count_t entries); +int __must_check vdo_make_forest(struct block_map *map, block_count_t entries); -/** - * Free a forest and all of the segments it contains. - * - * @param forest The forest to free - **/ -void free_vdo_forest(struct forest *forest); +void vdo_free_forest(struct forest *forest); -/** - * Abandon the unused next forest from a block_map. 
- * - * @param map The block map - **/ -void abandon_vdo_forest(struct block_map *map); +void vdo_abandon_forest(struct block_map *map); -/** - * Replace a block_map's forest with the already-prepared larger forest. - * - * @param map The block map - **/ -void replace_vdo_forest(struct block_map *map); +void vdo_replace_forest(struct block_map *map); -/** - * Walk the entire forest of a block map. - * - * @param map The block map to traverse - * @param callback A function to call with the pbn of each allocated node in - * the forest - * @param parent The completion to notify on each traversed PBN, and when - * the traversal is complete - **/ -void traverse_vdo_forest(struct block_map *map, +void vdo_traverse_forest(struct block_map *map, vdo_entry_callback *callback, struct vdo_completion *parent); -#endif // FOREST_H +#endif /* FOREST_H */ diff --git a/vdo/funnel-queue.c b/vdo/funnel-queue.c new file mode 100644 index 00000000..e4d37319 --- /dev/null +++ b/vdo/funnel-queue.c @@ -0,0 +1,225 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright Red Hat + */ + +#include "funnel-queue.h" + +#include "memory-alloc.h" +#include "permassert.h" +#include "uds.h" + +/** + * A funnel queue is a simple (almost) lock-free queue that accepts entries + * from multiple threads (multi-producer) and delivers them to a single thread + * (single-consumer). "Funnel" is an attempt to evoke the image of requests + * from more than one producer being "funneled down" to a single consumer. + * + * This is an unsynchronized but thread-safe data structure when used as + * intended. There is no mechanism to ensure that only one thread is consuming + * from the queue. If more than one thread attempts to consume from the queue, + * the resulting behavior is undefined. Clients must not directly access or + * manipulate the internals of the queue, which are only exposed for the + * purpose of allowing the very simple enqueue operation to be inlined. + * + * The implementation requires that a funnel_queue_entry structure (a link + * pointer) is embedded in the queue entries, and pointers to those structures + * are used exclusively by the queue. No macros are defined to template the + * queue, so the offset of the funnel_queue_entry in the records placed in the + * queue must all be the same so the client can derive their structure pointer + * from the entry pointer returned by funnel_queue_poll(). + * + * Callers are wholly responsible for allocating and freeing the entries. + * Entries may be freed as soon as they are returned since this queue is not + * susceptible to the "ABA problem" present in many lock-free data structures. + * The queue is dynamically allocated to ensure cache-line alignment, but no + * other dynamic allocation is used. + * + * The algorithm is not actually 100% lock-free. There is a single point in + * funnel_queue_put() at which a preempted producer will prevent the consumers + * from seeing items added to the queue by later producers, and only if the + * queue is short enough or the consumer fast enough for it to reach what was + * the end of the queue at the time of the preemption. + * + * The consumer function, funnel_queue_poll(), will return NULL when the queue + * is empty. To wait for data to consume, spin (if safe) or combine the queue + * with a struct event_count to signal the presence of new entries.
+ **/ + +int make_funnel_queue(struct funnel_queue **queue_ptr) +{ + int result; + struct funnel_queue *queue; + + result = UDS_ALLOCATE(1, struct funnel_queue, "funnel queue", &queue); + if (result != UDS_SUCCESS) { + return result; + } + + /* + * Initialize the stub entry and put it in the queue, establishing the + * invariant that queue->newest and queue->oldest are never null. + */ + queue->stub.next = NULL; + queue->newest = &queue->stub; + queue->oldest = &queue->stub; + + *queue_ptr = queue; + return UDS_SUCCESS; +} + +void free_funnel_queue(struct funnel_queue *queue) +{ + UDS_FREE(queue); +} + +static struct funnel_queue_entry *get_oldest(struct funnel_queue *queue) +{ + /* + * Barrier requirements: We need a read barrier between reading a + * "next" field pointer value and reading anything it points to. + * There's an accompanying barrier in funnel_queue_put() between its + * caller setting up the entry and making it visible. + */ + struct funnel_queue_entry *oldest = queue->oldest; + struct funnel_queue_entry *next = oldest->next; + + if (oldest == &queue->stub) { + /* + * When the oldest entry is the stub and it has no successor, + * the queue is logically empty. + */ + if (next == NULL) { + return NULL; + } + /* + * The stub entry has a successor, so the stub can be dequeued + * and ignored without breaking the queue invariants. + */ + oldest = next; + queue->oldest = oldest; + /* + * FIXME: Some platforms such as Alpha may require an + * additional barrier here. See + * https://lkml.org/lkml/2019/11/8/1021 + */ + next = oldest->next; + } + + /* + * We have a non-stub candidate to dequeue. If it lacks a successor, + * we'll need to put the stub entry back on the queue first. + */ + if (next == NULL) { + struct funnel_queue_entry *newest = queue->newest; + + if (oldest != newest) { + /* + * Another thread has already swung queue->newest + * atomically, but not yet assigned previous->next. The + * queue is really still empty. + */ + return NULL; + } + + /* + * Put the stub entry back on the queue, ensuring a successor + * will eventually be seen. + */ + funnel_queue_put(queue, &queue->stub); + + /* Check again for a successor. */ + next = oldest->next; + if (next == NULL) { + /* + * We lost a race with a producer who swapped + * queue->newest before we did, but who hasn't yet + * updated previous->next. Try again later. + */ + return NULL; + } + } + + return oldest; +} + +/* + * Poll a queue, removing the oldest entry if the queue is not empty. This + * function must only be called from a single consumer thread. + */ +struct funnel_queue_entry *funnel_queue_poll(struct funnel_queue *queue) +{ + struct funnel_queue_entry *oldest = get_oldest(queue); + + if (oldest == NULL) { + return oldest; + } + + /* + * Dequeue the oldest entry and return it. Only one consumer thread may + * call this function, so no locking, atomic operations, or fences are + * needed; queue->oldest is owned by the consumer and oldest->next is + * never used by a producer thread after it is swung from NULL to + * non-NULL. + */ + queue->oldest = oldest->next; + /* + * Make sure the caller sees the proper stored data for this entry. + * Since we've already fetched the entry pointer we stored in + * "queue->oldest", this also ensures that on entry to the next call + * we'll properly see the dependent data. + */ + smp_rmb(); + /* + * If "oldest" is a very light-weight work item, we'll be looking + * for the next one very soon, so prefetch it now. 
*/ + prefetch_address(queue->oldest, true); + oldest->next = NULL; + return oldest; +} + +/* + * Check whether the funnel queue is empty or not. If the queue is in a + * transition state with one or more entries being added such that the list + * view is incomplete, this function will report the queue as empty. + */ +bool is_funnel_queue_empty(struct funnel_queue *queue) +{ + return get_oldest(queue) == NULL; +} + +/* + * Check whether the funnel queue is idle or not. If the queue has entries + * available to be retrieved, it is not idle. If the queue is in a transition + * state with one or more entries being added such that the list view is + * incomplete, it may not be possible to retrieve an entry with the + * funnel_queue_poll() function, but the queue will not be considered idle. + */ +bool is_funnel_queue_idle(struct funnel_queue *queue) +{ + /* + * Oldest is not the stub, so there's another entry, though if next is + * NULL we can't retrieve it yet. + */ + if (queue->oldest != &queue->stub) { + return false; + } + + /* + * Oldest is the stub, but newest has been updated by _put(); either + * there's another, retrievable entry in the list, or the list is + * officially empty but in the intermediate state of having an entry + * added. + * + * Whether anything is retrievable depends on whether stub.next has + * been updated and become visible to us, but for idleness we don't + * care. And due to memory ordering in _put(), the update to newest + * would be visible to us at the same time or sooner. + */ + if (queue->newest != &queue->stub) { + return false; + } + + return true; +} diff --git a/vdo/funnel-queue.h b/vdo/funnel-queue.h new file mode 100644 index 00000000..005825e9 --- /dev/null +++ b/vdo/funnel-queue.h @@ -0,0 +1,100 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright Red Hat + */ + +#ifndef FUNNEL_QUEUE_H +#define FUNNEL_QUEUE_H + +#include + +#include "compiler.h" +#include "cpu.h" +#include "type-defs.h" + +/* This queue link structure must be embedded in client entries. */ +struct funnel_queue_entry { + /* The next (newer) entry in the queue. */ + struct funnel_queue_entry *volatile next; +}; + +/* + * The dynamically allocated queue structure, which is allocated on a cache + * line boundary so the producer and consumer fields in the structure will land + * on separate cache lines. This should be considered opaque but it is exposed + * here so funnel_queue_put() can be inlined. + */ +struct __attribute__((aligned(CACHE_LINE_BYTES))) funnel_queue { + /* + * The producers' end of the queue, an atomically exchanged pointer + * that will never be NULL. + */ + struct funnel_queue_entry *volatile newest; + + /* + * The consumer's end of the queue, which is owned by the consumer and + * never NULL. + */ + struct funnel_queue_entry *oldest + __attribute__((aligned(CACHE_LINE_BYTES))); + + /* A dummy entry used to provide the non-NULL invariants above. */ + struct funnel_queue_entry stub; +}; + +int __must_check make_funnel_queue(struct funnel_queue **queue_ptr); + +void free_funnel_queue(struct funnel_queue *queue); + +/* + * Put an entry on the end of the queue. + * + * The entry pointer must be to the struct funnel_queue_entry embedded in the + * caller's data structure. The caller must be able to derive the address of + * the start of their data structure from the pointer that is passed in here, so + * every entry in the queue must have the struct funnel_queue_entry at the same + * offset within the client's structure.
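(Editorial illustration, not part of the patch: a minimal sketch of how a caller might use this interface, assuming the kernel's container_of() macro; example_item, example_submit, and example_drain are hypothetical names. It shows the embedding requirement just described: every record keeps its funnel_queue_entry at the same offset, and the consumer recovers its own structure from the entry returned by funnel_queue_poll().)

struct example_item {
	struct funnel_queue_entry entry;	/* link owned by the queue */
	int payload;
};

/* Producer side: safe to call from many threads concurrently. */
static void example_submit(struct funnel_queue *queue,
			   struct example_item *item)
{
	funnel_queue_put(queue, &item->entry);
}

/* Consumer side: must only ever be called from a single thread. */
static void example_drain(struct funnel_queue *queue)
{
	struct funnel_queue_entry *entry;

	while ((entry = funnel_queue_poll(queue)) != NULL) {
		struct example_item *item =
			container_of(entry, struct example_item, entry);

		/* ... use item->payload; the item may be freed right away ... */
	}
}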
+ */ +static INLINE void funnel_queue_put(struct funnel_queue *queue, + struct funnel_queue_entry *entry) +{ + struct funnel_queue_entry *previous; + + /* + * Barrier requirements: All stores relating to the entry ("next" + * pointer, containing data structure fields) must happen before the + * previous->next store making it visible to the consumer. Also, the + * entry's "next" field initialization to NULL must happen before any + * other producer threads can see the entry (the xchg) and try to + * update the "next" field. + * + * xchg implements a full barrier. + */ + entry->next = NULL; + /* + * The xchg macro in the PPC kernel calls a function that takes a void* + * argument, triggering a warning about dropping the volatile + * qualifier. + */ +#pragma GCC diagnostic push +#if __GNUC__ >= 5 +#pragma GCC diagnostic ignored "-Wdiscarded-qualifiers" +#endif + previous = xchg(&queue->newest, entry); +#pragma GCC diagnostic pop + /* + * Preemptions between these two statements hide the rest of the queue + * from the consumer, preventing consumption until the following + * assignment runs. + */ + previous->next = entry; +} + +struct funnel_queue_entry *__must_check +funnel_queue_poll(struct funnel_queue *queue); + +bool __must_check is_funnel_queue_empty(struct funnel_queue *queue); + +bool __must_check is_funnel_queue_idle(struct funnel_queue *queue); + +#endif /* FUNNEL_QUEUE_H */ diff --git a/uds/geometry.c b/vdo/geometry.c similarity index 52% rename from uds/geometry.c rename to vdo/geometry.c index e38d1b19..06713a18 100644 --- a/uds/geometry.c +++ b/vdo/geometry.c @@ -1,58 +1,77 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * Copyright Red Hat - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/uds-releases/krusty/src/uds/geometry.c#17 $ */ #include "geometry.h" -#include "deltaIndex.h" +#include "delta-index.h" #include "errors.h" -#include "hashUtils.h" +#include "hash-utils.h" #include "logger.h" -#include "memoryAlloc.h" +#include "memory-alloc.h" #include "permassert.h" #include "uds.h" -/**********************************************************************/ -static int initialize_geometry(struct geometry *geometry, - size_t bytes_per_page, - unsigned int record_pages_per_chapter, - unsigned int chapters_per_volume, - unsigned int sparse_chapters_per_volume, - uint64_t remapped_virtual, - uint64_t remapped_physical) +/* + * The geometry records parameters that define the layout of a UDS index + * volume, and the size and shape of various index structures. +* + * An index volume is divided into a fixed number of fixed-size chapters, each + * consisting of a fixed number of fixed-size pages. The volume layout is + * defined by two constants and four parameters. 
The constants are that index + * records are 32 bytes long (16-byte block name plus 16-byte metadata) and + * that open chapter index hash slots are one byte long. The four parameters + * are the number of bytes in a page, the number of record pages in a chapter, + * the number of chapters in a volume, and the number of chapters that are + * sparse. From these parameters, we can derive the rest of the layout and + * other index properties. + * + * The index volume is sized by its maximum memory footprint. For a dense + * index, the persistent storage is about 10 times the size of the memory + * footprint. For a sparse index, the persistent storage is about 100 times + * the size of the memory footprint. + * + * For a small index with a memory footprint less than 1GB, there are three + * possible memory configurations: 0.25GB, 0.5GB and 0.75GB. The default + * geometry for each is 1024 index records per 32 KB page, 1024 chapters per + * volume, and either 64, 128, or 192 record pages per chapter (resulting in 6, + * 13, or 20 index pages per chapter) depending on the memory configuration. + * For the VDO default of a 0.25 GB index, this yields a deduplication window + * of 256 GB using about 2.5 GB for the persistent storage and 256 MB of RAM. + * + * For a larger index with a memory footprint that is a multiple of 1 GB, the + * geometry is 1024 index records per 32 KB page, 256 record pages per chapter, + * 26 index pages per chapter, and 1024 chapters for every GB of memory + * footprint. For a 1 GB volume, this yields a deduplication window of 1 TB + * using about 9GB of persistent storage and 1 GB of RAM. + * + * The above numbers hold for volumes which have no sparse chapters. A sparse + * volume has 10 times as many chapters as the corresponding non-sparse volume, + * which provides 10 times the deduplication window while using 10 times as + * much persistent storage as the equivalent non-sparse volume with the same + * memory footprint. + * + * If the volume has been converted from a non-lvm format to an lvm volume, the + * number of chapters per volume will have been reduced by one by eliminating + * physical chapter 0, and the virtual chapter that formerly mapped to physical + * chapter 0 may be remapped to another physical chapter. This remapping is + * expressed by storing which virtual chapter was remapped, and which physical + * chapter it was moved to. + **/ + +int make_geometry(size_t bytes_per_page, + unsigned int record_pages_per_chapter, + unsigned int chapters_per_volume, + unsigned int sparse_chapters_per_volume, + uint64_t remapped_virtual, + uint64_t remapped_physical, + struct geometry **geometry_ptr) { - int result = - ASSERT_WITH_ERROR_CODE(bytes_per_page >= BYTES_PER_RECORD, - UDS_BAD_STATE, - "page is smaller than a record: %zu", - bytes_per_page); - if (result != UDS_SUCCESS) { - return result; - } + int result; + struct geometry *geometry; - result = ASSERT_WITH_ERROR_CODE(chapters_per_volume > - sparse_chapters_per_volume, - UDS_INVALID_ARGUMENT, - "sparse chapters per volume (%u) must be less than chapters per volume (%u)", - sparse_chapters_per_volume, - chapters_per_volume); + result = UDS_ALLOCATE(1, struct geometry, "geometry", &geometry); if (result != UDS_SUCCESS) { return result; } @@ -66,33 +85,31 @@ static int initialize_geometry(struct geometry *geometry, geometry->remapped_virtual = remapped_virtual; geometry->remapped_physical = remapped_physical; - // Calculate the number of records in a page, chapter, and volume. 
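(Editorial illustration, not part of the patch: a worked check of the default small-index figures described above, using the default constants from geometry.h and VDO's 4096-byte data blocks.)

	records per page      = 32768 bytes per page / 32 bytes per record = 1024
	records per chapter   = 1024 records per page x 64 record pages    = 65536
	records per volume    = 65536 x 1024 chapters                      = 2^26
	deduplication window  = 2^26 records x 4 KB per data block         = 256 GB
	chapter pages on disk = 1024 chapters x (64 + 6) pages x 32 KB     = about 2.2 GB

The last figure is consistent with the "about 2.5 GB" of persistent storage cited above.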
geometry->records_per_page = bytes_per_page / BYTES_PER_RECORD; geometry->records_per_chapter = geometry->records_per_page * record_pages_per_chapter; geometry->records_per_volume = - (unsigned long) geometry->records_per_chapter * - chapters_per_volume; + ((unsigned long) geometry->records_per_chapter * + chapters_per_volume); geometry->open_chapter_load_ratio = DEFAULT_OPEN_CHAPTER_LOAD_RATIO; - // Initialize values for delta chapter indexes. geometry->chapter_mean_delta = 1 << DEFAULT_CHAPTER_MEAN_DELTA_BITS; geometry->chapter_payload_bits = compute_bits(record_pages_per_chapter - 1); - // We want 1 delta list for every 64 records in the chapter. - // The "| 077" ensures that the chapter_delta_list_bits computation - // does not underflow. + /* + * We want 1 delta list for every 64 records in the chapter. + * The "| 077" ensures that the chapter_delta_list_bits computation + * does not underflow. + */ geometry->chapter_delta_list_bits = compute_bits((geometry->records_per_chapter - 1) | 077) - 6; geometry->delta_lists_per_chapter = 1 << geometry->chapter_delta_list_bits; - // We need enough address bits to achieve the desired mean delta. + /* We need enough address bits to achieve the desired mean delta. */ geometry->chapter_address_bits = (DEFAULT_CHAPTER_MEAN_DELTA_BITS - geometry->chapter_delta_list_bits + compute_bits(geometry->records_per_chapter - 1)); - // Let the delta index code determine how many pages are needed for the - // index geometry->index_pages_per_chapter = get_delta_index_page_count(geometry->records_per_chapter, geometry->delta_lists_per_chapter, @@ -100,8 +117,6 @@ static int initialize_geometry(struct geometry *geometry, geometry->chapter_payload_bits, bytes_per_page); - // Now that we have the size of a chapter index, we can calculate the - // space used by chapters and volumes. 
geometry->pages_per_chapter = geometry->index_pages_per_chapter + record_pages_per_chapter; geometry->pages_per_volume = @@ -110,43 +125,11 @@ static int initialize_geometry(struct geometry *geometry, geometry->bytes_per_volume = bytes_per_page * (geometry->pages_per_volume + geometry->header_pages_per_volume); - geometry->bytes_per_chapter = - bytes_per_page * geometry->pages_per_chapter; - - return UDS_SUCCESS; -} - -/**********************************************************************/ -int make_geometry(size_t bytes_per_page, - unsigned int record_pages_per_chapter, - unsigned int chapters_per_volume, - unsigned int sparse_chapters_per_volume, - uint64_t remapped_virtual, - uint64_t remapped_physical, - struct geometry **geometry_ptr) -{ - struct geometry *geometry; - int result = UDS_ALLOCATE(1, struct geometry, "geometry", &geometry); - if (result != UDS_SUCCESS) { - return result; - } - result = initialize_geometry(geometry, - bytes_per_page, - record_pages_per_chapter, - chapters_per_volume, - sparse_chapters_per_volume, - remapped_virtual, - remapped_physical); - if (result != UDS_SUCCESS) { - free_geometry(geometry); - return result; - } *geometry_ptr = geometry; return UDS_SUCCESS; } -/**********************************************************************/ int copy_geometry(struct geometry *source, struct geometry **geometry_ptr) { return make_geometry(source->bytes_per_page, @@ -158,28 +141,27 @@ int copy_geometry(struct geometry *source, struct geometry **geometry_ptr) geometry_ptr); } -/**********************************************************************/ void free_geometry(struct geometry *geometry) { UDS_FREE(geometry); } -/**********************************************************************/ unsigned int __must_check map_to_physical_chapter(const struct geometry *geometry, uint64_t virtual_chapter) { uint64_t delta; + if (!is_reduced_geometry(geometry)) { - return (virtual_chapter % geometry->chapters_per_volume); + return virtual_chapter % geometry->chapters_per_volume; } if (likely(virtual_chapter > geometry->remapped_virtual)) { delta = virtual_chapter - geometry->remapped_virtual; if (likely(delta > geometry->remapped_physical)) { - return (delta % geometry->chapters_per_volume); + return delta % geometry->chapters_per_volume; } else { - return (delta - 1); + return delta - 1; } } @@ -189,24 +171,23 @@ map_to_physical_chapter(const struct geometry *geometry, delta = geometry->remapped_virtual - virtual_chapter; if (delta < geometry->chapters_per_volume) { - return (geometry->chapters_per_volume - delta); + return geometry->chapters_per_volume - delta; } - // This chapter is so old the answer doesn't matter. + /* This chapter is so old the answer doesn't matter. */ return 0; } -/**********************************************************************/ +/* Check whether any sparse chapters are in use. 
*/ bool has_sparse_chapters(const struct geometry *geometry, uint64_t oldest_virtual_chapter, uint64_t newest_virtual_chapter) { - return (is_sparse(geometry) && + return (is_sparse_geometry(geometry) && ((newest_virtual_chapter - oldest_virtual_chapter + 1) > geometry->dense_chapters_per_volume)); } -/**********************************************************************/ bool is_chapter_sparse(const struct geometry *geometry, uint64_t oldest_virtual_chapter, uint64_t newest_virtual_chapter, @@ -220,34 +201,38 @@ bool is_chapter_sparse(const struct geometry *geometry, newest_virtual_chapter)); } -/**********************************************************************/ +/* Calculate how many chapters to expire after opening the newest chapter. */ unsigned int chapters_to_expire(const struct geometry *geometry, uint64_t newest_chapter) { - // If the index isn't full yet, don't expire anything. + /* If the index isn't full yet, don't expire anything. */ if (newest_chapter < geometry->chapters_per_volume) { return 0; } - // If a chapter is out of order... + /* If a chapter is out of order... */ if (geometry->remapped_physical > 0) { uint64_t oldest_chapter = newest_chapter - geometry->chapters_per_volume; - // ... expire an extra chapter when expiring the moved chapter - // to free physical space for the new chapter ... + /* + * ... expire an extra chapter when expiring the moved chapter + * to free physical space for the new chapter ... + */ if (oldest_chapter == geometry->remapped_virtual) { return 2; } - // ... but don't expire anything when the new chapter will use - // the physical chapter freed by expiring the moved chapter. + /* + * ... but don't expire anything when the new chapter will use + * the physical chapter freed by expiring the moved chapter. + */ if (oldest_chapter == (geometry->remapped_virtual + geometry->remapped_physical)) { return 0; } } - // Normally, just expire one. + /* Normally, just expire one. */ return 1; } diff --git a/vdo/geometry.h b/vdo/geometry.h new file mode 100644 index 00000000..54d2d1c9 --- /dev/null +++ b/vdo/geometry.h @@ -0,0 +1,143 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright Red Hat + */ + +#ifndef GEOMETRY_H +#define GEOMETRY_H 1 + +#include "compiler.h" +#include "type-defs.h" +#include "uds.h" + +struct geometry { + /* Size of a chapter page, in bytes */ + size_t bytes_per_page; + /* Number of record pages in a chapter */ + unsigned int record_pages_per_chapter; + /* Total number of chapters in a volume */ + unsigned int chapters_per_volume; + /* Number of sparsely-indexed chapters in a volume */ + unsigned int sparse_chapters_per_volume; + /* Number of bits used to determine delta list numbers */ + unsigned int chapter_delta_list_bits; + /* Virtual chapter remapped from physical chapter 0 */ + uint64_t remapped_virtual; + /* New physical chapter where the remapped chapter can be found */ + uint64_t remapped_physical; + + /* + * The following properties are derived from the ones above, but they + * are computed and recorded as fields for convenience. 
+ */ + /* Total number of pages in a volume, excluding the header */ + unsigned int pages_per_volume; + /* Total number of header pages per volume */ + unsigned int header_pages_per_volume; + /* Total number of bytes in a volume, including the header */ + size_t bytes_per_volume; + /* Number of pages in a chapter */ + unsigned int pages_per_chapter; + /* Number of index pages in a chapter index */ + unsigned int index_pages_per_chapter; + /* The minimum ratio of hash slots to records in an open chapter */ + unsigned int open_chapter_load_ratio; + /* Number of records that fit on a page */ + unsigned int records_per_page; + /* Number of records that fit in a chapter */ + unsigned int records_per_chapter; + /* Number of records that fit in a volume */ + uint64_t records_per_volume; + /* Number of delta lists per chapter index */ + unsigned int delta_lists_per_chapter; + /* Mean delta for chapter indexes */ + unsigned int chapter_mean_delta; + /* Number of bits needed for record page numbers */ + unsigned int chapter_payload_bits; + /* Number of bits used to compute addresses for chapter delta lists */ + unsigned int chapter_address_bits; + /* Number of densely-indexed chapters in a volume */ + unsigned int dense_chapters_per_volume; +}; + +enum { + /* The number of bytes in a record (name + metadata) */ + BYTES_PER_RECORD = (UDS_CHUNK_NAME_SIZE + UDS_METADATA_SIZE), + + /* The default length of a page in a chapter, in bytes */ + DEFAULT_BYTES_PER_PAGE = 1024 * BYTES_PER_RECORD, + + /* The default maximum number of records per page */ + DEFAULT_RECORDS_PER_PAGE = DEFAULT_BYTES_PER_PAGE / BYTES_PER_RECORD, + + /* The default number of record pages in a chapter */ + DEFAULT_RECORD_PAGES_PER_CHAPTER = 256, + + /* The default number of record pages in a chapter for a small index */ + SMALL_RECORD_PAGES_PER_CHAPTER = 64, + + /* The default number of chapters in a volume */ + DEFAULT_CHAPTERS_PER_VOLUME = 1024, + + /* The default number of sparsely-indexed chapters in a volume */ + DEFAULT_SPARSE_CHAPTERS_PER_VOLUME = 0, + + /* The log2 of the default mean delta */ + DEFAULT_CHAPTER_MEAN_DELTA_BITS = 16, + + /* The log2 of the number of delta lists in a large chapter */ + DEFAULT_CHAPTER_DELTA_LIST_BITS = 12, + + /* The log2 of the number of delta lists in a small chapter */ + SMALL_CHAPTER_DELTA_LIST_BITS = 10, + + /* The default minimum ratio of slots to records in an open chapter */ + DEFAULT_OPEN_CHAPTER_LOAD_RATIO = 2, +}; + +int __must_check make_geometry(size_t bytes_per_page, + unsigned int record_pages_per_chapter, + unsigned int chapters_per_volume, + unsigned int sparse_chapters_per_volume, + uint64_t remapped_virtual, + uint64_t remapped_physical, + struct geometry **geometry_ptr); + +int __must_check copy_geometry(struct geometry *source, + struct geometry **geometry_ptr); + +void free_geometry(struct geometry *geometry); + +unsigned int __must_check +map_to_physical_chapter(const struct geometry *geometry, + uint64_t virtual_chapter); + +/* + * Check whether this geometry is reduced by a chapter. This will only be true + * if the volume was converted from a non-lvm volume to an lvm volume. 
+ */ +static INLINE bool __must_check +is_reduced_geometry(const struct geometry *geometry) +{ + return !!(geometry->chapters_per_volume & 1); +} + +static INLINE bool __must_check +is_sparse_geometry(const struct geometry *geometry) +{ + return (geometry->sparse_chapters_per_volume > 0); +} + +bool __must_check has_sparse_chapters(const struct geometry *geometry, + uint64_t oldest_virtual_chapter, + uint64_t newest_virtual_chapter); + +bool __must_check is_chapter_sparse(const struct geometry *geometry, + uint64_t oldest_virtual_chapter, + uint64_t newest_virtual_chapter, + uint64_t virtual_chapter_number); + +unsigned int __must_check chapters_to_expire(const struct geometry *geometry, + uint64_t newest_chapter); + +#endif /* GEOMETRY_H */ diff --git a/vdo/hashLock.c b/vdo/hash-lock.c similarity index 54% rename from vdo/hashLock.c rename to vdo/hash-lock.c index fd53d8aa..ac9e7a39 100644 --- a/vdo/hashLock.c +++ b/vdo/hash-lock.c @@ -1,28 +1,14 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * Copyright Red Hat - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/sulfur/src/c++/vdo/base/hashLock.c#39 $ */ /** + * DOC: + * * A hash_lock controls and coordinates writing, index access, and dedupe among * groups of data_vios concurrently writing identical blocks, allowing them to - * deduplicate not only against advice but also against each other. This save + * deduplicate not only against advice but also against each other. This saves * on index queries and allows those data_vios to concurrently deduplicate * against a single block instead of being serialized through a PBN read lock. * Only one index query is needed for each hash_lock, instead of one for every @@ -101,28 +87,29 @@ * transitioned to after LOCKING. **/ -#include "hashLock.h" -#include "hashLockInternals.h" +#include "hash-lock.h" #include #include "logger.h" -#include "memoryAlloc.h" +#include "memory-alloc.h" #include "permassert.h" -#include "compressionState.h" +#include "compression-state.h" #include "constants.h" -#include "dataVIO.h" -#include "hashZone.h" +#include "data-vio.h" +#include "dedupe-index.h" +#include "hash-zone.h" +#include "io-submitter.h" #include "packer.h" -#include "pbnLock.h" -#include "physicalZone.h" +#include "pbn-lock.h" +#include "physical-zone.h" #include "slab.h" -#include "slabDepot.h" +#include "slab-depot.h" #include "types.h" -#include "vdoInternal.h" -#include "vioWrite.h" -#include "waitQueue.h" +#include "vdo.h" +#include "vio-write.h" +#include "wait-queue.h" static const char *LOCK_STATE_NAMES[] = { [VDO_HASH_LOCK_BYPASSING] = "BYPASSING", @@ -137,16 +124,25 @@ static const char *LOCK_STATE_NAMES[] = { [VDO_HASH_LOCK_WRITING] = "WRITING", }; -// There are loops in the state diagram, so some forward decl's are needed. 
-static void start_deduping(struct hash_lock *lock, struct data_vio *agent, +/* There are loops in the state diagram, so some forward decl's are needed. */ +static void start_deduping(struct hash_lock *lock, + struct data_vio *agent, bool agent_is_done); static void start_locking(struct hash_lock *lock, struct data_vio *agent); static void start_writing(struct hash_lock *lock, struct data_vio *agent); static void unlock_duplicate_pbn(struct vdo_completion *completion); static void transfer_allocation_lock(struct data_vio *data_vio); -/**********************************************************************/ -struct pbn_lock *get_vdo_duplicate_lock(struct data_vio *data_vio) +/** + * vdo_get_duplicate_lock() - Get the PBN lock on the duplicate data + * location for a data_vio from the + * hash_lock the data_vio holds (if there + * is one). + * @data_vio: The data_vio to query. + * + * Return: The PBN lock on the data_vio's duplicate location. + **/ +struct pbn_lock *vdo_get_duplicate_lock(struct data_vio *data_vio) { if (data_vio->hash_lock == NULL) { return NULL; @@ -154,68 +150,74 @@ struct pbn_lock *get_vdo_duplicate_lock(struct data_vio *data_vio) return data_vio->hash_lock->duplicate_lock; } -/**********************************************************************/ -const char *get_vdo_hash_lock_state_name(enum hash_lock_state state) +/** + * vdo_get_hash_lock_state_name() - Get the string representation of a + * hash lock state. + * @state: The hash lock state. + * + * Return: The short string representing the state + **/ +const char *vdo_get_hash_lock_state_name(enum hash_lock_state state) { - // Catch if a state has been added without updating the name array. - STATIC_ASSERT((VDO_HASH_LOCK_DESTROYING + 1) == COUNT_OF(LOCK_STATE_NAMES)); - return (state < COUNT_OF(LOCK_STATE_NAMES)) ? LOCK_STATE_NAMES[state] - : NULL; + /* Catch if a state has been added without updating the name array. */ + STATIC_ASSERT((VDO_HASH_LOCK_DESTROYING + 1) + == ARRAY_SIZE(LOCK_STATE_NAMES)); + return (state < ARRAY_SIZE(LOCK_STATE_NAMES)) ? LOCK_STATE_NAMES[state] + : NULL; } /** - * Set the current state of a hash lock. - * - * @param lock The lock to update - * @param new_state The new state - **/ + * set_hash_lock_state() - Set the current state of a hash lock. + * @lock: The lock to update. + * @new_state: The new state. + */ static void set_hash_lock_state(struct hash_lock *lock, enum hash_lock_state new_state) { if (false) { uds_log_warning("XXX %px %s -> %s", (void *) lock, - get_vdo_hash_lock_state_name(lock->state), - get_vdo_hash_lock_state_name(new_state)); + vdo_get_hash_lock_state_name(lock->state), + vdo_get_hash_lock_state_name(new_state)); } lock->state = new_state; } /** - * Assert that a data_vio is the agent of its hash lock, and that this is being - * called in the hash zone. - * - * @param data_vio The data_vio expected to be the lock agent - * @param where A string describing the function making the assertion - **/ -static void assert_hash_lock_agent(struct data_vio *data_vio, const char *where) + * assert_hash_lock_agent() - Assert that a data_vio is the agent of + * its hash lock, and that this is being + * called in the hash zone. + * @data_vio: The data_vio expected to be the lock agent. + * @where: A string describing the function making the assertion. + */ +static void assert_hash_lock_agent(struct data_vio *data_vio, + const char *where) { - // Not safe to access the agent field except from the hash zone. + /* Not safe to access the agent field except from the hash zone. 
*/ assert_data_vio_in_hash_zone(data_vio); ASSERT_LOG_ONLY(data_vio == data_vio->hash_lock->agent, "%s must be for the hash lock agent", where); } /** - * Set or clear the lock agent. - * - * @param lock The hash lock to update - * @param new_agent The new lock agent (may be NULL to clear the agent) - **/ + * set_agent() - Set or clear the lock agent. + * @lock: The hash lock to update. + * @new_agent: The new lock agent (may be NULL to clear the agent). + */ static void set_agent(struct hash_lock *lock, struct data_vio *new_agent) { lock->agent = new_agent; } /** - * Set the duplicate lock held by a hash lock. May only be called in the - * physical zone of the PBN lock. - * - * @param hash_lock The hash lock to update - * @param pbn_lock The PBN read lock to use as the duplicate lock - **/ + * set_duplicate_lock() - Set the duplicate lock held by a hash lock. + * May only be called in the physical zone of + * the PBN lock. + * @hash_lock: The hash lock to update. + * @pbn_lock: The PBN read lock to use as the duplicate lock. + */ static void set_duplicate_lock(struct hash_lock *hash_lock, - struct pbn_lock *pbn_lock) + struct pbn_lock *pbn_lock) { ASSERT_LOG_ONLY((hash_lock->duplicate_lock == NULL), "hash lock must not already hold a duplicate lock"); @@ -225,39 +227,41 @@ static void set_duplicate_lock(struct hash_lock *hash_lock, } /** - * Convert a pointer to the hash_lock_entry field in a data_vio to the - * enclosing data_vio. + * data_vio_from_lock_entry() - Convert a pointer to the + * hash_lock_entry field in a data_vio to + * the enclosing data_vio. + * @entry: The list entry to convert. * - * @param entry The list entry to convert - * - * @return A pointer to the data_vio containing the list entry - **/ -static inline struct data_vio *data_vio_from_lock_entry(struct list_head *entry) + * Return: A pointer to the data_vio containing the list entry. + */ +static inline struct data_vio * +data_vio_from_lock_entry(struct list_head *entry) { return list_entry(entry, struct data_vio, hash_lock_entry); } /** - * Remove the first data_vio from the lock's wait queue and return it. - * - * @param lock The lock containing the wait queue + * dequeue_lock_waiter() - Remove the first data_vio from the lock's + * wait queue and return it. + * @lock: The lock containing the wait queue. * - * @return The first (oldest) waiter in the queue, or NULL if - * the queue is empty - **/ + * Return: The first (oldest) waiter in the queue, or NULL if + * the queue is empty. + */ static inline struct data_vio *dequeue_lock_waiter(struct hash_lock *lock) { return waiter_as_data_vio(dequeue_next_waiter(&lock->waiters)); } /** - * Continue processing a data_vio that has been waiting for an event, setting - * the result from the event, and continuing in a specified callback function. - * - * @param data_vio The data_vio to continue - * @param result The current result (will not mask older errors) - * @param callback The function in which to continue processing - **/ + * continue_data_vio_in() - Continue processing a data_vio that has + * been waiting for an event, setting the + * result from the event, and continuing in a + * specified callback function. + * @data_vio: The data_vio to continue. + * @result: The current result (will not mask older errors). + * @callback: The function in which to continue processing. 
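Illustrative aside, not part of the patch: the note that the result "will not mask older errors" refers to the usual VDO completion convention that the first error recorded on a completion sticks, and later results do not overwrite it. A standalone sketch of that convention with simplified, hypothetical types (the real code operates on struct vdo_completion)::

    #include <stdio.h>

    #define VDO_SUCCESS 0

    /* Hypothetical, simplified stand-ins for the real completion types. */
    struct completion {
        int result;
        void (*callback)(struct completion *completion);
    };

    /* Record a result without overwriting an earlier error. */
    static void set_result(struct completion *completion, int result)
    {
        if (completion->result == VDO_SUCCESS) {
            completion->result = result;
        }
    }

    static void continue_in(struct completion *completion, int result,
                            void (*callback)(struct completion *completion))
    {
        set_result(completion, result);
        completion->callback = callback;
        completion->callback(completion);
    }

    static void report(struct completion *completion)
    {
        printf("continuing with result %d\n", completion->result);
    }

    int main(void)
    {
        struct completion c = { .result = VDO_SUCCESS };

        continue_in(&c, -5, report); /* the first error is latched */
        continue_in(&c, -9, report); /* and is not masked by a later one */
        return 0;
    }

Running the sketch prints -5 twice: the later error never replaces the first one.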
+ */ static void continue_data_vio_in(struct data_vio *data_vio, int result, vdo_action *callback) @@ -267,15 +271,18 @@ static void continue_data_vio_in(struct data_vio *data_vio, } /** - * Set, change, or clear the hash lock a data_vio is using. Updates the hash - * lock (or locks) to reflect the change in membership. + * set_hash_lock() - Set, change, or clear the hash lock a data_vio is + * using. + * @data_vio: The data_vio to update. + * @new_lock: The hash lock the data_vio is joining. * - * @param data_vio The data_vio to update - * @param new_lock The hash lock the data_vio is joining - **/ -static void set_hash_lock(struct data_vio *data_vio, struct hash_lock *new_lock) + * Updates the hash lock (or locks) to reflect the change in membership. + */ +static void set_hash_lock(struct data_vio *data_vio, + struct hash_lock *new_lock) { struct hash_lock *old_lock = data_vio->hash_lock; + if (old_lock != NULL) { ASSERT_LOG_ONLY( data_vio->hash_zone != NULL, @@ -288,11 +295,14 @@ static void set_hash_lock(struct data_vio *data_vio, struct hash_lock *new_lock) if ((old_lock->state != VDO_HASH_LOCK_BYPASSING) && (old_lock->state != VDO_HASH_LOCK_UNLOCKING)) { - // If the reference count goes to zero in a non-terminal - // state, we're most likely leaking this lock. - ASSERT_LOG_ONLY(old_lock->reference_count > 1, - "hash locks should only become unreferenced in a terminal state, not state %s", - get_vdo_hash_lock_state_name(old_lock->state)); + /* + * If the reference count goes to zero in a non- + * terminal state, we're most likely leaking this lock. + */ + ASSERT_LOG_ONLY( + old_lock->reference_count > 1, + "hash locks should only become unreferenced in a terminal state, not state %s", + vdo_get_hash_lock_state_name(old_lock->state)); } list_del_init(&data_vio->hash_lock_entry); @@ -302,15 +312,19 @@ static void set_hash_lock(struct data_vio *data_vio, struct hash_lock *new_lock) } if (new_lock != NULL) { - // Keep all data_vios sharing the lock on a ring since they can - // complete in any order and we'll always need a pointer to one - // to compare data. + /* + * Keep all data_vios sharing the lock on a ring since they can + * complete in any order and we'll always need a pointer to one + * to compare data. + */ list_move_tail(&data_vio->hash_lock_entry, &new_lock->duplicate_ring); new_lock->reference_count += 1; - // XXX Not needed for VDOSTORY-190, but useful for checking - // whether a test is getting concurrent dedupe, and how much. + /* + * XXX Not needed for VDOSTORY-190, but useful for checking + * whether a test is getting concurrent dedupe, and how much. + */ if (new_lock->max_references < new_lock->reference_count) { new_lock->max_references = new_lock->reference_count; } @@ -320,35 +334,36 @@ static void set_hash_lock(struct data_vio *data_vio, struct hash_lock *new_lock) } /** - * Bottleneck for data_vios that have written or deduplicated and that are no - * longer needed to be an agent for the hash lock. - * - * @param data_vio The data_vio to complete and send to be cleaned up - **/ + * exit_hash_lock() - Bottleneck for data_vios that have written or + * deduplicated and that are no longer needed to be an + * agent for the hash lock. + * @data_vio: The data_vio to complete and send to be cleaned up. + */ static void exit_hash_lock(struct data_vio *data_vio) { - // XXX trace record? - - // Release the hash lock now, saving a thread transition in cleanup. - release_vdo_hash_lock(data_vio); + /* Release the hash lock now, saving a thread transition in cleanup. 
*/ + vdo_release_hash_lock(data_vio); - // Complete the data_vio and start the clean-up path in vioWrite to - // release any locks it still holds. + /* + * Complete the data_vio and start the clean-up path in vioWrite to + * release any locks it still holds. + */ finish_data_vio(data_vio, VDO_SUCCESS); } /** - * Retire the active lock agent, replacing it with the first lock waiter, and - * make the retired agent exit the hash lock. - * - * @param lock The hash lock to update + * retire_lock_agent() - Retire the active lock agent, replacing it with the + * first lock waiter, and make the retired agent exit + * the hash lock. + * @lock: The hash lock to update. * - * @return The new lock agent (which will be NULL if there was no waiter) - **/ + * Return: The new lock agent (which will be NULL if there was no waiter) + */ static struct data_vio *retire_lock_agent(struct hash_lock *lock) { struct data_vio *old_agent = lock->agent; struct data_vio *new_agent = dequeue_lock_waiter(lock); + set_agent(lock, new_agent); exit_hash_lock(old_agent); if (new_agent != NULL) { @@ -358,80 +373,88 @@ static struct data_vio *retire_lock_agent(struct hash_lock *lock) } /** - * Callback to call vio_compress_data(), putting a data_vio back on the write - * path. - * - * @param completion The data_vio - **/ + * compress_data_callback() - Callback to call launch_compress_data_vio(), + * putting a data_vio back on the write path. + * @completion: The data_vio. + */ static void compress_data_callback(struct vdo_completion *completion) { - // XXX VDOSTORY-190 need an error check since vio_compress_data doesn't - // have one. - vio_compress_data(as_data_vio(completion)); + /* + * XXX VDOSTORY-190 need an error check since launch_compress_data_vio + * doesn't have one. + */ + launch_compress_data_vio(as_data_vio(completion)); } /** - * Add a data_vio to the lock's queue of waiters. - * - * @param lock The hash lock on which to wait - * @param data_vio The data_vio to add to the queue - **/ -static void wait_on_hash_lock(struct hash_lock *lock, struct data_vio *data_vio) + * wait_on_hash_lock() - Add a data_vio to the lock's queue of waiters. + * @lock: The hash lock on which to wait. + * @data_vio: The data_vio to add to the queue. + */ +static void wait_on_hash_lock(struct hash_lock *lock, + struct data_vio *data_vio) { - int result = - enqueue_data_vio(&lock->waiters, data_vio); + int result = enqueue_data_vio(&lock->waiters, data_vio); + if (result != VDO_SUCCESS) { - // This should be impossible, but if it somehow happens, give up - // on trying to dedupe the data. + /* + * This should be impossible, but if it somehow happens, give + * up on trying to dedupe the data. + */ set_hash_lock(data_vio, NULL); continue_data_vio_in(data_vio, result, compress_data_callback); return; } - // Make sure the agent doesn't block indefinitely in the packer since it - // now has at least one other data_vio waiting on it. + /* + * Make sure the agent doesn't block indefinitely in the packer since + * it now has at least one other data_vio waiting on it. + */ if ((lock->state == VDO_HASH_LOCK_WRITING) && cancel_vio_compression(lock->agent)) { /* - * Even though we're waiting, we also have to send ourselves as - * a one-way message to the packer to ensure the agent continues - * executing. This is safe because cancel_vio_compression() - * guarantees the agent won't continue executing until this - * message arrives in the packer, and because the wait queue - * link isn't used for sending the message. 
+ * Even though we're waiting, we also have to send ourselves + * as a one-way message to the packer to ensure the agent + * continues executing. This is safe because + * cancel_vio_compression() guarantees the agent won't + * continue executing until this message arrives in the + * packer, and because the wait queue link isn't used for + * sending the message. */ data_vio->compression.lock_holder = lock->agent; launch_data_vio_packer_callback(data_vio, - remove_lock_holder_from_vdo_packer); + vdo_remove_lock_holder_from_packer); } } /** - * waiter_callback function that calls vio_compress_data on the data_vio - * waiter. - * - * @param waiter The data_vio's waiter link - * @param context Not used - **/ + * compress_waiter() - waiter_callback function that calls + * launch_compress_data_vio on the data_vio waiter. + * @waiter: The data_vio's waiter link. + * @context: Not used. + */ static void compress_waiter(struct waiter *waiter, void *context __always_unused) { struct data_vio *data_vio = waiter_as_data_vio(waiter); + data_vio->is_duplicate = false; - vio_compress_data(data_vio); + launch_compress_data_vio(data_vio); } /** - * Handle the result of the agent for the lock releasing a read lock on - * duplicate candidate due to aborting the hash lock. This continuation is - * registered in unlock_duplicate_pbn(). + * finish_bypassing() - Handle the result of the agent for the lock releasing + * a read lock on duplicate candidate due to aborting the + * hash lock. + * @completion: The completion of the acting as the lock's agent. * - * @param completion The completion of the acting as the lock's agent - **/ + * This continuation is registered in unlock_duplicate_pbn(). + */ static void finish_bypassing(struct vdo_completion *completion) { struct data_vio *agent = as_data_vio(completion); struct hash_lock *lock = agent->hash_lock; + assert_hash_lock_agent(agent, __func__); ASSERT_LOG_ONLY(lock->duplicate_lock == NULL, @@ -440,18 +463,19 @@ static void finish_bypassing(struct vdo_completion *completion) } /** - * Stop using the hash lock, resuming the old write path for the lock agent + * start_bypassing() - Stop using the hash lock. + * @lock: The hash lock. + * @agent: The data_vio acting as the agent for the lock. + * + * Stops using the hash lock, resuming the old write path for the lock agent * and any data_vios waiting on it, and put it in a state where data_vios * entering the lock will use the old dedupe path instead of waiting. - * - * @param lock The hash lock - * @param agent The data_vio acting as the agent for the lock - **/ + */ static void start_bypassing(struct hash_lock *lock, struct data_vio *agent) { set_hash_lock_state(lock, VDO_HASH_LOCK_BYPASSING); - // Ensure we don't attempt to update advice when cleaning up. + /* Ensure we don't attempt to update advice when cleaning up. */ lock->update_advice = false; ASSERT_LOG_ONLY(((agent != NULL) || !has_waiters(&lock->waiters)), @@ -460,11 +484,14 @@ static void start_bypassing(struct hash_lock *lock, struct data_vio *agent) if (lock->duplicate_lock != NULL) { if (agent != NULL) { - // The agent must reference the duplicate zone to - // launch it. + /* + * The agent must reference the duplicate zone to + * launch it. 
+ */ agent->duplicate = lock->duplicate; - launch_data_vio_duplicate_zone_callback(agent, - unlock_duplicate_pbn); + launch_data_vio_duplicate_zone_callback( + agent, + unlock_duplicate_pbn); return; } ASSERT_LOG_ONLY( @@ -478,21 +505,24 @@ static void start_bypassing(struct hash_lock *lock, struct data_vio *agent) set_agent(lock, NULL); agent->is_duplicate = false; - vio_compress_data(agent); + launch_compress_data_vio(agent); } /** - * Abort processing on this hash lock when noticing an error. Currently, this - * moves the hash lock to the BYPASSING state, to release all pending - * data_vios. + * abort_hash_lock() - Abort processing on this hash lock when noticing an + * error. + * @lock: The hash_lock. + * @data_vio: The data_vio with the error. * - * @param lock The hash_lock - * @param data_vio The data_vio with the error - **/ + * Currently, this moves the hash lock to the BYPASSING state, to release all + * pending data_vios. + */ static void abort_hash_lock(struct hash_lock *lock, struct data_vio *data_vio) { - // If we've already aborted the lock, don't try to re-abort it; just - // exit. + /* + * If we've already aborted the lock, don't try to re-abort it; just + * exit. + */ if (lock->state == VDO_HASH_LOCK_BYPASSING) { exit_hash_lock(data_vio); return; @@ -500,17 +530,21 @@ static void abort_hash_lock(struct hash_lock *lock, struct data_vio *data_vio) if (data_vio != lock->agent) { if ((lock->agent != NULL) || (lock->reference_count > 1)) { - // Other data_vios are still sharing the lock (which - // should be DEDUPING), so just kick this one out of - // the lock to report its error. + /* + * Other data_vios are still sharing the lock (which + * should be DEDUPING), so just kick this one out of + * the lock to report its error. + */ ASSERT_LOG_ONLY( lock->agent == NULL, "only active agent should call abort_hash_lock"); exit_hash_lock(data_vio); return; } - // Make the lone data_vio the lock agent so it can abort and - // clean up. + /* + * Make the lone data_vio the lock agent so it can abort and + * clean up. + */ set_agent(lock, data_vio); } @@ -518,16 +552,17 @@ static void abort_hash_lock(struct hash_lock *lock, struct data_vio *data_vio) } /** - * Handle the result of the agent for the lock releasing a read lock on - * duplicate candidate. This continuation is registered in - * unlock_duplicate_pbn(). + * finish_unlocking() - Handle the result of the agent for the lock releasing + * a read lock on duplicate candidate. + * @completion: The completion of the data_vio acting as the lock's agent. * - * @param completion The completion of the data_vio acting as the lock's agent - **/ + * This continuation is registered in unlock_duplicate_pbn(). + */ static void finish_unlocking(struct vdo_completion *completion) { struct data_vio *agent = as_data_vio(completion); struct hash_lock *lock = agent->hash_lock; + assert_hash_lock_agent(agent, __func__); ASSERT_LOG_ONLY( @@ -551,8 +586,10 @@ static void finish_unlocking(struct vdo_completion *completion) return; } - // With the lock released, the verified duplicate block may already - // have changed and will need to be re-verified if a waiter arrived. + /* + * With the lock released, the verified duplicate block may already + * have changed and will need to be re-verified if a waiter arrived. + */ lock->verified = false; if (has_waiters(&lock->waiters)) { @@ -561,9 +598,10 @@ static void finish_unlocking(struct vdo_completion *completion) * hash lock while the agent was releasing the PBN lock. 
The * current agent exits and the waiter has to re-lock and * re-verify the duplicate location. + * + * XXX VDOSTORY-190 If we used the current agent to re-acquire + * the PBN lock we wouldn't need to re-verify. */ - // XXX VDOSTORY-190 If we used the current agent to re-acquire - // the PBN lock we wouldn't need to re-verify. agent = retire_lock_agent(lock); start_locking(lock, agent); return; @@ -578,64 +616,60 @@ static void finish_unlocking(struct vdo_completion *completion) } /** - * Release a read lock on the PBN of the block that may or may not have - * contained duplicate data. This continuation is launched by - * start_unlocking(), and calls back to finish_unlocking() on the hash zone - * thread. + * unlock_duplicate_pbn() - Release a read lock on the PBN of the block that + * may or may not have contained duplicate data. + * @completion: The completion of the data_vio acting as the lock's agent. * - * @param completion The completion of the data_vio acting as the lock's agent - **/ + * This continuation is launched by start_unlocking(), and calls back to + * finish_unlocking() on the hash zone thread. + */ static void unlock_duplicate_pbn(struct vdo_completion *completion) { struct data_vio *agent = as_data_vio(completion); struct hash_lock *lock = agent->hash_lock; + assert_data_vio_in_duplicate_zone(agent); ASSERT_LOG_ONLY(lock->duplicate_lock != NULL, "must have a duplicate lock to release"); - release_vdo_physical_zone_pbn_lock(agent->duplicate.zone, + vdo_release_physical_zone_pbn_lock(agent->duplicate.zone, agent->duplicate.pbn, UDS_FORGET(lock->duplicate_lock)); if (lock->state == VDO_HASH_LOCK_BYPASSING) { launch_data_vio_hash_zone_callback(agent, - finish_bypassing); + finish_bypassing); } else { launch_data_vio_hash_zone_callback(agent, finish_unlocking); } } /** - * Release a read lock on the PBN of the block that may or may not have - * contained duplicate data. - * - * @param lock The hash lock - * @param agent The data_vio currently acting as the agent for the lock - **/ + * start_unlocking() - Release a read lock on the PBN of the block that may or + * may not have contained duplicate data. + * @lock: The hash lock. + * @agent: The data_vio currently acting as the agent for the lock. + */ static void start_unlocking(struct hash_lock *lock, struct data_vio *agent) { set_hash_lock_state(lock, VDO_HASH_LOCK_UNLOCKING); - /* - * XXX If we arrange to continue on the duplicate zone thread when - * verification fails, and don't explicitly change lock states (or use - * an agent-local state, or an atomic), we can avoid a thread - * transition here. - */ launch_data_vio_duplicate_zone_callback(agent, unlock_duplicate_pbn); } /** - * Process the result of a UDS update performed by the agent for the lock. - * This continuation is registered in start_querying(). + * finish_updating() - Process the result of a UDS update performed by the + * agent for the lock. + * @completion: The completion of the data_vio that performed the update * - * @param completion The completion of the data_vio that performed the update - **/ + * This continuation is registered in start_querying(). 
+ */ static void finish_updating(struct vdo_completion *completion) { struct data_vio *agent = as_data_vio(completion); struct hash_lock *lock = agent->hash_lock; + assert_hash_lock_agent(agent, __func__); if (completion->result != VDO_SUCCESS) { @@ -643,8 +677,10 @@ static void finish_updating(struct vdo_completion *completion) return; } - // UDS was updated successfully, so don't update again unless the - // duplicate location changes due to rollover. + /* + * UDS was updated successfully, so don't update again unless the + * duplicate location changes due to rollover. + */ lock->update_advice = false; if (has_waiters(&lock->waiters)) { @@ -668,44 +704,59 @@ static void finish_updating(struct vdo_completion *completion) /* * UPDATING -> DESTROYING transition: No one is waiting to * dedupe and there's no lock to release. + * + * XXX startDestroying(lock, agent); */ - // XXX startDestroying(lock, agent); start_bypassing(lock, NULL); exit_hash_lock(agent); } } +static void update_index(struct vdo_completion *completion) +{ + struct data_vio *data_vio = as_data_vio(completion); + + set_data_vio_hash_zone_callback(data_vio, finish_updating); + vdo_query_index(data_vio, UDS_UPDATE); +} + /** - * Continue deduplication with the last step, updating UDS with the location - * of the duplicate that should be returned as advice in the future. - * - * @param lock The hash lock - * @param agent The data_vio currently acting as the agent for the lock - **/ + * start_updating() - Continue deduplication with the last step, updating UDS + * with the location of the duplicate that should be + * returned as advice in the future. + * @lock: The hash lock. + * @agent: The data_vio currently acting as the agent for the lock. + */ static void start_updating(struct hash_lock *lock, struct data_vio *agent) { set_hash_lock_state(lock, VDO_HASH_LOCK_UPDATING); - ASSERT_LOG_ONLY(lock->verified, "new advice should have been verified"); + ASSERT_LOG_ONLY(lock->verified, + "new advice should have been verified"); ASSERT_LOG_ONLY(lock->update_advice, "should only update advice if needed"); agent->last_async_operation = VIO_ASYNC_OP_UPDATE_DEDUPE_INDEX; - set_data_vio_hash_zone_callback(agent, finish_updating); - vdo_update_dedupe_index(agent); + if (data_vio_may_query_index(agent)) { + launch_data_vio_dedupe_callback(agent, update_index); + return; + } + + launch_data_vio_hash_zone_callback(agent, finish_updating); } /** - * Handle a data_vio that has finished deduplicating against the block locked - * by the hash lock. If there are other data_vios still sharing the lock, this - * will just release the data_vio's share of the lock and finish processing the - * data_vio. If this is the last data_vio holding the lock, this makes the - * data_vio the lock agent and uses it to advance the state of the lock so it - * can eventually be released. + * finish_deduping() - Handle a data_vio that has finished deduplicating + * against the block locked by the hash lock. + * @lock: The hash lock. + * @data_vio: The lock holder that has finished deduplicating. * - * @param lock The hash lock - * @param data_vio The lock holder that has finished deduplicating - **/ + * If there are other data_vios still sharing the lock, this will just release + * the data_vio's share of the lock and finish processing the data_vio. If + * this is the last data_vio holding the lock, this makes the data_vio the + * lock agent and uses it to advance the state of the lock so it can + * eventually be released. 
+ */ static void finish_deduping(struct hash_lock *lock, struct data_vio *data_vio) { struct data_vio *agent = data_vio; @@ -715,14 +766,16 @@ static void finish_deduping(struct hash_lock *lock, struct data_vio *data_vio) ASSERT_LOG_ONLY(!has_waiters(&lock->waiters), "shouldn't have any lock waiters in DEDUPING"); - // Just release the lock reference if other data_vios are still - // deduping. + /* + * Just release the lock reference if other data_vios are still + * deduping. + */ if (lock->reference_count > 1) { exit_hash_lock(data_vio); return; } - // The hash lock must have an agent for all other lock states. + /* The hash lock must have an agent for all other lock states. */ set_agent(lock, agent); if (lock->update_advice) { @@ -747,9 +800,11 @@ static void finish_deduping(struct hash_lock *lock, struct data_vio *data_vio) } /** + * enter_forked_lock() - Bind the data_vio to a new hash lock. + * * Implements waiter_callback. Binds the data_vio that was waiting to a new * hash lock and waits on that lock. - **/ + */ static void enter_forked_lock(struct waiter *waiter, void *context) { struct data_vio *data_vio = waiter_as_data_vio(waiter); @@ -760,19 +815,20 @@ static void enter_forked_lock(struct waiter *waiter, void *context) } /** - * Fork a hash lock because it has run out of increments on the duplicate PBN. + * fork_hash_lock() - Fork a hash lock because it has run out of increments on + * the duplicate PBN. + * @old_lock: The hash lock to fork. + * @new_agent: The data_vio that will be the agent for the new lock. + * * Transfers the new agent and any lock waiters to a new hash lock instance * which takes the place of the old lock in the lock map. The old lock remains * active, but will not update advice. - * - * @param old_lock The hash lock to fork - * @param new_agent The data_vio that will be the agent for the new lock - **/ + */ static void fork_hash_lock(struct hash_lock *old_lock, struct data_vio *new_agent) { struct hash_lock *new_lock; - int result = acquire_lock_from_vdo_hash_zone(new_agent->hash_zone, + int result = vdo_acquire_lock_from_hash_zone(new_agent->hash_zone, &new_agent->chunk_name, old_lock, &new_lock); if (result != VDO_SUCCESS) { @@ -780,8 +836,10 @@ static void fork_hash_lock(struct hash_lock *old_lock, return; } - // Only one of the two locks should update UDS. The old lock is out of - // references, so it would be poor dedupe advice in the short term. + /* + * Only one of the two locks should update UDS. The old lock is out of + * references, so it would be poor dedupe advice in the short term. + */ old_lock->update_advice = false; new_lock->update_advice = true; @@ -795,56 +853,64 @@ static void fork_hash_lock(struct hash_lock *old_lock, } /** - * Reserve a reference count increment for a data_vio and launch it on the - * dedupe path. If no increments are available, this will roll over to a new - * hash lock and launch the data_vio as the writing agent for that lock. - * - * @param lock The hash lock - * @param data_vio The data_vio to deduplicate using the hash lock - * @param has_claim true if the data_vio already has claimed - * an increment from the duplicate lock - **/ -static void launch_dedupe(struct hash_lock *lock, struct data_vio *data_vio, + * launch_dedupe() - Reserve a reference count increment for a data_vio and + * launch it on the dedupe path. + * @lock: The hash lock. + * @data_vio: The data_vio to deduplicate using the hash lock. + * @has_claim: true if the data_vio already has claimed an increment from the + * duplicate lock. 
+ * + * If no increments are available, this will roll over to a new hash lock and + * launch the data_vio as the writing agent for that lock. + */ +static void launch_dedupe(struct hash_lock *lock, + struct data_vio *data_vio, bool has_claim) { - if (!has_claim && !claim_vdo_pbn_lock_increment(lock->duplicate_lock)) { - // Out of increments, so must roll over to a new lock. + if (!has_claim && + !vdo_claim_pbn_lock_increment(lock->duplicate_lock)) { + /* Out of increments, so must roll over to a new lock. */ fork_hash_lock(lock, data_vio); return; } - // Deduplicate against the lock's verified location. + /* Deduplicate against the lock's verified location. */ set_data_vio_duplicate_location(data_vio, lock->duplicate); - launch_data_vio_duplicate_zone_callback(data_vio, share_block); + launch_deduplicate_data_vio(data_vio); } /** - * Enter the hash lock state where data_vios deduplicate in parallel against a - * true copy of their data on disk. If the agent itself needs to deduplicate, - * an increment for it must already have been claimed from the duplicate lock, - * ensuring the hash lock will still have a data_vio holding it. + * start_deduping() - Enter the hash lock state where data_vios deduplicate in + * parallel against a true copy of their data on disk. + * @lock: The hash lock. + * @agent: The data_vio acting as the agent for the lock. + * @agent_is_done: true only if the agent has already written or deduplicated + * against its data. * - * @param lock The hash lock - * @param agent The data_vio acting as the agent for the lock - * @param agent_is_done true only if the agent has already - * written or deduplicated against its data - **/ -static void start_deduping(struct hash_lock *lock, struct data_vio *agent, - bool agent_is_done) + * If the agent itself needs to deduplicate, an increment for it must already + * have been claimed from the duplicate lock, ensuring the hash lock will + * still have a data_vio holding it. + */ +static void start_deduping(struct hash_lock *lock, + struct data_vio *agent, + bool agent_is_done) { set_hash_lock_state(lock, VDO_HASH_LOCK_DEDUPING); - // We don't take the downgraded allocation lock from the agent unless - // we actually need to deduplicate against it. + /* + * We don't take the downgraded allocation lock from the agent unless + * we actually need to deduplicate against it. + */ if (lock->duplicate_lock == NULL) { - ASSERT_LOG_ONLY(!vdo_is_state_compressed(agent->new_mapped.state), - "compression must have shared a lock"); + ASSERT_LOG_ONLY( + !vdo_is_state_compressed(agent->new_mapped.state), + "compression must have shared a lock"); ASSERT_LOG_ONLY(agent_is_done, "agent must have written the new duplicate"); transfer_allocation_lock(agent); } - ASSERT_LOG_ONLY(is_vdo_pbn_read_lock(lock->duplicate_lock), + ASSERT_LOG_ONLY(vdo_is_pbn_read_lock(lock->duplicate_lock), "duplicate_lock must be a PBN read lock"); /* @@ -880,42 +946,51 @@ static void start_deduping(struct hash_lock *lock, struct data_vio *agent, } /** - * Handle the result of the agent for the lock comparing its data to the - * duplicate candidate. This continuation is registered in start_verifying(). + * finish_verifying() - Handle the result of the agent for the lock comparing + * its data to the duplicate candidate. + * @completion: The completion of the data_vio used to verify dedupe * - * @param completion The completion of the data_vio used to verify dedupe - **/ + * This continuation is registered in start_verifying(). 
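Illustrative aside, not part of the patch: both launch_dedupe() above and the verification path that follows gate deduplication on claiming one of the finite reference-count increments available through the duplicate PBN lock; when no increment can be claimed, the lock forks or the data_vio falls back to writing. Stripped of the real pbn_lock layout and of any concurrency protection, the claim is just a bounded counter; a standalone, single-threaded sketch with hypothetical field names::

    #include <stdbool.h>
    #include <stdio.h>

    /* Hypothetical layout; the kernel code must also be safe against
     * concurrent claimants. */
    struct limited_lock {
        unsigned int increment_limit;
        unsigned int increments_claimed;
    };

    /* Claim one of the remaining increments, or report that none are left. */
    static bool claim_increment(struct limited_lock *lock)
    {
        if (lock->increments_claimed >= lock->increment_limit) {
            return false;
        }
        lock->increments_claimed++;
        return true;
    }

    int main(void)
    {
        struct limited_lock lock = { .increment_limit = 2 };
        int i;

        for (i = 0; i < 3; i++) {
            printf("claim %d: %s\n", i,
                   claim_increment(&lock) ? "ok" : "none left, roll over");
        }
        return 0;
    }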
+ */ static void finish_verifying(struct vdo_completion *completion) { struct data_vio *agent = as_data_vio(completion); struct hash_lock *lock = agent->hash_lock; + assert_hash_lock_agent(agent, __func__); if (completion->result != VDO_SUCCESS) { - // XXX VDOSTORY-190 should convert verify IO errors to - // verification failure + /* + * XXX VDOSTORY-190 should convert verify IO errors to + * verification failure + */ abort_hash_lock(lock, agent); return; } lock->verified = agent->is_duplicate; - // Only count the result of the initial verification of the advice as - // valid or stale, and not any re-verifications due to PBN lock - // releases. + /* + * Only count the result of the initial verification of the advice as + * valid or stale, and not any re-verifications due to PBN lock + * releases. + */ if (!lock->verify_counted) { lock->verify_counted = true; if (lock->verified) { - bump_vdo_hash_zone_valid_advice_count(agent->hash_zone); + vdo_bump_hash_zone_valid_advice_count(agent->hash_zone); } else { - bump_vdo_hash_zone_stale_advice_count(agent->hash_zone); + vdo_bump_hash_zone_stale_advice_count(agent->hash_zone); } } - // Even if the block is a verified duplicate, we can't start to - // deduplicate unless we can claim a reference count increment for the - // agent. - if (lock->verified && !claim_vdo_pbn_lock_increment(lock->duplicate_lock)) { + /* + * Even if the block is a verified duplicate, we can't start to + * deduplicate unless we can claim a reference count increment for the + * agent. + */ + if (lock->verified && + !vdo_claim_pbn_lock_increment(lock->duplicate_lock)) { agent->is_duplicate = false; lock->verified = false; } @@ -941,51 +1016,126 @@ static void finish_verifying(struct vdo_completion *completion) } } +static bool blocks_equal(char *block1, char *block2) +{ + int i; + + + for (i = 0; i < VDO_BLOCK_SIZE; i += sizeof(uint64_t)) { + if (*((uint64_t *) &block1[i]) != *((uint64_t *) &block2[i])) { + return false; + } + } + + return true; +} + +static void verify_callback(struct vdo_completion *completion) +{ + struct data_vio *agent = as_data_vio(completion); + + agent->is_duplicate = blocks_equal(agent->data_block, + agent->scratch_block); + launch_data_vio_hash_zone_callback(agent, finish_verifying); +} + +static void uncompress_and_verify(struct vdo_completion *completion) +{ + struct data_vio *agent = as_data_vio(completion); + int result = uncompress_data_vio(agent, + agent->duplicate.state, + agent->scratch_block); + + if (result == VDO_SUCCESS) { + verify_callback(completion); + return; + } + + agent->is_duplicate = false; + launch_data_vio_hash_zone_callback(agent, finish_verifying); +} + +static void verify_endio(struct bio *bio) +{ + struct data_vio *agent = vio_as_data_vio(bio->bi_private); + int result = blk_status_to_errno(bio->bi_status); + + vdo_count_completed_bios(bio); + if (result != VDO_SUCCESS) { + launch_data_vio_hash_zone_callback(agent, finish_verifying); + return; + } + + if (vdo_is_state_compressed(agent->duplicate.state)) { + launch_data_vio_cpu_callback(agent, + uncompress_and_verify, + CPU_Q_COMPRESS_BLOCK_PRIORITY); + return; + } + + launch_data_vio_cpu_callback(agent, + verify_callback, + CPU_Q_COMPLETE_READ_PRIORITY); +} + /** + * start_verifying() - Begin the data verification phase. + * @lock: The hash lock (must be LOCKING). + * @agent: The data_vio to use to read and compare candidate data. 
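Illustrative aside, not part of the patch: the verification path above compares candidate data with blocks_equal(), which walks the two buffers one 64-bit word at a time rather than byte by byte. That relies on the block size being a multiple of sizeof(uint64_t) and on the data buffers being suitably aligned, both of which hold for VDO's 4096-byte blocks. The same loop in standalone, compilable form (the 4096-byte size is the block size VDO works in)::

    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    #define VDO_BLOCK_SIZE 4096 /* the 4096-byte block size VDO works in */

    /*
     * The same loop as blocks_equal() above: compare one 64-bit word at a
     * time, assuming the size is a multiple of sizeof(uint64_t) and the
     * buffers are 8-byte aligned.
     */
    static bool blocks_equal(char *block1, char *block2)
    {
        int i;

        for (i = 0; i < VDO_BLOCK_SIZE; i += sizeof(uint64_t)) {
            if (*((uint64_t *) &block1[i]) != *((uint64_t *) &block2[i])) {
                return false;
            }
        }
        return true;
    }

    int main(void)
    {
        static uint64_t a[VDO_BLOCK_SIZE / 8], b[VDO_BLOCK_SIZE / 8];

        memset(a, 0xab, sizeof(a));
        memcpy(b, a, sizeof(b));
        printf("identical blocks: %d\n", blocks_equal((char *) a, (char *) b));
        b[17] ^= 1;
        printf("after flipping a bit: %d\n",
               blocks_equal((char *) a, (char *) b));
        return 0;
    }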
+ * * Continue the deduplication path for a hash lock by using the agent to read * (and possibly decompress) the data at the candidate duplicate location, * comparing it to the data in the agent to verify that the candidate is * identical to all the data_vios sharing the hash. If so, it can be * deduplicated against, otherwise a data_vio allocation will have to be * written to and used for dedupe. - * - * @param lock The hash lock (must be LOCKING) - * @param agent The data_vio to use to read and compare candidate data - **/ + */ static void start_verifying(struct hash_lock *lock, struct data_vio *agent) { + int result; + char *buffer = (vdo_is_state_compressed(agent->duplicate.state) + ? (char *) agent->compression.block + : agent->scratch_block); + set_hash_lock_state(lock, VDO_HASH_LOCK_VERIFYING); - ASSERT_LOG_ONLY(!lock->verified, "hash lock only verifies advice once"); + ASSERT_LOG_ONLY(!lock->verified, + "hash lock only verifies advice once"); - /* - * XXX VDOSTORY-190 Optimization: This is one of those places where the - * zone and continuation we want to use depends on the outcome of the - * comparison. If we could choose which path in the layer thread before - * continuing, we could save a thread transition in one of the two - * cases (assuming we're willing to delay visibility of the the hash - * lock state change). - */ agent->last_async_operation = VIO_ASYNC_OP_VERIFY_DUPLICATION; - set_data_vio_hash_zone_callback(agent, finish_verifying); - verify_data_vio_duplication(agent); + result = prepare_data_vio_for_io(agent, + buffer, + verify_endio, + REQ_OP_READ, + agent->duplicate.pbn); + if (result != VDO_SUCCESS) { + set_data_vio_hash_zone_callback(agent, finish_verifying); + continue_data_vio(agent, result); + return; + } + + set_data_vio_bio_zone_callback(agent, process_vio_io); + vdo_invoke_completion_callback_with_priority(data_vio_as_completion(agent), + BIO_Q_VERIFY_PRIORITY); } /** - * Handle the result of the agent for the lock attempting to obtain a PBN read - * lock on the candidate duplicate block. this continuation is registered in - * lock_duplicate_pbn(). + * finish_locking() - Handle the result of the agent for the lock attempting + * to obtain a PBN read lock on the candidate duplicate + * block. + * @completion: The completion of the data_vio that attempted to get + * the read lock. * - * @param completion The completion of the data_vio that attempted to get - * the read lock - **/ + * This continuation is registered in lock_duplicate_pbn(). + */ static void finish_locking(struct vdo_completion *completion) { struct data_vio *agent = as_data_vio(completion); struct hash_lock *lock = agent->hash_lock; + assert_hash_lock_agent(agent, __func__); if (completion->result != VDO_SUCCESS) { - // XXX clearDuplicateLocation()? + /* XXX clearDuplicateLocation()? */ agent->is_duplicate = false; abort_hash_lock(lock, agent); return; @@ -1001,7 +1151,7 @@ static void finish_locking(struct vdo_completion *completion) * compress the data, remembering to update UDS later with the * new advice. 
*/ - bump_vdo_hash_zone_stale_advice_count(agent->hash_zone); + vdo_bump_hash_zone_stale_advice_count(agent->hash_zone); lock->update_advice = true; start_writing(lock, agent); return; @@ -1021,7 +1171,7 @@ static void finish_locking(struct vdo_completion *completion) return; } - if (!claim_vdo_pbn_lock_increment(lock->duplicate_lock)) { + if (!vdo_claim_pbn_lock_increment(lock->duplicate_lock)) { /* * LOCKING -> UNLOCKING transition: The verified block was * re-locked, but has no available increments left. Must first @@ -1044,15 +1194,17 @@ static void finish_locking(struct vdo_completion *completion) } /** - * Acquire a read lock on the PBN of the block containing candidate duplicate - * data (compressed or uncompressed). If the PBN is already locked for - * writing, the lock attempt is abandoned and is_duplicate will be cleared - * before calling back. this continuation is launched from start_locking(), and - * calls back to finish_locking() on the hash zone thread. + * lock_duplicate_pbn() - Acquire a read lock on the PBN of the block + * containing candidate duplicate data (compressed or + * uncompressed). + * @completion: The completion of the data_vio attempting to acquire the + * physical block lock on behalf of its hash lock. * - * @param completion The completion of the data_vio attempting to acquire the - * physical block lock on behalf of its hash lock - **/ + * If the PBN is already locked for writing, the lock attempt is abandoned and + * is_duplicate will be cleared before calling back. this continuation is + * launched from start_locking(), and calls back to finish_locking() on the + * hash zone thread. + */ static void lock_duplicate_pbn(struct vdo_completion *completion) { unsigned int increment_limit; @@ -1060,36 +1212,39 @@ static void lock_duplicate_pbn(struct vdo_completion *completion) int result; struct data_vio *agent = as_data_vio(completion); - struct slab_depot *depot = get_slab_depot(get_vdo_from_data_vio(agent)); + struct slab_depot *depot = vdo_from_data_vio(agent)->depot; struct physical_zone *zone = agent->duplicate.zone; assert_data_vio_in_duplicate_zone(agent); set_data_vio_hash_zone_callback(agent, finish_locking); - // While in the zone that owns it, find out how many additional - // references can be made to the block if it turns out to truly be a - // duplicate. + /* + * While in the zone that owns it, find out how many additional + * references can be made to the block if it turns out to truly be a + * duplicate. + */ increment_limit = vdo_get_increment_limit(depot, agent->duplicate.pbn); if (increment_limit == 0) { - // We could deduplicate against it later if a reference - // happened to be released during verification, but it's - // probably better to bail out now. - // XXX clearDuplicateLocation()? + /* + * We could deduplicate against it later if a reference + * happened to be released during verification, but it's + * probably better to bail out now. + * XXX clearDuplicateLocation()? 
+ */ agent->is_duplicate = false; continue_data_vio(agent, VDO_SUCCESS); return; } - result = attempt_vdo_physical_zone_pbn_lock(zone, agent->duplicate.pbn, - VIO_READ_LOCK, - &lock); + result = vdo_attempt_physical_zone_pbn_lock(zone, agent->duplicate.pbn, + VIO_READ_LOCK, &lock); if (result != VDO_SUCCESS) { continue_data_vio(agent, result); return; } - if (!is_vdo_pbn_read_lock(lock)) { + if (!vdo_is_pbn_read_lock(lock)) { /* * There are three cases of write locks: uncompressed data * block writes, compressed (packed) block writes, and block @@ -1133,26 +1288,26 @@ static void lock_duplicate_pbn(struct vdo_completion *completion) * almost certainly outweighs saving a UDS update and trading * a write for a read in a lucky case where advice would have * been saved from becoming stale. + * XXX clearDuplicateLocation()? */ - // XXX clearDuplicateLocation()? agent->is_duplicate = false; continue_data_vio(agent, VDO_SUCCESS); return; } if (lock->holder_count == 0) { - // Ensure that the newly-locked block is referenced. - struct vdo_slab *slab = get_vdo_slab(depot, agent->duplicate.pbn); - result = vdo_acquire_provisional_reference(slab, - agent->duplicate.pbn, - lock); + /* Ensure that the newly-locked block is referenced. */ + struct vdo_slab *slab = + vdo_get_slab(depot, agent->duplicate.pbn); + + result = vdo_acquire_provisional_reference( + slab, agent->duplicate.pbn, lock); if (result != VDO_SUCCESS) { uds_log_warning_strerror(result, "Error acquiring provisional reference for dedupe candidate; aborting dedupe"); agent->is_duplicate = false; - release_vdo_physical_zone_pbn_lock(zone, - agent->duplicate.pbn, - UDS_FORGET(lock)); + vdo_release_physical_zone_pbn_lock( + zone, agent->duplicate.pbn, UDS_FORGET(lock)); continue_data_vio(agent, result); return; } @@ -1166,8 +1321,10 @@ static void lock_duplicate_pbn(struct vdo_completion *completion) lock->increment_limit = increment_limit; } - // We've successfully acquired a read lock on behalf of the hash lock, - // so mark it as such. + /* + * We've successfully acquired a read lock on behalf of the hash lock, + * so mark it as such. + */ set_duplicate_lock(agent->hash_lock, lock); /* @@ -1180,12 +1337,11 @@ static void lock_duplicate_pbn(struct vdo_completion *completion) } /** - * Continue deduplication for a hash lock that has obtained valid advice - * of a potential duplicate through its agent. - * - * @param lock The hash lock (currently must be QUERYING) - * @param agent The data_vio bearing the dedupe advice - **/ + * start_locking() - Continue deduplication for a hash lock that has obtained + * valid advice of a potential duplicate through its agent. + * @lock: The hash lock (currently must be QUERYING). + * @agent: The data_vio bearing the dedupe advice. + */ static void start_locking(struct hash_lock *lock, struct data_vio *agent) { ASSERT_LOG_ONLY( @@ -1205,34 +1361,40 @@ static void start_locking(struct hash_lock *lock, struct data_vio *agent) } /** - * Re-entry point for the lock agent after it has finished writing or - * compressing its copy of the data block. The agent will never need to dedupe - * against anything, so it's done with the lock, but the lock may not be - * finished with it, as a UDS update might still be needed. + * finish_writing() - Re-entry point for the lock agent after it has finished + * writing or compressing its copy of the data block. + * @lock: The hash lock, which must be in state WRITING. + * @agent: The data_vio that wrote its data for the lock. 
+ * + * The agent will never need to dedupe against anything, so it's done with the + * lock, but the lock may not be finished with it, as a UDS update might still + * be needed. * * If there are other lock holders, the agent will hand the job to one of them * and exit, leaving the lock to deduplicate against the just-written block. * If there are no other lock holders, the agent either exits (and later tears * down the hash lock), or it remains the agent and updates UDS. - * - * @param lock The hash lock, which must be in state WRITING - * @param agent The data_vio that wrote its data for the lock - **/ + */ static void finish_writing(struct hash_lock *lock, struct data_vio *agent) { - // Dedupe against the data block or compressed block slot the agent - // wrote. Since we know the write succeeded, there's no need to verify - // it. + /* + * Dedupe against the data block or compressed block slot the agent + * wrote. Since we know the write succeeded, there's no need to verify + * it. + */ lock->duplicate = agent->new_mapped; lock->verified = true; - if (vdo_is_state_compressed(lock->duplicate.state) && lock->registered) { - // Compression means the location we gave in the UDS query is - // not the location we're using to deduplicate. + if (vdo_is_state_compressed(lock->duplicate.state) && + lock->registered) { + /* + * Compression means the location we gave in the UDS query is + * not the location we're using to deduplicate. + */ lock->update_advice = true; } - // If there are any waiters, we need to start deduping them. + /* If there are any waiters, we need to start deduping them. */ if (has_waiters(&lock->waiters)) { /* * WRITING -> DEDUPING transition: an asynchronously-written @@ -1245,9 +1407,11 @@ static void finish_writing(struct hash_lock *lock, struct data_vio *agent) return; } - // There are no waiters and the agent has successfully written, so take - // a step towards being able to release the hash lock (or just release - // it). + /* + * There are no waiters and the agent has successfully written, so take + * a step towards being able to release the hash lock (or just release + * it). + */ if (lock->update_advice) { /* * WRITING -> UPDATING transition: There's no waiter and a UDS @@ -1271,19 +1435,21 @@ static void finish_writing(struct hash_lock *lock, struct data_vio *agent) * and lock have no more work to do. The agent will release its * allocation lock in cleanup. */ - // XXX startDestroying(lock, agent); + /* XXX startDestroying(lock, agent); */ start_bypassing(lock, NULL); exit_hash_lock(agent); } } /** - * Search through the lock waiters for a data_vio that has an allocation. If - * one is found, swap agents, put the old agent at the head of the wait queue, - * then return the new agent. Otherwise, just return the current agent. + * select_writing_agent() - Search through the lock waiters for a data_vio + * that has an allocation. + * @lock: The hash lock to modify. * - * @param lock The hash lock to modify - **/ + * If an allocation is found, swap agents, put the old agent at the head of + * the wait queue, then return the new agent. Otherwise, just return the + * current agent. + */ static struct data_vio *select_writing_agent(struct hash_lock *lock) { struct wait_queue temp_queue; @@ -1292,71 +1458,88 @@ static struct data_vio *select_writing_agent(struct hash_lock *lock) initialize_wait_queue(&temp_queue); - // This should-be-impossible condition is the only cause for - // enqueue_data_vio() to fail later on, where it would be a pain to - // handle. 
+ /* + * This should-be-impossible condition is the only cause for + * enqueue_data_vio() to fail later on, where it would be a pain to + * handle. + */ result = ASSERT(!is_waiting(data_vio_as_waiter(lock->agent)), "agent must not be waiting"); if (result != VDO_SUCCESS) { return lock->agent; } - // Move waiters to the temp queue one-by-one until we find an - // allocation. Not ideal to search, but it only happens when nearly out - // of space. + /* + * Move waiters to the temp queue one-by-one until we find an + * allocation. Not ideal to search, but it only happens when nearly out + * of space. + */ while (((data_vio = dequeue_lock_waiter(lock)) != NULL) && !data_vio_has_allocation(data_vio)) { - // Use the lower-level enqueue since we're just moving waiters - // around. + /* + * Use the lower-level enqueue since we're just moving waiters + * around. + */ result = enqueue_waiter(&temp_queue, - data_vio_as_waiter(data_vio)); - // The only error is the data_vio already being on a wait queue, - // and since we just dequeued it, that could only happen due to - // a memory smash or concurrent use of that data_vio. + data_vio_as_waiter(data_vio)); + /* + * The only error is the data_vio already being on a wait + * queue. Since we just dequeued it, that could only happen + * due to a memory smash or concurrent use of that data_vio. + */ ASSERT_LOG_ONLY(result == VDO_SUCCESS, "impossible enqueue_waiter error"); } if (data_vio != NULL) { - // Move the rest of the waiters over to the temp queue, - // preserving the order they arrived at the lock. + /* + * Move the rest of the waiters over to the temp queue, + * preserving the order they arrived at the lock. + */ transfer_all_waiters(&lock->waiters, &temp_queue); - // The current agent is being replaced and will have to wait to - // dedupe; make it the first waiter since it was the first to - // reach the lock. + /* + * The current agent is being replaced and will have to wait to + * dedupe; make it the first waiter since it was the first to + * reach the lock. + */ result = enqueue_data_vio(&lock->waiters, lock->agent); ASSERT_LOG_ONLY(result == VDO_SUCCESS, "impossible enqueue_data_vio error after is_waiting checked"); set_agent(lock, data_vio); } else { - // No one has an allocation, so keep the current agent. + /* No one has an allocation, so keep the current agent. */ data_vio = lock->agent; } - // Swap all the waiters back onto the lock's queue. + /* Swap all the waiters back onto the lock's queue. */ transfer_all_waiters(&temp_queue, &lock->waiters); return data_vio; } /** - * Begin the non-duplicate write path for a hash lock that had no advice, - * selecting a data_vio with an allocation as a new agent, if necessary, - * then resuming the agent on the data_vio write path. + * start_writing() - Begin the non-duplicate write path. + * @lock: The hash lock (currently must be QUERYING). + * @agent: The data_vio currently acting as the agent for the lock. * - * @param lock The hash lock (currently must be QUERYING) - * @param agent The data_vio currently acting as the agent for the lock - **/ + * Begins the non-duplicate write path for a hash lock that had no advice, + * selecting a data_vio with an allocation as a new agent, if necessary, then + * resuming the agent on the data_vio write path. 
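Illustrative aside, not part of the patch: the waiter shuffle in select_writing_agent() above is easier to see in miniature. Scan the queue for the first waiter that has an allocation, promote it to agent, put the displaced agent at the head (it was the first to reach the lock), and keep everyone else in arrival order. A standalone sketch of that reordering on a plain array, with hypothetical types::

    #include <stdbool.h>
    #include <stdio.h>

    /* Hypothetical stand-in for a data_vio: only the allocation flag matters. */
    struct waiter {
        const char *name;
        bool has_allocation;
    };

    /*
     * Mimic select_writing_agent(): promote the first waiter that has an
     * allocation, put the displaced agent at the head of the queue, and keep
     * the remaining waiters in arrival order.
     */
    static struct waiter *select_agent(struct waiter *agent,
                                       struct waiter **waiters, int count)
    {
        int i, found = -1;
        struct waiter *new_agent;

        for (i = 0; i < count; i++) {
            if (waiters[i]->has_allocation) {
                found = i;
                break;
            }
        }
        if (found < 0) {
            return agent; /* no waiter has an allocation; keep the agent */
        }

        new_agent = waiters[found];
        for (i = found; i > 0; i--) {
            waiters[i] = waiters[i - 1];
        }
        waiters[0] = agent;
        return new_agent;
    }

    int main(void)
    {
        struct waiter a = { "a", false }, b = { "b", false }, c = { "c", true };
        struct waiter agent = { "agent", false };
        struct waiter *queue[] = { &a, &b, &c };
        struct waiter *new_agent = select_agent(&agent, queue, 3);
        int i;

        printf("new agent: %s; queue:", new_agent->name);
        for (i = 0; i < 3; i++) {
            printf(" %s", queue[i]->name);
        }
        printf("\n");
        return 0;
    }

The real code performs the same reordering with wait_queue primitives and a temporary queue, since data_vios cannot be indexed like an array.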
+ */ static void start_writing(struct hash_lock *lock, struct data_vio *agent) { set_hash_lock_state(lock, VDO_HASH_LOCK_WRITING); - // The agent might not have received an allocation and so can't be used - // for writing, but it's entirely possible that one of the waiters did. + /* + * The agent might not have received an allocation and so can't be used + * for writing, but it's entirely possible that one of the waiters did. + */ if (!data_vio_has_allocation(agent)) { agent = select_writing_agent(lock); - // If none of the waiters had an allocation, the writes all - // have to fail. + /* + * If none of the waiters had an allocation, the writes all + * have to fail. + */ if (!data_vio_has_allocation(agent)) { /* * XXX VDOSTORY-190 Should we keep a variant of @@ -1364,16 +1547,18 @@ static void start_writing(struct hash_lock *lock, struct data_vio *agent) * immediately if they don't have an allocation? It * might be possible that on some path there would be * non-waiters still referencing the lock, so it would - * remain in the map as everything is currently spelled, - * even if the agent and all the waiters release. + * remain in the map as everything is currently + * spelled, even if the agent and all waiters release. */ start_bypassing(lock, agent); return; } } - // If the agent compresses, it might wait indefinitely in the packer, - // which would be bad if there are any other data_vios waiting. + /* + * If the agent compresses, it might wait indefinitely in the packer, + * which would be bad if there are any other data_vios waiting. + */ if (has_waiters(&lock->waiters)) { cancel_vio_compression(agent); } @@ -1381,17 +1566,18 @@ static void start_writing(struct hash_lock *lock, struct data_vio *agent) /* * Send the agent to the compress/pack/write path in vioWrite. If it * succeeds, it will return to the hash lock via - * continue_vdo_hash_lock() and call finish_writing(). + * vdo_continue_hash_lock() and call finish_writing(). */ - vio_compress_data(agent); + launch_compress_data_vio(agent); } /** - * Process the result of a UDS query performed by the agent for the lock. This - * continuation is registered in start_querying(). + * finish_querying() - Process the result of a UDS query performed by the + * agent for the lock. + * @completion: The completion of the data_vio that performed the query. * - * @param completion The completion of the data_vio that performed the query - **/ + * This continuation is registered in start_querying(). + */ static void finish_querying(struct vdo_completion *completion) { struct data_vio *agent = as_data_vio(completion); @@ -1414,9 +1600,11 @@ static void finish_querying(struct vdo_completion *completion) */ start_locking(lock, agent); } else { - // The agent will be used as the duplicate if has an - // allocation; if it does, that location was posted to UDS, so - // no update will be needed. + /* + * The agent will be used as the duplicate if has an + * allocation; if it does, that location was posted to UDS, so + * no update will be needed. + */ lock->update_advice = !data_vio_has_allocation(agent); /* * QUERYING -> WRITING transition: There was no advice or the @@ -1426,44 +1614,68 @@ static void finish_querying(struct vdo_completion *completion) } } +static void query_index(struct vdo_completion *completion) +{ + struct data_vio *data_vio = as_data_vio(completion); + + set_data_vio_hash_zone_callback(data_vio, finish_querying); + vdo_query_index(data_vio, + (data_vio_has_allocation(data_vio) + ? 
UDS_POST : UDS_QUERY)); +} + /** - * Start deduplication for a hash lock that has finished initializing by + * start_querying() - Start deduplicatoin for a hash lock. + * @lock: The initialized hash lock. + * @data_vio: The data_vio that has just obtained the new lock. + * + * Starts deduplication for a hash lock that has finished initializing by * making the data_vio that requested it the agent, entering the QUERYING * state, and using the agent to perform the UDS query on behalf of the lock. - * - * @param lock The initialized hash lock - * @param data_vio The data_vio that has just obtained the new lock - **/ + */ static void start_querying(struct hash_lock *lock, struct data_vio *data_vio) { set_agent(lock, data_vio); - set_hash_lock_state(lock, VDO_HASH_LOCK_QUERYING); + if (!data_vio_may_query_index(data_vio)) { + lock->update_advice = !data_vio_has_allocation(data_vio); + start_writing(lock, data_vio); + return; + } + set_hash_lock_state(lock, VDO_HASH_LOCK_QUERYING); data_vio->last_async_operation = VIO_ASYNC_OP_CHECK_FOR_DUPLICATION; - set_data_vio_hash_zone_callback(data_vio, finish_querying); - check_data_vio_for_duplication(data_vio); + launch_data_vio_dedupe_callback(data_vio, query_index); } /** - * Complain that a data_vio has entered a hash_lock that is in an unimplemented - * or unusable state and continue the data_vio with an error. - * - * @param lock The hash lock - * @param data_vio The data_vio attempting to enter the lock - **/ + * report_bogus_lock_state() - Complain that a data_vio has entered a + * hash_lock that is in an unimplemented or + * unusable state and continue the data_vio with + * an error. + * @lock: The hash lock. + * @data_vio: The data_vio attempting to enter the lock. + */ static void report_bogus_lock_state(struct hash_lock *lock, struct data_vio *data_vio) { int result = ASSERT_FALSE("hash lock must not be in unimplemented state %s", - get_vdo_hash_lock_state_name(lock->state)); + vdo_get_hash_lock_state_name(lock->state)); continue_data_vio_in(data_vio, result, compress_data_callback); } -/**********************************************************************/ -void enter_vdo_hash_lock(struct data_vio *data_vio) +/** + * vdo_enter_hash_lock() - Asynchronously process a data_vio that has just + * acquired its reference to a hash lock. + * @data_vio: The data_vio that has just acquired a lock on its chunk name. + * + * This may place the data_vio on a wait queue, or it may use the data_vio to + * perform operations on the lock's behalf. + */ +void vdo_enter_hash_lock(struct data_vio *data_vio) { struct hash_lock *lock = data_vio->hash_lock; + switch (lock->state) { case VDO_HASH_LOCK_INITIALIZING: start_querying(lock, data_vio); @@ -1475,13 +1687,13 @@ void enter_vdo_hash_lock(struct data_vio *data_vio) case VDO_HASH_LOCK_LOCKING: case VDO_HASH_LOCK_VERIFYING: case VDO_HASH_LOCK_UNLOCKING: - // The lock is busy, and can't be shared yet. + /* The lock is busy, and can't be shared yet. */ wait_on_hash_lock(lock, data_vio); break; case VDO_HASH_LOCK_BYPASSING: - // Bypass dedupe entirely. - vio_compress_data(data_vio); + /* Bypass dedupe entirely. */ + launch_compress_data_vio(data_vio); break; case VDO_HASH_LOCK_DEDUPING: @@ -1489,7 +1701,7 @@ void enter_vdo_hash_lock(struct data_vio *data_vio) break; case VDO_HASH_LOCK_DESTROYING: - // A lock in this state should not be acquired by new VIOs. + /* A lock in this state should not be acquired by new VIOs. 
*/ report_bogus_lock_state(lock, data_vio); break; @@ -1498,12 +1710,26 @@ void enter_vdo_hash_lock(struct data_vio *data_vio) } } -/**********************************************************************/ -void continue_vdo_hash_lock(struct data_vio *data_vio) +/** + * vdo_continue_hash_lock() - Continue the processing state after writing, + * compressing, or deduplicating. + * @data_vio: The data_vio to continue processing in its hash lock. + * + * Asynchronously continue processing a data_vio in its hash lock after it has + * finished writing, compressing, or deduplicating, so it can share the result + * with any data_vios waiting in the hash lock, or update the UDS index, or + * simply release its share of the lock. + * + * Context: This must only be called in the correct thread for the + * hash zone. + */ +void vdo_continue_hash_lock(struct data_vio *data_vio) { struct hash_lock *lock = data_vio->hash_lock; - // XXX VDOSTORY-190 Eventually we may be able to fold the error handling - // in at this point instead of using a separate entry point for it. + /* + * XXX VDOSTORY-190 Eventually we may be able to fold the error + * handling in here instead of using a separate entry point for it. + */ switch (lock->state) { case VDO_HASH_LOCK_WRITING: @@ -1517,10 +1743,13 @@ void continue_vdo_hash_lock(struct data_vio *data_vio) break; case VDO_HASH_LOCK_BYPASSING: - // This data_vio has finished the write path and the lock - // doesn't need it. - // XXX This isn't going to be correct if DEDUPING ever uses - // BYPASSING. + /* + * This data_vio has finished the write path and the lock + * doesn't need it. + * + * XXX This isn't going to be correct if DEDUPING ever uses + * BYPASSING. + */ finish_data_vio(data_vio, VDO_SUCCESS); break; @@ -1531,7 +1760,7 @@ void continue_vdo_hash_lock(struct data_vio *data_vio) case VDO_HASH_LOCK_VERIFYING: case VDO_HASH_LOCK_UNLOCKING: case VDO_HASH_LOCK_DESTROYING: - // A lock in this state should never be re-entered. + /* A lock in this state should never be re-entered. */ report_bogus_lock_state(lock, data_vio); break; @@ -1540,25 +1769,33 @@ void continue_vdo_hash_lock(struct data_vio *data_vio) } } -/**********************************************************************/ -void continue_vdo_hash_lock_on_error(struct data_vio *data_vio) +/** + * vdo_continue_hash_lock_on_error() - Re-enter the hash lock after + * encountering an error, to clean up the + * hash lock. + * @data_vio: The data_vio with an error. + */ +void vdo_continue_hash_lock_on_error(struct data_vio *data_vio) { - // XXX We could simply use continue_vdo_hash_lock() and check for - // errors in that. + /* + * XXX We could simply use vdo_continue_hash_lock() and check for + * errors in that. + */ abort_hash_lock(data_vio->hash_lock, data_vio); } /** + * is_hash_collision() - Check to see if a hash collision has occurred. + * @lock: The lock to check. + * @candidate: The data_vio seeking to share the lock. + * * Check whether the data in data_vios sharing a lock is different than in a * data_vio seeking to share the lock, which should only be possible in the * extremely unlikely case of a hash collision. * - * @param lock The lock to check - * @param candidate The data_vio seeking to share the lock - * - * @return true if the given data_vio must not share the lock - * because it doesn't have the same data as the lock holders - **/ + * Return: true if the given data_vio must not share the lock + * because it doesn't have the same data as the lock holders. 
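Since two distinct 4096-byte blocks can, in principle, hash to the same chunk name, the candidate's data must be compared byte-for-byte with the data the lock already covers before it may share the lock. A self-contained sketch of that check, with plain counters standing in for the per-zone collision and data-match statistics (all names here are illustrative, not the VDO implementation):

::

    #include <stdbool.h>
    #include <stdint.h>
    #include <string.h>

    #define BLOCK_SIZE 4096

    /* Stand-ins for the per-zone statistics counters. */
    static uint64_t collision_count;
    static uint64_t data_match_count;

    /*
     * Return true if the candidate must not share the lock because its data
     * does not actually match the data the lock already covers.
     */
    static bool is_collision(const uint8_t lock_holder_data[BLOCK_SIZE],
                             const uint8_t candidate_data[BLOCK_SIZE])
    {
            bool collides = (memcmp(lock_holder_data, candidate_data,
                                    BLOCK_SIZE) != 0);

            if (collides)
                    collision_count++;
            else
                    data_match_count++;
            return collides;
    }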
+ */ static bool is_hash_collision(struct hash_lock *lock, struct data_vio *candidate) { @@ -1570,18 +1807,17 @@ static bool is_hash_collision(struct hash_lock *lock, } lock_holder = data_vio_from_lock_entry(lock->duplicate_ring.next); - collides = !compare_data_vios(lock_holder, candidate); - + collides = !blocks_equal(lock_holder->data_block, + candidate->data_block); if (collides) { - bump_vdo_hash_zone_collision_count(candidate->hash_zone); + vdo_bump_hash_zone_collision_count(candidate->hash_zone); } else { - bump_vdo_hash_zone_data_match_count(candidate->hash_zone); + vdo_bump_hash_zone_data_match_count(candidate->hash_zone); } return collides; } -/**********************************************************************/ static inline int assert_hash_lock_preconditions(const struct data_vio *data_vio) { @@ -1599,16 +1835,25 @@ assert_hash_lock_preconditions(const struct data_vio *data_vio) "must not hold a recovery lock when getting a hash lock"); } -/**********************************************************************/ -int acquire_vdo_hash_lock(struct data_vio *data_vio) +/** + * vdo_acquire_hash_lock() - Acquire or share a lock on a chunk name. + * @data_vio: The data_vio acquiring a lock on its chunk name. + * + * Acquire or share a lock on the hash (chunk name) of the data in a data_vio, + * updating the data_vio to reference the lock. This must only be called in the + * correct thread for the zone. In the unlikely case of a hash collision, this + * function will succeed, but the data_vio will not get a lock reference. + */ +int vdo_acquire_hash_lock(struct data_vio *data_vio) { struct hash_lock *lock; int result = assert_hash_lock_preconditions(data_vio); + if (result != VDO_SUCCESS) { return result; } - result = acquire_lock_from_vdo_hash_zone(data_vio->hash_zone, + result = vdo_acquire_lock_from_hash_zone(data_vio->hash_zone, &data_vio->chunk_name, NULL, &lock); @@ -1617,10 +1862,12 @@ int acquire_vdo_hash_lock(struct data_vio *data_vio) } if (is_hash_collision(lock, data_vio)) { - // Hash collisions are extremely unlikely, but the bogus - // dedupe would be a data corruption. Bypass dedupe entirely - // by leaving hash_lock unset. - // XXX clear hash_zone too? + /* + * Hash collisions are extremely unlikely, but the bogus + * dedupe would be a data corruption. Bypass dedupe entirely + * by leaving hash_lock unset. + * XXX clear hash_zone too? + */ return VDO_SUCCESS; } @@ -1628,10 +1875,23 @@ int acquire_vdo_hash_lock(struct data_vio *data_vio) return VDO_SUCCESS; } -/**********************************************************************/ -void release_vdo_hash_lock(struct data_vio *data_vio) +/** + * vdo_release_hash_lock() - Release a data_vio's share of a hash lock, if + * held, and null out the data_vio's reference to + * it. + * @data_vio: The data_vio releasing its hash lock. + * + * If the data_vio is the only one holding the lock, this also releases any + * resources or locks used by the hash lock (such as a PBN read lock on a + * block containing data with the same hash) and returns the lock to the hash + * zone's lock pool. + * + * Context: This must only be called in the correct thread for the hash zone. + */ +void vdo_release_hash_lock(struct data_vio *data_vio) { struct hash_lock *lock = data_vio->hash_lock; + if (lock == NULL) { return; } @@ -1639,71 +1899,84 @@ void release_vdo_hash_lock(struct data_vio *data_vio) set_hash_lock(data_vio, NULL); if (lock->reference_count > 0) { - // The lock is still in use by other data_vios. 
+ /* The lock is still in use by other data_vios. */ return; } set_hash_lock_state(lock, VDO_HASH_LOCK_DESTROYING); - return_lock_to_vdo_hash_zone(data_vio->hash_zone, lock); + vdo_return_lock_to_hash_zone(data_vio->hash_zone, lock); } /** - * Transfer a data_vio's downgraded allocation PBN lock to the data_vio's hash - * lock, converting it to a duplicate PBN lock. - * - * @param data_vio The data_vio holding the allocation lock to transfer - **/ + * transfer_allocation_lock() - Transfer a data_vio's downgraded allocation + * PBN lock to the data_vio's hash lock, + * converting it to a duplicate PBN lock. + * @data_vio: The data_vio holding the allocation lock to transfer. + */ static void transfer_allocation_lock(struct data_vio *data_vio) { - struct allocating_vio *allocating_vio = - data_vio_as_allocating_vio(data_vio); - struct pbn_lock *pbn_lock = allocating_vio->allocation_lock; + struct allocation *allocation = &data_vio->allocation; struct hash_lock *hash_lock = data_vio->hash_lock; - ASSERT_LOG_ONLY(data_vio->new_mapped.pbn == - get_data_vio_allocation(data_vio), + ASSERT_LOG_ONLY(data_vio->new_mapped.pbn == allocation->pbn, "transferred lock must be for the block written"); - allocating_vio->allocation_lock = NULL; - allocating_vio->allocation = VDO_ZERO_BLOCK; + allocation->pbn = VDO_ZERO_BLOCK; - ASSERT_LOG_ONLY(is_vdo_pbn_read_lock(pbn_lock), + ASSERT_LOG_ONLY(vdo_is_pbn_read_lock(allocation->lock), "must have downgraded the allocation lock before transfer"); hash_lock->duplicate = data_vio->new_mapped; data_vio->duplicate = data_vio->new_mapped; - // Since the lock is being transferred, the holder count doesn't change - // (and isn't even safe to examine on this thread). - hash_lock->duplicate_lock = pbn_lock; + /* + * Since the lock is being transferred, the holder count doesn't change + * (and isn't even safe to examine on this thread). + */ + hash_lock->duplicate_lock = UDS_FORGET(allocation->lock); } -/**********************************************************************/ -void share_compressed_vdo_write_lock(struct data_vio *data_vio, - struct pbn_lock *pbn_lock) +/** + * vdo_share_compressed_write_lock() - Make a data_vio's hash lock a shared + * holder of the PBN lock on the + * compressed block to which its data was + * just written. + * @data_vio: The data_vio which was just compressed. + * @pbn_lock: The PBN lock on the compressed block. + * + * If the lock is still a write lock (as it will be for the first share), it + * will be converted to a read lock. This also reserves a reference count + * increment for the data_vio. + */ +void vdo_share_compressed_write_lock(struct data_vio *data_vio, + struct pbn_lock *pbn_lock) { bool claimed; - ASSERT_LOG_ONLY(get_vdo_duplicate_lock(data_vio) == NULL, + ASSERT_LOG_ONLY(vdo_get_duplicate_lock(data_vio) == NULL, "a duplicate PBN lock should not exist when writing"); ASSERT_LOG_ONLY(vdo_is_state_compressed(data_vio->new_mapped.state), "lock transfer must be for a compressed write"); assert_data_vio_in_new_mapped_zone(data_vio); - // First sharer downgrades the lock. - if (!is_vdo_pbn_read_lock(pbn_lock)) { - downgrade_vdo_pbn_write_lock(pbn_lock); + /* First sharer downgrades the lock. */ + if (!vdo_is_pbn_read_lock(pbn_lock)) { + vdo_downgrade_pbn_write_lock(pbn_lock, true); } - // Get a share of the PBN lock, ensuring it cannot be released until - // after this data_vio has had a chance to journal a reference. 
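The sharing protocol described here (the first sharer downgrades the exclusive write lock, and every sharer reserves one reference-count increment before deduplicating against the block) can be modelled with a couple of plain fields. This is only a toy model under assumed names; the real pbn_lock uses atomic operations and different fields.

::

    #include <stdbool.h>

    struct pbn_lock_model {
            bool is_read_lock;             /* write locks are exclusive */
            unsigned int increment_limit;  /* increments still available */
    };

    /* The first sharer converts the exclusive write lock into a read lock. */
    static void share_lock(struct pbn_lock_model *lock)
    {
            if (!lock->is_read_lock)
                    lock->is_read_lock = true;
    }

    /*
     * Each sharer reserves one reference-count increment up front, so the
     * block cannot be released before the sharer has journaled its reference.
     */
    static bool claim_increment(struct pbn_lock_model *lock)
    {
            if (lock->increment_limit == 0)
                    return false;
            lock->increment_limit--;
            return true;
    }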
+ /* + * Get a share of the PBN lock, ensuring it cannot be released until + * after this data_vio has had a chance to journal a reference. + */ data_vio->duplicate = data_vio->new_mapped; data_vio->hash_lock->duplicate = data_vio->new_mapped; set_duplicate_lock(data_vio->hash_lock, pbn_lock); - // Claim a reference for this data_vio, which is necessary since another - // hash_lock might start deduplicating against it before our incRef. - claimed = claim_vdo_pbn_lock_increment(pbn_lock); + /* + * Claim a reference for this data_vio. Necessary since another + * hash_lock might start deduplicating against it before our incRef. + */ + claimed = vdo_claim_pbn_lock_increment(pbn_lock); ASSERT_LOG_ONLY(claimed, "impossible to fail to claim an initial increment"); } diff --git a/vdo/hash-lock.h b/vdo/hash-lock.h new file mode 100644 index 00000000..3d16eb95 --- /dev/null +++ b/vdo/hash-lock.h @@ -0,0 +1,143 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright Red Hat + */ + +#ifndef HASH_LOCK_H +#define HASH_LOCK_H + +#include + +#include "completion.h" +#include "types.h" +#include "uds.h" +#include "wait-queue.h" + +enum hash_lock_state { + /* State for locks that are not in use or are being initialized. */ + VDO_HASH_LOCK_INITIALIZING, + + /* This is the sequence of states typically used on the non-dedupe path. */ + VDO_HASH_LOCK_QUERYING, + VDO_HASH_LOCK_WRITING, + VDO_HASH_LOCK_UPDATING, + + /* + * The remaining states are typically used on the dedupe path in this + * order. + */ + VDO_HASH_LOCK_LOCKING, + VDO_HASH_LOCK_VERIFYING, + VDO_HASH_LOCK_DEDUPING, + VDO_HASH_LOCK_UNLOCKING, + + /* + * XXX This is a temporary state denoting a lock which is sending VIOs + * back to the old dedupe and vioWrite pathways. It won't be in the + * final version of VDOSTORY-190. + */ + VDO_HASH_LOCK_BYPASSING, + + /* + * Terminal state for locks returning to the pool. Must be last both + * because it's the final state, and also because it's used to count + * the states. + */ + VDO_HASH_LOCK_DESTROYING, +}; + +struct hash_lock { + /* The block hash covered by this lock */ + struct uds_chunk_name hash; + + /* + * When the lock is unused, this list entry allows the lock to be + * pooled + */ + struct list_head pool_node; + + /* + * A list containing the data VIOs sharing this lock, all having the + * same chunk name and data block contents, linked by their + * hash_lock_node fields. + */ + struct list_head duplicate_ring; + + /* The number of data_vios sharing this lock instance */ + vio_count_t reference_count; + + /* The maximum value of reference_count in the lifetime of this lock */ + vio_count_t max_references; + + /* The current state of this lock */ + enum hash_lock_state state; + + /* True if the UDS index should be updated with new advice */ + bool update_advice; + + /* True if the advice has been verified to be a true duplicate */ + bool verified; + + /* + * True if the lock has already accounted for an initial verification + */ + bool verify_counted; + + /* True if this lock is registered in the lock map (cleared on + * rollover) + */ + bool registered; + + /* + * If verified is false, this is the location of a possible duplicate. + * If verified is true, it is the verified location of a true duplicate. 
+ */ + struct zoned_pbn duplicate; + + /* The PBN lock on the block containing the duplicate data */ + struct pbn_lock *duplicate_lock; + + /* The data_vio designated to act on behalf of the lock */ + struct data_vio *agent; + + /* + * Other data_vios with data identical to the agent who are currently + * waiting for the agent to get the information they all need to + * deduplicate--either against each other, or against an existing + * duplicate on disk. + */ + struct wait_queue waiters; +}; + +/** + * vdo_initialize_hash_lock() - Initialize a hash_lock instance which + * has been newly allocated. + * @lock: The lock to initialize. + */ +static inline void vdo_initialize_hash_lock(struct hash_lock *lock) +{ + INIT_LIST_HEAD(&lock->pool_node); + INIT_LIST_HEAD(&lock->duplicate_ring); + initialize_wait_queue(&lock->waiters); +} + +const char * __must_check +vdo_get_hash_lock_state_name(enum hash_lock_state state); + +struct pbn_lock * __must_check +vdo_get_duplicate_lock(struct data_vio *data_vio); + +int __must_check vdo_acquire_hash_lock(struct data_vio *data_vio); + +void vdo_enter_hash_lock(struct data_vio *data_vio); + +void vdo_continue_hash_lock(struct data_vio *data_vio); + +void vdo_continue_hash_lock_on_error(struct data_vio *data_vio); + +void vdo_release_hash_lock(struct data_vio *data_vio); + +void vdo_share_compressed_write_lock(struct data_vio *data_vio, + struct pbn_lock *pbn_lock); + +#endif /* HASH_LOCK_H */ diff --git a/vdo/hash-utils.c b/vdo/hash-utils.c new file mode 100644 index 00000000..ddb36022 --- /dev/null +++ b/vdo/hash-utils.c @@ -0,0 +1,28 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright Red Hat + */ + +#include "hash-utils.h" + +#include "permassert.h" +#include "uds.h" + +/* Compute the number of bits required to store a given value. */ +unsigned int compute_bits(unsigned int max_value) +{ + unsigned int bits = 0; + + while (max_value > 0) { + max_value >>= 1; + bits++; + } + + return bits; +} + +/* Special function wrapper required for compile-time assertions. */ +void hash_utils_compile_time_assertions(void) +{ + STATIC_ASSERT(UDS_CHUNK_NAME_SIZE == 16); +} diff --git a/vdo/hash-utils.h b/vdo/hash-utils.h new file mode 100644 index 00000000..c86279ca --- /dev/null +++ b/vdo/hash-utils.h @@ -0,0 +1,76 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright Red Hat + */ + +#ifndef HASH_UTILS_H +#define HASH_UTILS_H 1 + +#include "compiler.h" +#include "common.h" +#include "geometry.h" +#include "numeric.h" +#include "uds.h" + +/* How various portions of a chunk name are apportioned. */ +enum { + VOLUME_INDEX_BYTES_OFFSET = 0, + VOLUME_INDEX_BYTES_COUNT = 8, + CHAPTER_INDEX_BYTES_OFFSET = 8, + CHAPTER_INDEX_BYTES_COUNT = 6, + SAMPLE_BYTES_OFFSET = 14, + SAMPLE_BYTES_COUNT = 2, +}; + +static INLINE uint64_t +extract_chapter_index_bytes(const struct uds_chunk_name *name) +{ + const byte *chapter_bits = &name->name[CHAPTER_INDEX_BYTES_OFFSET]; + uint64_t bytes = (uint64_t) get_unaligned_be16(chapter_bits) << 32; + + bytes |= get_unaligned_be32(chapter_bits + 2); + return bytes; +} + +static INLINE uint64_t +extract_volume_index_bytes(const struct uds_chunk_name *name) +{ + return get_unaligned_be64(&name->name[VOLUME_INDEX_BYTES_OFFSET]); +} + +static INLINE uint32_t +extract_sampling_bytes(const struct uds_chunk_name *name) +{ + return get_unaligned_be16(&name->name[SAMPLE_BYTES_OFFSET]); +} + +/* Compute the chapter delta list for a given name. 
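compute_bits() above returns the number of bits needed to represent max_value, that is, the index of the highest set bit plus one. A standalone spot check of that behaviour (the loop is repeated under a different name so the example runs outside the kernel tree):

::

    #include <assert.h>

    static unsigned int bits_needed(unsigned int max_value)
    {
            unsigned int bits = 0;

            while (max_value > 0) {
                    max_value >>= 1;
                    bits++;
            }
            return bits;
    }

    int main(void)
    {
            assert(bits_needed(0) == 0);
            assert(bits_needed(1) == 1);
            assert(bits_needed(255) == 8);
            assert(bits_needed(256) == 9);
            assert(bits_needed(1023) == 10);
            return 0;
    }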
*/ +static INLINE unsigned int +hash_to_chapter_delta_list(const struct uds_chunk_name *name, + const struct geometry *geometry) +{ + return (unsigned int) ((extract_chapter_index_bytes(name) >> + geometry->chapter_address_bits) & + ((1 << geometry->chapter_delta_list_bits) - 1)); +} + +/* Compute the chapter delta address for a given name. */ +static INLINE unsigned int +hash_to_chapter_delta_address(const struct uds_chunk_name *name, + const struct geometry *geometry) +{ + return (unsigned int) (extract_chapter_index_bytes(name) & + ((1 << geometry->chapter_address_bits) - 1)); +} + +static INLINE unsigned int name_to_hash_slot(const struct uds_chunk_name *name, + unsigned int slot_count) +{ + return (unsigned int) (extract_chapter_index_bytes(name) % slot_count); +} + +unsigned int __must_check compute_bits(unsigned int max_value); + +void hash_utils_compile_time_assertions(void); + +#endif /* HASH_UTILS_H */ diff --git a/vdo/hash-zone.c b/vdo/hash-zone.c new file mode 100644 index 00000000..5f7ff584 --- /dev/null +++ b/vdo/hash-zone.c @@ -0,0 +1,417 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright Red Hat + */ + +#include "hash-zone.h" + +#include + +#include "logger.h" +#include "memory-alloc.h" +#include "numeric.h" +#include "permassert.h" + +#include "constants.h" +#include "data-vio.h" +#include "hash-lock.h" +#include "kernel-types.h" +#include "pointer-map.h" +#include "statistics.h" +#include "thread-config.h" +#include "types.h" +#include "vdo.h" + +enum { + LOCK_POOL_CAPACITY = MAXIMUM_VDO_USER_VIOS, +}; + +/** + * compare_keys() - Implements pointer_key_comparator. + */ +static bool compare_keys(const void *this_key, const void *that_key) +{ + /* Null keys are not supported. */ + return (memcmp(this_key, that_key, sizeof(struct uds_chunk_name)) == 0); +} + +/** + * hash_key() - Implements pointer_key_comparator. + */ +static uint32_t hash_key(const void *key) +{ + const struct uds_chunk_name *name = key; + /* + * Use a fragment of the chunk name as a hash code. It must not overlap + * with fragments used elsewhere to ensure uniform distributions. + */ + /* XXX pick an offset in the chunk name that isn't used elsewhere */ + return get_unaligned_le32(&name->name[4]); +} + +/** + * vdo_make_hash_zones() - Create the hash zones. + * + * @vdo: The vdo to which the zone will belong. + * @zones_ptr: A pointer to hold the zones. + * + * Return: VDO_SUCCESS or an error code. 
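The apportionment of the 16-byte chunk name is easier to see with concrete bytes. The sketch below extracts the three fields with portable shifts instead of the kernel's get_unaligned_be* helpers and applies the same masking as hash_to_chapter_delta_list() and hash_to_chapter_delta_address(); the geometry bit widths are made-up values, chosen only for illustration.

::

    #include <stdint.h>
    #include <stdio.h>

    /* Made-up geometry widths, for illustration only. */
    #define CHAPTER_ADDRESS_BITS    10
    #define CHAPTER_DELTA_LIST_BITS 12

    static uint64_t be_bytes(const uint8_t *p, unsigned int count)
    {
            uint64_t value = 0;
            unsigned int i;

            for (i = 0; i < count; i++)
                    value = (value << 8) | p[i];
            return value;
    }

    int main(void)
    {
            /* A 16-byte chunk name; the contents are arbitrary. */
            uint8_t name[16] = {
                    0x01, 0x23, 0x45, 0x67, 0x89, 0xab, 0xcd, 0xef,
                    0x10, 0x32, 0x54, 0x76, 0x98, 0xba, 0xdc, 0xfe,
            };

            /* Bytes 0-7: volume index; 8-13: chapter index; 14-15: sampling. */
            uint64_t volume_bits = be_bytes(&name[0], 8);
            uint64_t chapter_bits = be_bytes(&name[8], 6);
            unsigned int sample_bits = (unsigned int) be_bytes(&name[14], 2);

            unsigned int delta_list =
                    (unsigned int) ((chapter_bits >> CHAPTER_ADDRESS_BITS) &
                                    ((1 << CHAPTER_DELTA_LIST_BITS) - 1));
            unsigned int delta_address =
                    (unsigned int) (chapter_bits &
                                    ((1 << CHAPTER_ADDRESS_BITS) - 1));

            printf("volume=%016llx chapter=%012llx sample=%04x\n",
                   (unsigned long long) volume_bits,
                   (unsigned long long) chapter_bits, sample_bits);
            printf("delta list %u, delta address %u\n",
                   delta_list, delta_address);
            return 0;
    }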
+ */ +int vdo_make_hash_zones(struct vdo *vdo, struct hash_zones **zones_ptr) +{ + int result; + vio_count_t i; + struct hash_zones *zones; + zone_count_t zone_count = vdo->thread_config->hash_zone_count; + + if (zone_count == 0) { + return VDO_SUCCESS; + } + + result = UDS_ALLOCATE_EXTENDED(struct hash_zones, + zone_count, + struct hash_zone, + __func__, + &zones); + if (result != VDO_SUCCESS) { + return result; + } + + for (zones->zone_count = 0; + zones->zone_count < zone_count; + zones->zone_count++) { + struct hash_zone *zone = &zones->zones[zones->zone_count]; + + result = make_pointer_map(VDO_LOCK_MAP_CAPACITY, + 0, + compare_keys, + hash_key, + &zone->hash_lock_map); + if (result != VDO_SUCCESS) { + vdo_free_hash_zones(zones); + return result; + } + + zone->zone_number = zones->zone_count; + zone->thread_id = vdo_get_hash_zone_thread(vdo->thread_config, + zone->zone_number); + INIT_LIST_HEAD(&zone->lock_pool); + + result = UDS_ALLOCATE(LOCK_POOL_CAPACITY, + struct hash_lock, + "hash_lock array", + &zone->lock_array); + if (result != VDO_SUCCESS) { + zones->zone_count++; + vdo_free_hash_zones(zones); + return result; + } + + for (i = 0; i < LOCK_POOL_CAPACITY; i++) { + struct hash_lock *lock = &zone->lock_array[i]; + + vdo_initialize_hash_lock(lock); + list_add_tail(&lock->pool_node, &zone->lock_pool); + } + + result = vdo_make_default_thread(vdo, zone->thread_id); + } + + *zones_ptr = zones; + return VDO_SUCCESS; +} + +/** + * vdo_free_hash_zones() - Free the hash zones. + * @zones: The zone to free. + */ +void vdo_free_hash_zones(struct hash_zones *zones) +{ + zone_count_t i; + + if (zones == NULL) { + return; + } + + for (i = 0; i < zones->zone_count; i++) { + struct hash_zone *zone = &zones->zones[i]; + + free_pointer_map(UDS_FORGET(zone->hash_lock_map)); + UDS_FREE(UDS_FORGET(zone->lock_array)); + } + + UDS_FREE(zones); +} + +/** + * vdo_get_hash_zone_thread_id() - Get the ID of a hash zone's thread. + * @zone: The zone. + * + * Return: The zone's thread ID. + */ +thread_id_t vdo_get_hash_zone_thread_id(const struct hash_zone *zone) +{ + return zone->thread_id; +} + +/** + * vdo_get_hash_zone_statistics() - Get the statistics for this hash zone. + * @zone: The hash zone to query. + * + * Return: A copy of the current statistics for the hash zone. + */ +struct hash_lock_statistics +vdo_get_hash_zone_statistics(const struct hash_zone *zone) +{ + const struct hash_lock_statistics *stats = &zone->statistics; + + return (struct hash_lock_statistics) { + .dedupe_advice_valid = + READ_ONCE(stats->dedupe_advice_valid), + .dedupe_advice_stale = + READ_ONCE(stats->dedupe_advice_stale), + .concurrent_data_matches = + READ_ONCE(stats->concurrent_data_matches), + .concurrent_hash_collisions = + READ_ONCE(stats->concurrent_hash_collisions), + }; +} + +/** + * return_hash_lock_to_pool() - Return a hash lock to the zone's pool. + * @zone: The zone from which the lock was borrowed. + * @lock: The lock that is no longer in use. + */ +static void return_hash_lock_to_pool(struct hash_zone *zone, + struct hash_lock *lock) +{ + memset(lock, 0, sizeof(*lock)); + vdo_initialize_hash_lock(lock); + list_add_tail(&lock->pool_node, &zone->lock_pool); +} + +/** + * vdo_acquire_lock_from_hash_zone() - Get the lock for a chunk name. + * @zone: The zone responsible for the hash. + * @hash: The hash to lock. + * @replace_lock: If non-NULL, the lock already registered for the + * hash which should be replaced by the new lock. + * @lock_ptr: A pointer to receive the hash lock. 
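UDS_ALLOCATE_EXTENDED above allocates the hash_zones header and its trailing per-zone array in a single block, which works because struct hash_zones ends in a flexible array member (declared later in hash-zone.h). A plain-C sketch of the same layout, with calloc standing in for the kernel allocator and the structures trimmed to one field each:

::

    #include <stdlib.h>

    struct zone {
            unsigned int zone_number;
    };

    struct zones {
            unsigned int zone_count;
            struct zone zones[];    /* flexible array member */
    };

    static struct zones *make_zones(unsigned int count)
    {
            unsigned int i;
            struct zones *z =
                    calloc(1, sizeof(*z) + count * sizeof(struct zone));

            if (z == NULL)
                    return NULL;
            for (i = 0; i < count; i++)
                    z->zones[i].zone_number = i;
            z->zone_count = count;
            return z;
    }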
+ * + * Gets the lock for the hash (chunk name) of the data in a data_vio, or if + * one does not exist (or if we are explicitly rolling over), initialize a new + * lock for the hash and register it in the zone. This must only be called in + * the correct thread for the zone. + * + * Return: VDO_SUCCESS or an error code. + */ +int vdo_acquire_lock_from_hash_zone(struct hash_zone *zone, + const struct uds_chunk_name *hash, + struct hash_lock *replace_lock, + struct hash_lock **lock_ptr) +{ + struct hash_lock *lock, *new_lock; + + /* + * Borrow and prepare a lock from the pool so we don't have to do two + * pointer_map accesses in the common case of no lock contention. + */ + int result = ASSERT(!list_empty(&zone->lock_pool), + "never need to wait for a free hash lock"); + if (result != VDO_SUCCESS) { + return result; + } + new_lock = list_entry(zone->lock_pool.prev, struct hash_lock, + pool_node); + list_del_init(&new_lock->pool_node); + + /* + * Fill in the hash of the new lock so we can map it, since we have to + * use the hash as the map key. + */ + new_lock->hash = *hash; + + result = pointer_map_put(zone->hash_lock_map, &new_lock->hash, new_lock, + (replace_lock != NULL), (void **) &lock); + if (result != VDO_SUCCESS) { + return_hash_lock_to_pool(zone, UDS_FORGET(new_lock)); + return result; + } + + if (replace_lock != NULL) { + /* + * XXX on mismatch put the old lock back and return a severe + * error + */ + ASSERT_LOG_ONLY(lock == replace_lock, + "old lock must have been in the lock map"); + /* XXX check earlier and bail out? */ + ASSERT_LOG_ONLY(replace_lock->registered, + "old lock must have been marked registered"); + replace_lock->registered = false; + } + + if (lock == replace_lock) { + lock = new_lock; + lock->registered = true; + } else { + /* + * There's already a lock for the hash, so we don't need the + * borrowed lock. + */ + return_hash_lock_to_pool(zone, UDS_FORGET(new_lock)); + } + + *lock_ptr = lock; + return VDO_SUCCESS; +} + +/** + * vdo_return_lock_to_hash_zone() - Return a hash lock. + * @zone: The zone from which the lock was borrowed. + * @lock: The lock that is no longer in use. + * + * Returns a hash lock to the zone it was borrowed from, remove it from the + * zone's lock map, and return it to the pool. This must only be called when + * the lock has been completely released, and only in the correct thread for + * the zone. 
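The acquire path above borrows a lock from the pool first so that the common, uncontended case touches the map exactly once, and hands the borrowed lock straight back if a lock for the same hash is already registered. A much-simplified standalone sketch of that shape; the fixed-size pool, the linear-scan "map", and all names are inventions of the example, and the rollover (replace_lock) case is omitted.

::

    #include <stdbool.h>
    #include <stddef.h>
    #include <string.h>

    #define NAME_SIZE 16
    #define POOL_SIZE 8
    #define MAP_SIZE  8

    struct lock {
            unsigned char hash[NAME_SIZE];
            bool in_use;
    };

    static struct lock pool[POOL_SIZE];
    static struct lock *map[MAP_SIZE];

    static struct lock *borrow_lock(void)
    {
            int i;

            for (i = 0; i < POOL_SIZE; i++) {
                    if (!pool[i].in_use) {
                            pool[i].in_use = true;
                            return &pool[i];
                    }
            }
            return NULL;  /* The real code asserts the pool is never empty. */
    }

    static void return_lock(struct lock *lock)
    {
            memset(lock, 0, sizeof(*lock));
    }

    static struct lock *acquire_lock(const unsigned char hash[NAME_SIZE])
    {
            int i, free_slot = -1;
            struct lock *new_lock = borrow_lock();

            if (new_lock == NULL)
                    return NULL;
            memcpy(new_lock->hash, hash, NAME_SIZE);

            for (i = 0; i < MAP_SIZE; i++) {
                    if (map[i] == NULL) {
                            if (free_slot < 0)
                                    free_slot = i;
                    } else if (memcmp(map[i]->hash, hash, NAME_SIZE) == 0) {
                            /* A lock for this hash exists: share it. */
                            return_lock(new_lock);
                            return map[i];
                    }
            }

            if (free_slot < 0) {
                    return_lock(new_lock);
                    return NULL;  /* Toy map is full; the real map grows. */
            }
            map[free_slot] = new_lock;
            return new_lock;
    }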
+ */ +void vdo_return_lock_to_hash_zone(struct hash_zone *zone, + struct hash_lock *lock) +{ + if (lock->registered) { + struct hash_lock *removed = + pointer_map_remove(zone->hash_lock_map, &lock->hash); + ASSERT_LOG_ONLY(lock == removed, + "hash lock being released must have been mapped"); + } else { + ASSERT_LOG_ONLY(lock != pointer_map_get(zone->hash_lock_map, + &lock->hash), + "unregistered hash lock must not be in the lock map"); + } + + ASSERT_LOG_ONLY(!has_waiters(&lock->waiters), + "hash lock returned to zone must have no waiters"); + ASSERT_LOG_ONLY((lock->duplicate_lock == NULL), + "hash lock returned to zone must not reference a PBN lock"); + ASSERT_LOG_ONLY((lock->state == VDO_HASH_LOCK_DESTROYING), + "returned hash lock must not be in use with state %s", + vdo_get_hash_lock_state_name(lock->state)); + ASSERT_LOG_ONLY(list_empty(&lock->pool_node), + "hash lock returned to zone must not be in a pool ring"); + ASSERT_LOG_ONLY(list_empty(&lock->duplicate_ring), + "hash lock returned to zone must not reference DataVIOs"); + + return_hash_lock_to_pool(zone, lock); +} + +/** + * dump_hash_lock() - Dump a compact description of hash_lock to the log if + * the lock is not on the free list. + * @lock: The hash lock to dump. + */ +static void dump_hash_lock(const struct hash_lock *lock) +{ + const char *state; + + if (!list_empty(&lock->pool_node)) { + /* This lock is on the free list. */ + return; + } + + /* + * Necessarily cryptic since we can log a lot of these. First three + * chars of state is unambiguous. 'U' indicates a lock not registered in + * the map. + */ + state = vdo_get_hash_lock_state_name(lock->state); + uds_log_info(" hl %px: %3.3s %c%llu/%u rc=%u wc=%zu agt=%px", + (const void *) lock, state, (lock->registered ? 'D' : 'U'), + (unsigned long long) lock->duplicate.pbn, + lock->duplicate.state, lock->reference_count, + count_waiters(&lock->waiters), (void *) lock->agent); +} + +/** + * increment_stat() - Increment a statistic counter in a non-atomic yet + * thread-safe manner. + * @stat: The statistic field to increment. + */ +static void increment_stat(uint64_t *stat) +{ + /* + * Must only be mutated on the hash zone thread. Prevents any compiler + * shenanigans from affecting other threads reading stats. + */ + WRITE_ONCE(*stat, *stat + 1); +} + +/** + * vdo_bump_hash_zone_valid_advice_count() - Increment the valid advice count + * in the hash zone statistics. + * @zone: The hash zone of the lock that received valid advice. + * + * Context: Must only be called from the hash zone thread. + */ +void vdo_bump_hash_zone_valid_advice_count(struct hash_zone *zone) +{ + increment_stat(&zone->statistics.dedupe_advice_valid); +} + +/** + * vdo_bump_hash_zone_stale_advice_count() - Increment the stale advice count + * in the hash zone statistics. + * @zone: The hash zone of the lock that received stale advice. + * + * Context: Must only be called from the hash zone thread. + */ +void vdo_bump_hash_zone_stale_advice_count(struct hash_zone *zone) +{ + increment_stat(&zone->statistics.dedupe_advice_stale); +} + +/** + * vdo_bump_hash_zone_data_match_count() - Increment the concurrent dedupe + * count in the hash zone statistics. + * @zone: The hash zone of the lock that matched a new data_vio. + * + * Context: Must only be called from the hash zone thread. 
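increment_stat() is safe only because each counter has exactly one writer (the hash zone thread); WRITE_ONCE and READ_ONCE merely keep the compiler from tearing or caching the accesses. A userspace analogue of the same single-writer, many-reader pattern, sketched with C11 relaxed atomics instead of the kernel macros:

::

    #include <stdatomic.h>
    #include <stdint.h>

    struct zone_stats {
            /* Written only by the owning thread; read from anywhere. */
            atomic_uint_fast64_t dedupe_advice_valid;
            atomic_uint_fast64_t dedupe_advice_stale;
    };

    struct stats_snapshot {
            uint64_t dedupe_advice_valid;
            uint64_t dedupe_advice_stale;
    };

    /* Only the owning thread calls this, so load-add-store needs no RMW. */
    static void increment_stat(atomic_uint_fast64_t *stat)
    {
            uint_fast64_t value =
                    atomic_load_explicit(stat, memory_order_relaxed);

            atomic_store_explicit(stat, value + 1, memory_order_relaxed);
    }

    /* Any thread may take a snapshot; each field is read atomically. */
    static struct stats_snapshot snapshot_stats(struct zone_stats *stats)
    {
            return (struct stats_snapshot) {
                    .dedupe_advice_valid = atomic_load_explicit(
                            &stats->dedupe_advice_valid,
                            memory_order_relaxed),
                    .dedupe_advice_stale = atomic_load_explicit(
                            &stats->dedupe_advice_stale,
                            memory_order_relaxed),
            };
    }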
+ */ +void vdo_bump_hash_zone_data_match_count(struct hash_zone *zone) +{ + increment_stat(&zone->statistics.concurrent_data_matches); +} + +/** + * vdo_bump_hash_zone_collision_count() - Increment the concurrent hash + * collision count in the hash zone + * statistics. + * @zone: The hash zone of the lock that rejected a colliding data_vio. + * + * Context: Must only be called from the hash zone thread. + */ +void vdo_bump_hash_zone_collision_count(struct hash_zone *zone) +{ + increment_stat(&zone->statistics.concurrent_hash_collisions); +} + +/** + * vdo_dump_hash_zone() - Dump information about a hash zone to the log for + * debugging. + * @zone: The zone to dump. + */ +void vdo_dump_hash_zone(const struct hash_zone *zone) +{ + vio_count_t i; + + if (zone->hash_lock_map == NULL) { + uds_log_info("struct hash_zone %u: NULL map", zone->zone_number); + return; + } + + uds_log_info("struct hash_zone %u: mapSize=%zu", zone->zone_number, + pointer_map_size(zone->hash_lock_map)); + for (i = 0; i < LOCK_POOL_CAPACITY; i++) { + dump_hash_lock(&zone->lock_array[i]); + } +} diff --git a/vdo/hash-zone.h b/vdo/hash-zone.h new file mode 100644 index 00000000..00afd0a7 --- /dev/null +++ b/vdo/hash-zone.h @@ -0,0 +1,77 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright Red Hat + */ + +#ifndef HASH_ZONE_H +#define HASH_ZONE_H + +#include + +#include "uds.h" + +#include "kernel-types.h" +#include "statistics.h" +#include "types.h" + +struct hash_zone { + /* Which hash zone this is */ + zone_count_t zone_number; + + /* The thread ID for this zone */ + thread_id_t thread_id; + + /* Mapping from chunk_name fields to hash_locks */ + struct pointer_map *hash_lock_map; + + /* List containing all unused hash_locks */ + struct list_head lock_pool; + + /* + * Statistics shared by all hash locks in this zone. Only modified on + * the hash zone thread, but queried by other threads. 
+ */ + struct hash_lock_statistics statistics; + + /* Array of all hash_locks */ + struct hash_lock *lock_array; +}; + +struct hash_zones { + /* The number of zones */ + zone_count_t zone_count; + /* The hash zones themselves */ + struct hash_zone zones[]; +}; + +int __must_check +vdo_make_hash_zones(struct vdo *vdo, struct hash_zones **zones_ptr); + +void vdo_free_hash_zones(struct hash_zones *zones); + +thread_id_t __must_check +vdo_get_hash_zone_thread_id(const struct hash_zone *zone); + +struct hash_lock_statistics __must_check +vdo_get_hash_zone_statistics(const struct hash_zone *zone); + +int __must_check +vdo_acquire_lock_from_hash_zone(struct hash_zone *zone, + const struct uds_chunk_name *hash, + struct hash_lock *replace_lock, + struct hash_lock **lock_ptr); + +void vdo_return_lock_to_hash_zone(struct hash_zone *zone, + struct hash_lock *lock); + +void vdo_bump_hash_zone_valid_advice_count(struct hash_zone *zone); + +void vdo_bump_hash_zone_stale_advice_count(struct hash_zone *zone); + +void vdo_bump_hash_zone_data_match_count(struct hash_zone *zone); + +void vdo_bump_hash_zone_collision_count(struct hash_zone *zone); + +void vdo_dump_hash_zone(const struct hash_zone *zone); + +#endif /* HASH_ZONE_H */ diff --git a/vdo/hashLock.h b/vdo/hashLock.h deleted file mode 100644 index 69834cda..00000000 --- a/vdo/hashLock.h +++ /dev/null @@ -1,103 +0,0 @@ -/* - * Copyright Red Hat - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/sulfur/src/c++/vdo/base/hashLock.h#9 $ - */ - -#ifndef HASH_LOCK_H -#define HASH_LOCK_H - -#include "types.h" - -/** - * Get the PBN lock on the duplicate data location for a data_vio from the - * hash_lock the data_vio holds (if there is one). - * - * @param data_vio The data_vio to query - * - * @return The PBN lock on the data_vio's duplicate location - **/ -struct pbn_lock * __must_check -get_vdo_duplicate_lock(struct data_vio *data_vio); - -/** - * Acquire or share a lock on the hash (chunk name) of the data in a data_vio, - * updating the data_vio to reference the lock. This must only be called in the - * correct thread for the zone. In the unlikely case of a hash collision, this - * function will succeed, but the data_vio will not get a lock reference. - * - * @param data_vio The data_vio acquiring a lock on its chunk name - **/ -int __must_check acquire_vdo_hash_lock(struct data_vio *data_vio); - -/** - * Asynchronously process a data_vio that has just acquired its reference to a - * hash lock. This may place the data_vio on a wait queue, or it may use the - * data_vio to perform operations on the lock's behalf. 
- * - * @param data_vio The data_vio that has just acquired a lock on its chunk - * name - **/ -void enter_vdo_hash_lock(struct data_vio *data_vio); - -/** - * Asynchronously continue processing a data_vio in its hash lock after it has - * finished writing, compressing, or deduplicating, so it can share the result - * with any data_vios waiting in the hash lock, or update the UDS index, or - * simply release its share of the lock. This must only be called in the - * correct thread for the hash zone. - * - * @param data_vio The data_vio to continue processing in its hash lock - **/ -void continue_vdo_hash_lock(struct data_vio *data_vio); - -/** - * Re-enter the hash lock after encountering an error, to clean up the hash - * lock. - * - * @param data_vio The data_vio with an error - **/ -void continue_vdo_hash_lock_on_error(struct data_vio *data_vio); - -/** - * Release a data_vio's share of a hash lock, if held, and null out the - * data_vio's reference to it. This must only be called in the correct thread - * for the hash zone. - * - * If the data_vio is the only one holding the lock, this also releases any - * resources or locks used by the hash lock (such as a PBN read lock on a - * block containing data with the same hash) and returns the lock to the hash - * zone's lock pool. - * - * @param data_vio The data_vio releasing its hash lock - **/ -void release_vdo_hash_lock(struct data_vio *data_vio); - -/** - * Make a data_vio's hash lock a shared holder of the PBN lock on the - * compressed block to which its data was just written. If the lock is still a - * write lock (as it will be for the first share), it will be converted to a - * read lock. This also reserves a reference count increment for the data_vio. - * - * @param data_vio The data_vio which was just compressed - * @param pbn_lock The PBN lock on the compressed block - **/ -void share_compressed_vdo_write_lock(struct data_vio *data_vio, - struct pbn_lock *pbn_lock); - -#endif // HASH_LOCK_H diff --git a/vdo/hashLockInternals.h b/vdo/hashLockInternals.h deleted file mode 100644 index ac10f642..00000000 --- a/vdo/hashLockInternals.h +++ /dev/null @@ -1,145 +0,0 @@ -/* - * Copyright Red Hat - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/sulfur/src/c++/vdo/base/hashLockInternals.h#4 $ - */ - -#ifndef HASH_LOCK_INTERNALS_H -#define HASH_LOCK_INTERNALS_H - -#include - -#include "completion.h" -#include "types.h" -#include "uds.h" -#include "waitQueue.h" - -enum hash_lock_state { - /** State for locks that are not in use or are being initialized. */ - VDO_HASH_LOCK_INITIALIZING = 0, - - // This is the sequence of states typically used on the non-dedupe path. - VDO_HASH_LOCK_QUERYING, - VDO_HASH_LOCK_WRITING, - VDO_HASH_LOCK_UPDATING, - - // The remaining states are typically used on the dedupe path in this - // order. 
- VDO_HASH_LOCK_LOCKING, - VDO_HASH_LOCK_VERIFYING, - VDO_HASH_LOCK_DEDUPING, - VDO_HASH_LOCK_UNLOCKING, - - // XXX This is a temporary state denoting a lock which is sending VIOs - // back to the old dedupe and vioWrite pathways. It won't be in the - // final version of VDOSTORY-190. - VDO_HASH_LOCK_BYPASSING, - - /** - * Terminal state for locks returning to the pool. Must be last both - * because it's the final state, and also because it's used to count - * the states. - **/ - VDO_HASH_LOCK_DESTROYING, -}; - -struct hash_lock { - /** The block hash covered by this lock */ - struct uds_chunk_name hash; - - /** - * When the lock is unused, this list entry allows the lock to be - * pooled - */ - struct list_head pool_node; - - /** - * A list containing the data VIOs sharing this lock, all having the - * same chunk name and data block contents, linked by their - * hash_lock_node fields. - **/ - struct list_head duplicate_ring; - - /** The number of data_vios sharing this lock instance */ - vio_count_t reference_count; - - /** The maximum value of reference_count in the lifetime of this lock */ - vio_count_t max_references; - - /** The current state of this lock */ - enum hash_lock_state state; - - /** True if the UDS index should be updated with new advice */ - bool update_advice; - - /** True if the advice has been verified to be a true duplicate */ - bool verified; - - /** - * True if the lock has already accounted for an initial verification - */ - bool verify_counted; - - /** True if this lock is registered in the lock map (cleared on - * rollover) - */ - bool registered; - - /** - * If verified is false, this is the location of a possible duplicate. - * If verified is true, is is the verified location of a true duplicate. - **/ - struct zoned_pbn duplicate; - - /** The PBN lock on the block containing the duplicate data */ - struct pbn_lock *duplicate_lock; - - /** The data_vio designated to act on behalf of the lock */ - struct data_vio *agent; - - /** - * Other data_vios with data identical to the agent who are currently - * waiting for the agent to get the information they all need to - * deduplicate--either against each other, or against an existing - * duplicate on disk. - **/ - struct wait_queue waiters; -}; - -/** - * Initialize a hash_lock instance which has been newly allocated. - * - * @param lock The lock to initialize - **/ -static inline void initialize_vdo_hash_lock(struct hash_lock *lock) -{ - INIT_LIST_HEAD(&lock->pool_node); - INIT_LIST_HEAD(&lock->duplicate_ring); - initialize_wait_queue(&lock->waiters); -} - -/** - * Get the string representation of a hash lock state. - * - * @param state The hash lock state - * - * @return The short string representing the state - **/ -const char * __must_check get_vdo_hash_lock_state_name(enum hash_lock_state state); - -#endif // HASH_LOCK_INTERNALS_H diff --git a/vdo/hashZone.c b/vdo/hashZone.c deleted file mode 100644 index c42817fa..00000000 --- a/vdo/hashZone.c +++ /dev/null @@ -1,346 +0,0 @@ -/* - * Copyright Red Hat - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. 
- * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/sulfur/src/c++/vdo/base/hashZone.c#24 $ - */ - -#include "hashZone.h" - -#include - -#include "logger.h" -#include "memoryAlloc.h" -#include "numeric.h" -#include "permassert.h" - -#include "constants.h" -#include "dataVIO.h" -#include "hashLock.h" -#include "hashLockInternals.h" -#include "pointerMap.h" -#include "statistics.h" -#include "threadConfig.h" -#include "types.h" -#include "vdoInternal.h" - -enum { - LOCK_POOL_CAPACITY = MAXIMUM_VDO_USER_VIOS, -}; - -struct hash_zone { - /** Which hash zone this is */ - zone_count_t zone_number; - - /** The thread ID for this zone */ - thread_id_t thread_id; - - /** Mapping from chunk_name fields to hash_locks */ - struct pointer_map *hash_lock_map; - - /** List containing all unused hash_locks */ - struct list_head lock_pool; - - /** - * Statistics shared by all hash locks in this zone. Only modified on - * the hash zone thread, but queried by other threads. - **/ - struct hash_lock_statistics statistics; - - /** Array of all hash_locks */ - struct hash_lock *lock_array; -}; - -/** - * Implements pointer_key_comparator. - **/ -static bool compare_keys(const void *this_key, const void *that_key) -{ - // Null keys are not supported. - return (memcmp(this_key, that_key, sizeof(struct uds_chunk_name)) == 0); -} - -/** - * Implements pointer_key_comparator. - **/ -static uint32_t hash_key(const void *key) -{ - const struct uds_chunk_name *name = key; - /* - * Use a fragment of the chunk name as a hash code. It must not overlap - * with fragments used elsewhere to ensure uniform distributions. 
- */ - // XXX pick an offset in the chunk name that isn't used elsewhere - return get_unaligned_le32(&name->name[4]); -} - -/**********************************************************************/ -int make_vdo_hash_zone(struct vdo *vdo, zone_count_t zone_number, - struct hash_zone **zone_ptr) -{ - vio_count_t i; - struct hash_zone *zone; - int result = UDS_ALLOCATE(1, struct hash_zone, __func__, &zone); - if (result != VDO_SUCCESS) { - return result; - } - - result = make_pointer_map(VDO_LOCK_MAP_CAPACITY, 0, compare_keys, - hash_key, &zone->hash_lock_map); - if (result != VDO_SUCCESS) { - free_vdo_hash_zone(zone); - return result; - } - - zone->zone_number = zone_number; - zone->thread_id = vdo_get_hash_zone_thread(get_vdo_thread_config(vdo), - zone_number); - INIT_LIST_HEAD(&zone->lock_pool); - - result = UDS_ALLOCATE(LOCK_POOL_CAPACITY, struct hash_lock, - "hash_lock array", &zone->lock_array); - if (result != VDO_SUCCESS) { - free_vdo_hash_zone(zone); - return result; - } - - for (i = 0; i < LOCK_POOL_CAPACITY; i++) { - struct hash_lock *lock = &zone->lock_array[i]; - initialize_vdo_hash_lock(lock); - list_add_tail(&lock->pool_node, &zone->lock_pool); - } - - *zone_ptr = zone; - return VDO_SUCCESS; -} - -/**********************************************************************/ -void free_vdo_hash_zone(struct hash_zone *zone) -{ - if (zone == NULL) { - return; - } - - free_pointer_map(UDS_FORGET(zone->hash_lock_map)); - UDS_FREE(UDS_FORGET(zone->lock_array)); - UDS_FREE(zone); -} - -/**********************************************************************/ -zone_count_t get_vdo_hash_zone_number(const struct hash_zone *zone) -{ - return zone->zone_number; -} - -/**********************************************************************/ -thread_id_t get_vdo_hash_zone_thread_id(const struct hash_zone *zone) -{ - return zone->thread_id; -} - -/**********************************************************************/ -struct hash_lock_statistics -get_vdo_hash_zone_statistics(const struct hash_zone *zone) -{ - const struct hash_lock_statistics *stats = &zone->statistics; - return (struct hash_lock_statistics) { - .dedupe_advice_valid = - READ_ONCE(stats->dedupe_advice_valid), - .dedupe_advice_stale = - READ_ONCE(stats->dedupe_advice_stale), - .concurrent_data_matches = - READ_ONCE(stats->concurrent_data_matches), - .concurrent_hash_collisions = - READ_ONCE(stats->concurrent_hash_collisions), - }; -} - -/** - * Return a hash lock to the zone's pool. - * - * @param zone The zone from which the lock was borrowed - * @param lock The lock that is no longer in use - **/ -static void return_hash_lock_to_pool(struct hash_zone *zone, - struct hash_lock *lock) -{ - memset(lock, 0, sizeof(*lock)); - initialize_vdo_hash_lock(lock); - list_add_tail(&lock->pool_node, &zone->lock_pool); -} - -/**********************************************************************/ -int acquire_lock_from_vdo_hash_zone(struct hash_zone *zone, - const struct uds_chunk_name *hash, - struct hash_lock *replace_lock, - struct hash_lock **lock_ptr) -{ - struct hash_lock *lock, *new_lock; - - // Borrow and prepare a lock from the pool so we don't have to do two - // pointer_map accesses in the common case of no lock contention. 
- int result = ASSERT(!list_empty(&zone->lock_pool), - "never need to wait for a free hash lock"); - if (result != VDO_SUCCESS) { - return result; - } - new_lock = list_entry(zone->lock_pool.prev, struct hash_lock, - pool_node); - list_del_init(&new_lock->pool_node); - - // Fill in the hash of the new lock so we can map it, since we have to - // use the hash as the map key. - new_lock->hash = *hash; - - result = pointer_map_put(zone->hash_lock_map, &new_lock->hash, new_lock, - (replace_lock != NULL), (void **) &lock); - if (result != VDO_SUCCESS) { - return_hash_lock_to_pool(zone, UDS_FORGET(new_lock)); - return result; - } - - if (replace_lock != NULL) { - // XXX on mismatch put the old lock back and return a severe - // error - ASSERT_LOG_ONLY(lock == replace_lock, - "old lock must have been in the lock map"); - // XXX check earlier and bail out? - ASSERT_LOG_ONLY(replace_lock->registered, - "old lock must have been marked registered"); - replace_lock->registered = false; - } - - if (lock == replace_lock) { - lock = new_lock; - lock->registered = true; - } else { - // There's already a lock for the hash, so we don't need the - // borrowed lock. - return_hash_lock_to_pool(zone, UDS_FORGET(new_lock)); - } - - *lock_ptr = lock; - return VDO_SUCCESS; -} - -/**********************************************************************/ -void return_lock_to_vdo_hash_zone(struct hash_zone *zone, - struct hash_lock *lock) -{ - if (lock->registered) { - struct hash_lock *removed = - pointer_map_remove(zone->hash_lock_map, &lock->hash); - ASSERT_LOG_ONLY(lock == removed, - "hash lock being released must have been mapped"); - } else { - ASSERT_LOG_ONLY(lock != pointer_map_get(zone->hash_lock_map, - &lock->hash), - "unregistered hash lock must not be in the lock map"); - } - - ASSERT_LOG_ONLY(!has_waiters(&lock->waiters), - "hash lock returned to zone must have no waiters"); - ASSERT_LOG_ONLY((lock->duplicate_lock == NULL), - "hash lock returned to zone must not reference a PBN lock"); - ASSERT_LOG_ONLY((lock->state == VDO_HASH_LOCK_DESTROYING), - "returned hash lock must not be in use with state %s", - get_vdo_hash_lock_state_name(lock->state)); - ASSERT_LOG_ONLY(list_empty(&lock->pool_node), - "hash lock returned to zone must not be in a pool ring"); - ASSERT_LOG_ONLY(list_empty(&lock->duplicate_ring), - "hash lock returned to zone must not reference DataVIOs"); - - return_hash_lock_to_pool(zone, lock); -} - -/** - * Dump a compact description of hash_lock to the log if the lock is not on the - * free list. - * - * @param lock The hash lock to dump - **/ -static void dump_hash_lock(const struct hash_lock *lock) -{ - const char *state; - - if (!list_empty(&lock->pool_node)) { - // This lock is on the free list. - return; - } - - // Necessarily cryptic since we can log a lot of these. First three - // chars of state is unambiguous. 'U' indicates a lock not registered in - // the map. - state = get_vdo_hash_lock_state_name(lock->state); - uds_log_info(" hl %px: %3.3s %c%llu/%u rc=%u wc=%zu agt=%px", - (const void *) lock, state, (lock->registered ? 'D' : 'U'), - (unsigned long long) lock->duplicate.pbn, - lock->duplicate.state, lock->reference_count, - count_waiters(&lock->waiters), (void *) lock->agent); -} - -/** - * Increment a statistic counter in a non-atomic yet thread-safe manner. - * - * @param stat The statistic field to increment - **/ -static void increment_stat(uint64_t *stat) -{ - // Must only be mutated on the hash zone thread. 
Prevents any compiler - // shenanigans from affecting other threads reading stats. - WRITE_ONCE(*stat, *stat + 1); -} - -/**********************************************************************/ -void bump_vdo_hash_zone_valid_advice_count(struct hash_zone *zone) -{ - increment_stat(&zone->statistics.dedupe_advice_valid); -} - -/**********************************************************************/ -void bump_vdo_hash_zone_stale_advice_count(struct hash_zone *zone) -{ - increment_stat(&zone->statistics.dedupe_advice_stale); -} - -/**********************************************************************/ -void bump_vdo_hash_zone_data_match_count(struct hash_zone *zone) -{ - increment_stat(&zone->statistics.concurrent_data_matches); -} - -/**********************************************************************/ -void bump_vdo_hash_zone_collision_count(struct hash_zone *zone) -{ - increment_stat(&zone->statistics.concurrent_hash_collisions); -} - -/**********************************************************************/ -void dump_vdo_hash_zone(const struct hash_zone *zone) -{ - vio_count_t i; - if (zone->hash_lock_map == NULL) { - uds_log_info("struct hash_zone %u: NULL map", zone->zone_number); - return; - } - - uds_log_info("struct hash_zone %u: mapSize=%zu", zone->zone_number, - pointer_map_size(zone->hash_lock_map)); - for (i = 0; i < LOCK_POOL_CAPACITY; i++) { - dump_hash_lock(&zone->lock_array[i]); - } -} diff --git a/vdo/hashZone.h b/vdo/hashZone.h deleted file mode 100644 index aacd0da6..00000000 --- a/vdo/hashZone.h +++ /dev/null @@ -1,151 +0,0 @@ -/* - * Copyright Red Hat - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/sulfur/src/c++/vdo/base/hashZone.h#10 $ - */ - -#ifndef HASH_ZONE_H -#define HASH_ZONE_H - -#include "uds.h" - -#include "statistics.h" -#include "types.h" - -/** - * Create a hash zone. - * - * @param [in] vdo The vdo to which the zone will belong - * @param [in] zone_number The number of the zone to create - * @param [out] zone_ptr A pointer to hold the new hash_zone - * - * @return VDO_SUCCESS or an error code - **/ -int __must_check make_vdo_hash_zone(struct vdo *vdo, - zone_count_t zone_number, - struct hash_zone **zone_ptr); - -/** - * Free a hash zone. - * - * @param zone The zone to free - **/ -void free_vdo_hash_zone(struct hash_zone *zone); - -/** - * Get the zone number of a hash zone. - * - * @param zone The zone - * - * @return The number of the zone - **/ -zone_count_t __must_check -get_vdo_hash_zone_number(const struct hash_zone *zone); - -/** - * Get the ID of a hash zone's thread. - * - * @param zone The zone - * - * @return The zone's thread ID - **/ -thread_id_t __must_check -get_vdo_hash_zone_thread_id(const struct hash_zone *zone); - -/** - * Get the statistics for this hash zone. 
- * - * @param zone The hash zone to query - * - * @return A copy of the current statistics for the hash zone - **/ -struct hash_lock_statistics __must_check -get_vdo_hash_zone_statistics(const struct hash_zone *zone); - -/** - * Get the lock for the hash (chunk name) of the data in a data_vio, or if one - * does not exist (or if we are explicitly rolling over), initialize a new - * lock for the hash and register it in the zone. This must only be called in - * the correct thread for the zone. - * - * @param [in] zone The zone responsible for the hash - * @param [in] hash The hash to lock - * @param [in] replace_lock If non-NULL, the lock already registered for the - * hash which should be replaced by the new lock - * @param [out] lock_ptr A pointer to receive the hash lock - * - * @return VDO_SUCCESS or an error code - **/ -int __must_check -acquire_lock_from_vdo_hash_zone(struct hash_zone *zone, - const struct uds_chunk_name *hash, - struct hash_lock *replace_lock, - struct hash_lock **lock_ptr); - -/** - * Return a hash lock to the zone it was borrowed from, remove it from the - * zone's lock map, and return it to the pool. This must only be called when - * the lock has been completely released, and only in the correct thread for - * the zone. - * - * @param zone The zone from which the lock was borrowed - * @param lock The lock that is no longer in use - **/ -void return_lock_to_vdo_hash_zone(struct hash_zone *zone, - struct hash_lock *lock); - -/** - * Increment the valid advice count in the hash zone statistics. - * Must only be called from the hash zone thread. - * - * @param zone The hash zone of the lock that received valid advice - **/ -void bump_vdo_hash_zone_valid_advice_count(struct hash_zone *zone); - -/** - * Increment the stale advice count in the hash zone statistics. - * Must only be called from the hash zone thread. - * - * @param zone The hash zone of the lock that received stale advice - **/ -void bump_vdo_hash_zone_stale_advice_count(struct hash_zone *zone); - -/** - * Increment the concurrent dedupe count in the hash zone statistics. - * Must only be called from the hash zone thread. - * - * @param zone The hash zone of the lock that matched a new data_vio - **/ -void bump_vdo_hash_zone_data_match_count(struct hash_zone *zone); - -/** - * Increment the concurrent hash collision count in the hash zone statistics. - * Must only be called from the hash zone thread. - * - * @param zone The hash zone of the lock that rejected a colliding data_vio - **/ -void bump_vdo_hash_zone_collision_count(struct hash_zone *zone); - -/** - * Dump information about a hash zone to the log for debugging. - * - * @param zone The zone to dump - **/ -void dump_vdo_hash_zone(const struct hash_zone *zone); - -#endif // HASH_ZONE_H diff --git a/vdo/header.c b/vdo/header.c index dbddee22..34ada4b2 100644 --- a/vdo/header.c +++ b/vdo/header.c @@ -1,36 +1,32 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * Copyright Red Hat - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. 
- * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/sulfur/src/c++/vdo/base/header.c#10 $ */ #include "header.h" #include "logger.h" #include "permassert.h" -#include "statusCodes.h" - -/**********************************************************************/ -int validate_vdo_version(struct version_number expected_version, +#include "status-codes.h" + +/** + * vdo_validate_version() - Check whether a version matches an expected + * version. + * @expected_version: The expected version. + * @actual_version: The version being validated. + * @component_name: The name of the component or the calling function + * (for error logging). + * + * Logs an error describing a mismatch. + * + * Return: VDO_SUCCESS if the versions are the same, + * VDO_UNSUPPORTED_VERSION if the versions don't match. + */ +int vdo_validate_version(struct version_number expected_version, struct version_number actual_version, const char *component_name) { - if (!are_same_vdo_version(expected_version, actual_version)) { + if (!vdo_are_same_version(expected_version, actual_version)) { return uds_log_error_strerror(VDO_UNSUPPORTED_VERSION, "%s version mismatch, expected %d.%d, got %d.%d", component_name, @@ -42,13 +38,29 @@ int validate_vdo_version(struct version_number expected_version, return VDO_SUCCESS; } -/**********************************************************************/ -int validate_vdo_header(const struct header *expected_header, +/** + * vdo_validate_header() - Check whether a header matches expectations. + * @expected_header: The expected header. + * @actual_header: The header being validated. + * @exact_size: If true, the size fields of the two headers must be the same, + * otherwise it is required that actual_header.size >= + * expected_header.size. + * @component_name: The name of the component or the calling function + * (for error logging). + * + * Logs an error describing the first mismatch found. + * + * Return: VDO_SUCCESS if the header meets expectations, + * VDO_INCORRECT_COMPONENT if the component ids don't match, + * VDO_UNSUPPORTED_VERSION if the versions or sizes don't match. + */ +int vdo_validate_header(const struct header *expected_header, const struct header *actual_header, bool exact_size, const char *component_name) { int result; + if (expected_header->id != actual_header->id) { return uds_log_error_strerror(VDO_INCORRECT_COMPONENT, "%s ID mismatch, expected %d, got %d", @@ -57,7 +69,7 @@ int validate_vdo_header(const struct header *expected_header, actual_header->id); } - result = validate_vdo_version(expected_header->version, + result = vdo_validate_version(expected_header->version, actual_header->version, component_name); if (result != VDO_SUCCESS) { return result; @@ -75,8 +87,14 @@ int validate_vdo_header(const struct header *expected_header, return VDO_SUCCESS; } -/**********************************************************************/ -int encode_vdo_header(const struct header *header, struct buffer *buffer) +/** + * vdo_encode_header() - Encode a header into a buffer. + * @header: The header to encode. + * @buffer: The buffer in which to encode the header. + * + * Return: UDS_SUCCESS or an error. 
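A minimal usage sketch, not part of this patch: how a hypothetical caller might check a header it has just decoded against the version it supports. The component id, version numbers, and payload size below are invented for illustration::

    #include "header.h"

    /* Illustrative values only; real components define their own. */
    static const struct header expected_journal_header = {
            .id = VDO_RECOVERY_JOURNAL,
            .version = {
                    .major_version = 7,
                    .minor_version = 0,
            },
            .size = 64,
    };

    static int check_decoded_header(const struct header *actual)
    {
            /* exact_size is false, so a larger on-disk payload is allowed. */
            return vdo_validate_header(&expected_journal_header, actual,
                                       false, "recovery journal");
    }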
+ */ +int vdo_encode_header(const struct header *header, struct buffer *buffer) { int result; @@ -89,7 +107,7 @@ int encode_vdo_header(const struct header *header, struct buffer *buffer) return result; } - result = encode_vdo_version_number(header->version, buffer); + result = vdo_encode_version_number(header->version, buffer); if (result != UDS_SUCCESS) { return result; } @@ -97,27 +115,41 @@ int encode_vdo_header(const struct header *header, struct buffer *buffer) return put_uint64_le_into_buffer(buffer, header->size); } -/**********************************************************************/ -int encode_vdo_version_number(struct version_number version, +/** + * vdo_encode_version_number() - Encode a version number into a buffer. + * @version: The version to encode. + * @buffer: The buffer in which to encode the version. + * + * Return: UDS_SUCCESS or an error. + */ +int vdo_encode_version_number(struct version_number version, struct buffer *buffer) { - struct packed_version_number packed = pack_vdo_version_number(version); + struct packed_version_number packed = vdo_pack_version_number(version); + return put_bytes(buffer, sizeof(packed), &packed); } -/**********************************************************************/ -int decode_vdo_header(struct buffer *buffer, struct header *header) +/** + * vdo_decode_header() - Decode a header from a buffer. + * @buffer: The buffer from which to decode the header. + * @header: The header structure to decode into. + * + * Return: UDS_SUCCESS or an error. + */ +int vdo_decode_header(struct buffer *buffer, struct header *header) { - enum component_id id; + uint32_t id; uint64_t size; struct version_number version; int result = get_uint32_le_from_buffer(buffer, &id); + if (result != UDS_SUCCESS) { return result; } - result = decode_vdo_version_number(buffer, &version); + result = vdo_decode_version_number(buffer, &version); if (result != UDS_SUCCESS) { return result; } @@ -135,16 +167,23 @@ int decode_vdo_header(struct buffer *buffer, struct header *header) return UDS_SUCCESS; } -/**********************************************************************/ -int decode_vdo_version_number(struct buffer *buffer, +/** + * vdo_decode_version_number() - Decode a version number from a buffer. + * @buffer: The buffer from which to decode the version. + * @version: The version structure to decode into. + * + * Return: UDS_SUCCESS or an error. + */ +int vdo_decode_version_number(struct buffer *buffer, struct version_number *version) { struct packed_version_number packed; int result = get_bytes_from_buffer(buffer, sizeof(packed), &packed); + if (result != UDS_SUCCESS) { return result; } - *version = unpack_vdo_version_number(packed); + *version = vdo_unpack_version_number(packed); return UDS_SUCCESS; } diff --git a/vdo/header.h b/vdo/header.h index d511172b..890eb089 100644 --- a/vdo/header.h +++ b/vdo/header.h @@ -1,22 +1,6 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ /* * Copyright Red Hat - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. 
- * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/sulfur/src/c++/vdo/base/header.h#8 $ */ #ifndef HEADER_H @@ -27,7 +11,7 @@ #include "types.h" -/** +/* * An in-memory representation of a version number for versioned structures on * disk. * @@ -38,40 +22,38 @@ * upgrade step, or is wholly incompatible (i.e. can not be upgraded * to), should increment the major version, and set the minor version * to 0. - **/ + */ struct version_number { uint32_t major_version; uint32_t minor_version; } __packed; -/** +/* * A packed, machine-independent, on-disk representation of a version_number. * Both fields are stored in little-endian byte order. - **/ + */ struct packed_version_number { __le32 major_version; __le32 minor_version; } __packed; -/** +/* * The registry of component ids for use in headers - **/ -enum component_id { - VDO_SUPER_BLOCK = 0, - VDO_FIXED_LAYOUT = 1, - VDO_RECOVERY_JOURNAL = 2, - VDO_SLAB_DEPOT = 3, - VDO_BLOCK_MAP = 4, - VDO_GEOMETRY_BLOCK = 5, -}; + */ +#define VDO_SUPER_BLOCK 0 +#define VDO_FIXED_LAYOUT 1 +#define VDO_RECOVERY_JOURNAL 2 +#define VDO_SLAB_DEPOT 3 +#define VDO_BLOCK_MAP 4 +#define VDO_GEOMETRY_BLOCK 5 -/** +/* * The header for versioned data stored on disk. - **/ + */ struct header { - enum component_id id; // The component this is a header for - struct version_number version; // The version of the data format - size_t size; // The size of the data following this header + uint32_t id; /* The component this is a header for */ + struct version_number version; /* The version of the data format */ + size_t size; /* The size of the data following this header */ } __packed; enum { @@ -79,14 +61,13 @@ enum { }; /** - * Check whether two version numbers are the same. - * - * @param version_a The first version - * @param version_b The second version + * vdo_are_same_version() - Check whether two version numbers are the same. + * @version_a: The first version. + * @version_b: The second version. * - * @return true if the two versions are the same - **/ -static inline bool are_same_vdo_version(struct version_number version_a, + * Return: true if the two versions are the same. + */ +static inline bool vdo_are_same_version(struct version_number version_a, struct version_number version_b) { return ((version_a.major_version == version_b.major_version) @@ -94,114 +75,55 @@ static inline bool are_same_vdo_version(struct version_number version_a, } /** - * Check whether an actual version is upgradable to an expected version. - * An actual version is upgradable if its major number is expected but - * its minor number differs, and the expected version's minor number - * is greater than the actual version's minor number. + * vdo_is_upgradable_version() - Check whether an actual version is upgradable + * to an expected version. + * @expected_version: The expected version. + * @actual_version: The version being validated. * - * @param expected_version The expected version - * @param actual_version The version being validated + * An actual version is upgradable if its major number is expected but its + * minor number differs, and the expected version's minor number is greater + * than the actual version's minor number. * - * @return true if the actual version is upgradable - **/ + * Return: true if the actual version is upgradable. 
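A small illustration, not taken from the patch, of how these comparison helpers behave for some made-up version pairs::

    #include "header.h"

    static bool version_compatibility_example(void)
    {
            struct version_number on_disk = { .major_version = 2, .minor_version = 1 };
            struct version_number current = { .major_version = 2, .minor_version = 3 };
            struct version_number rewrite = { .major_version = 3, .minor_version = 0 };

            /* Same major version, expected minor 3 > actual minor 1: upgradable. */
            bool upgradable = vdo_is_upgradable_version(current, on_disk);

            /* A major version bump is neither the same version nor upgradable. */
            bool same = vdo_are_same_version(rewrite, current);
            bool skips_major = vdo_is_upgradable_version(rewrite, on_disk);

            return upgradable && !same && !skips_major;    /* true */
    }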
+ */ static inline bool -is_upgradable_vdo_version(struct version_number expected_version, +vdo_is_upgradable_version(struct version_number expected_version, struct version_number actual_version) { return ((expected_version.major_version == actual_version.major_version) && (expected_version.minor_version > actual_version.minor_version)); } -/** - * Check whether a version matches an expected version. Logs an error - * describing a mismatch. - * - * @param expected_version The expected version - * @param actual_version The version being validated - * @param component_name The name of the component or the calling function - * (for error logging) - * - * @return VDO_SUCCESS if the versions are the same - * VDO_UNSUPPORTED_VERSION if the versions don't match - **/ -int __must_check validate_vdo_version(struct version_number expected_version, +int __must_check vdo_validate_version(struct version_number expected_version, struct version_number actual_version, const char *component_name); -/** - * Check whether a header matches expectations. Logs an error describing the - * first mismatch found. - * - * @param expected_header The expected header - * @param actual_header The header being validated - * @param exact_size If true, the size fields of the two headers must be - * the same, otherwise it is required that - * actual_header.size >= expected_header.size - * @param component_name The name of the component or the calling function - * (for error logging) - * - * @return VDO_SUCCESS if the header meets expectations - * VDO_INCORRECT_COMPONENT if the component ids don't match - * VDO_UNSUPPORTED_VERSION if the versions or sizes don't match - **/ -int __must_check validate_vdo_header(const struct header *expected_header, +int __must_check vdo_validate_header(const struct header *expected_header, const struct header *actual_header, bool exact_size, const char *component_name); -/** - * Encode a header into a buffer. - * - * @param header The header to encode - * @param buffer The buffer in which to encode the header - * - * @return UDS_SUCCESS or an error - **/ int __must_check -encode_vdo_header(const struct header *header, struct buffer *buffer); +vdo_encode_header(const struct header *header, struct buffer *buffer); -/** - * Encode a version number into a buffer. - * - * @param version The version to encode - * @param buffer The buffer in which to encode the version - * - * @return UDS_SUCCESS or an error - **/ -int __must_check encode_vdo_version_number(struct version_number version, +int __must_check vdo_encode_version_number(struct version_number version, struct buffer *buffer); -/** - * Decode a header from a buffer. - * - * @param [in] buffer The buffer from which to decode the header - * @param [out] header The header to decode - * - * @return UDS_SUCCESS or an error - **/ -int __must_check decode_vdo_header(struct buffer *buffer, +int __must_check vdo_decode_header(struct buffer *buffer, struct header *header); -/** - * Decode a version number from a buffer. - * - * @param buffer The buffer from which to decode the version - * @param version The version structure to decode into - * - * @return UDS_SUCCESS or an error - **/ -int __must_check decode_vdo_version_number(struct buffer *buffer, +int __must_check vdo_decode_version_number(struct buffer *buffer, struct version_number *version); -/** +/* * Convert a version_number to its packed on-disk representation. 
* * @param version The version number to convert * * @return the platform-independent representation of the version - **/ + */ static inline struct packed_version_number -pack_vdo_version_number(struct version_number version) +vdo_pack_version_number(struct version_number version) { return (struct packed_version_number) { .major_version = __cpu_to_le32(version.major_version), @@ -210,14 +132,14 @@ pack_vdo_version_number(struct version_number version) } /** - * Convert a packed_version_number to its native in-memory representation. - * - * @param version The version number to convert + * vdo_unpack_version_number() - Convert a packed_version_number to its native + * in-memory representation. + * @version: The version number to convert. * - * @return the platform-independent representation of the version - **/ + * Return: The platform-independent representation of the version. + */ static inline struct version_number -unpack_vdo_version_number(struct packed_version_number version) +vdo_unpack_version_number(struct packed_version_number version) { return (struct version_number) { .major_version = __le32_to_cpu(version.major_version), @@ -225,4 +147,4 @@ unpack_vdo_version_number(struct packed_version_number version) }; } -#endif // HEADER_H +#endif /* HEADER_H */ diff --git a/vdo/heap.c b/vdo/heap.c index 8caa96de..1fd14a68 100644 --- a/vdo/heap.c +++ b/vdo/heap.c @@ -1,33 +1,30 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * Copyright Red Hat - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/sulfur/src/c++/vdo/base/heap.c#7 $ */ #include "heap.h" + #include "errors.h" #include "logger.h" #include "numeric.h" -#include "statusCodes.h" +#include "status-codes.h" -/**********************************************************************/ +/** + * initialize_heap() - Initialize an binary heap by wrapping it around + * an array of elements. + * @heap: The heap to initialize. + * @comparator: The function to use to compare two heap elements. + * @swapper: The function to use to swap two heap elements. + * @array: The array of elements (not modified by this call). + * @capacity: The maximum number of elements which fit in the array. + * @element_size: The size of every array element, in bytes. + * + * The heap will not own the array it wraps. Use build_heap() subsequently to + * arrange any elements contained in the array into a valid heap. + */ void initialize_heap(struct heap *heap, heap_comparator *comparator, heap_swapper *swapper, void *array, size_t capacity, size_t element_size) @@ -39,23 +36,29 @@ void initialize_heap(struct heap *heap, heap_comparator *comparator, .element_size = element_size, }; if (array != NULL) { - // Calculating child indexes is simplified by pretending the - // element array is 1-based. 
+ /* + * Calculating child indexes is simplified by pretending the + * element array is 1-based. + */ heap->array = ((byte *) array - element_size); } } -/**********************************************************************/ static void sift_heap_down(struct heap *heap, size_t top_node, size_t last_node) { - // Keep sifting until the sub-heap rooted at top_node has no children. + /* Keep sifting until the sub-heap rooted at top_node has no children. */ size_t left_child; + while ((left_child = (2 * top_node)) <= last_node) { - // If there are two children, select the largest child to swap - // with. + /* + * If there are two children, select the largest child to swap + * with. + */ size_t swap_node = left_child; + if (left_child < last_node) { size_t right_child = left_child + heap->element_size; + if (heap->comparator(&heap->array[left_child], &heap->array[right_child]) < 0) { @@ -63,29 +66,46 @@ static void sift_heap_down(struct heap *heap, size_t top_node, size_t last_node) } } - // Stop sifting if top_node is at least as large as its largest - // child, which means the heap invariant was restored by the - // previous swap. + /* + * Stop sifting if top_node is at least as large as its largest + * child, which means the heap invariant was restored by the + * previous swap. + */ if (heap->comparator(&heap->array[top_node], &heap->array[swap_node]) >= 0) { return; } - // Swap the element we've been sifting down with the larger - // child. + /* + * Swap the element we've been sifting down with the larger + * child. + */ heap->swapper(&heap->array[top_node], &heap->array[swap_node]); - // Descend into the sub-heap rooted at that child, going around - // the loop again in place of a tail-recursive call to - // sift_heap_down(). + /* + * Descend into the sub-heap rooted at that child, going around + * the loop again in place of a tail-recursive call to + * sift_heap_down(). + */ top_node = swap_node; } - // We sifted the element all the way to a leaf node of the heap, so the - // heap invariant has now been restored. + /* + * We sifted the element all the way to a leaf node of the heap, so the + * heap invariant has now been restored. + */ } -/**********************************************************************/ +/** + * build_heap() - Build a max-heap in place in an array (heapify it) by + * re-ordering the elements to establish the heap invariant. + * @heap: The heap to build. + * @count: The number of elements in the array to build into a heap. + * + * Before calling this function, first copy the elements to be arranged into a + * heap into the array that was passed to initialize_heap(). This operation + * has O(N) time complexity in the number of elements in the array. + */ void build_heap(struct heap *heap, size_t count) { size_t size, last_parent, last_node, top_node; @@ -122,7 +142,18 @@ void build_heap(struct heap *heap, size_t count) } } -/**********************************************************************/ +/** + * pop_max_heap_element() - Remove the largest element from the top of the + * heap and restore the heap invariant on the + * remaining elements. + * @heap: The heap to modify. + * @element_ptr: A pointer to receive the largest element (may be NULL if the + * caller just wishes to discard it) + * + * This operation has O(log2(N)) time complexity. + * + * Return: false if the heap was empty, so no element was removed. 
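Not part of the patch — a short sketch of how this heap API is typically used: wrap an existing array, heapify it, then pop the maximum. The element type and helper names are invented for illustration::

    #include "heap.h"

    static int compare_u32(const void *item1, const void *item2)
    {
            uint32_t a = *(const uint32_t *) item1;
            uint32_t b = *(const uint32_t *) item2;

            return (a < b) ? -1 : ((a > b) ? 1 : 0);
    }

    static void swap_u32(void *item1, void *item2)
    {
            uint32_t temp = *(uint32_t *) item1;

            *(uint32_t *) item1 = *(uint32_t *) item2;
            *(uint32_t *) item2 = temp;
    }

    static uint32_t largest_of(uint32_t *values, size_t count)
    {
            struct heap heap;
            uint32_t max = 0;

            /* Wrap the caller's array; the heap does not copy or own it. */
            initialize_heap(&heap, compare_u32, swap_u32, values,
                            count, sizeof(uint32_t));
            /* Establish the max-heap invariant over the first 'count' elements. */
            build_heap(&heap, count);
            /* The root of a max-heap is its largest element. */
            pop_max_heap_element(&heap, &max);
            return max;
    }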
+ */ bool pop_max_heap_element(struct heap *heap, void *element_ptr) { size_t root_node, last_node; @@ -134,14 +165,18 @@ bool pop_max_heap_element(struct heap *heap, void *element_ptr) root_node = (heap->element_size * 1); last_node = (heap->element_size * heap->count); - // Return the maximum element (the root of the heap) if the caller - // wanted it. + /* + * Return the maximum element (the root of the heap) if the caller + * wanted it. + */ if (element_ptr != NULL) { memcpy(element_ptr, &heap->array[root_node], heap->element_size); } - // Move the right-most leaf node to the vacated root node, reducing the - // number of elements by one and violating the heap invariant. + /* + * Move the right-most leaf node to the vacated root node, reducing the + * number of elements by one and violating the heap invariant. + */ if (root_node != last_node) { memcpy(&heap->array[root_node], &heap->array[last_node], heap->element_size); @@ -149,13 +184,14 @@ bool pop_max_heap_element(struct heap *heap, void *element_ptr) heap->count -= 1; last_node -= heap->element_size; - // Restore the heap invariant by sifting the root back down into the - // heap. + /* + * Restore the heap invariant by sifting the root back down into the + * heap. + */ sift_heap_down(heap, root_node, last_node); return true; } -/**********************************************************************/ static inline size_t sift_and_sort(struct heap *heap, size_t root_node, size_t last_node) { @@ -164,32 +200,53 @@ static inline size_t sift_and_sort(struct heap *heap, size_t root_node, * top of the heap. That element belongs at the start of the * partially-sorted array, preceding all the larger elements that we've * already removed from the heap. Swap that largest unsorted element - * with the the right-most leaf node in the heap, moving it to its + * with the right-most leaf node in the heap, moving it to its * sorted position in the array. */ heap->swapper(&heap->array[root_node], &heap->array[last_node]); - // The sorted list is now one element larger and valid. The heap is - // one element smaller, and invalid. + /* + * The sorted list is now one element larger and valid. The heap is + * one element smaller, and invalid. + */ last_node -= heap->element_size; - // Restore the heap invariant by sifting the swapped element back down - // into the heap. + /* + * Restore the heap invariant by sifting the swapped element back down + * into the heap. + */ sift_heap_down(heap, root_node, last_node); return last_node; } -/**********************************************************************/ +/** + * sort_heap() - Sort the elements contained in a heap. + * @heap: The heap containing the elements to sort. + * + * This function re-orders the elements contained in the heap to a sorted + * array in-place by repeatedly popping the maximum element off the heap and + * moving it to the spot vacated at the end of the heap array. When the + * function returns, the heap will be empty and the array will contain the + * elements in sorted order, from heap minimum to heap maximum. The sort is + * unstable--relative ordering of equal keys is not preserved. This operation + * has O(N*log2(N)) time complexity. + * + * Return: the number of elements that were sorted + */ size_t sort_heap(struct heap *heap) { size_t root_node, last_node, count; - // All zero-length records are identical and therefore already sorted, - // as are empty or singleton arrays. 
+ /* + * All zero-length records are identical and therefore already sorted, + * as are empty or singleton arrays. + */ if ((heap->count < 2) || (heap->element_size == 0)) { return heap->count; } - // Get the byte array offset of the root node, and the right-most leaf - // node in the 1-based array of records that will form the heap. + /* + * Get the byte array offset of the root node, and the right-most leaf + * node in the 1-based array of records that will form the heap. + */ root_node = (heap->element_size * 1); last_node = (heap->element_size * heap->count); @@ -202,7 +259,13 @@ size_t sort_heap(struct heap *heap) return count; } -/**********************************************************************/ +/** + * sort_next_heap_element() - Gets the next sorted heap element and returns a + * pointer to it, in O(log2(N)) time. + * @heap: The heap to sort one more step. + * + * Return: a pointer to the element sorted, or NULL if already fully sorted. + */ void *sort_next_heap_element(struct heap *heap) { size_t root_node, last_node; @@ -211,8 +274,10 @@ void *sort_next_heap_element(struct heap *heap) return NULL; } - // Get the byte array offset of the root node, and the right-most leaf - // node in the 1-based array of records that will form the heap. + /* + * Get the byte array offset of the root node, and the right-most leaf + * node in the 1-based array of records that will form the heap. + */ root_node = (heap->element_size * 1); last_node = (heap->element_size * heap->count); if (heap->count > 1) { diff --git a/vdo/heap.h b/vdo/heap.h index 4d33b2fd..a3761b1e 100644 --- a/vdo/heap.h +++ b/vdo/heap.h @@ -1,152 +1,79 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ /* * Copyright Red Hat - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/sulfur/src/c++/vdo/base/heap.h#5 $ */ #ifndef HEAP_H #define HEAP_H -#include "common.h" +#include "type-defs.h" /** - * Prototype for functions which compare two array elements. All the time - * complexity claims in this module assume this operation has O(1) time - * complexity. + * typedef heap_comparator - Prototype for functions which compare two array + * elements. + * @item1: The first element to compare. + * @item2: The second element to compare. * - * @param item1 The first element to compare - * @param item2 The second element to compare + * All the time complexity claims in this module assume this operation has + * O(1) time complexity. * - * @return An integer which is less than, equal to, or greater than 0 + * Return: An integer which is less than, equal to, or greater than 0 * depending on whether item1 is less than, equal to, or greater * than item2, respectively - **/ + */ typedef int heap_comparator(const void *item1, const void *item2); /** - * Prototype for functions which swap two array elements. 
- * - * @param item1 The first element to swap - * @param item2 The second element to swap - **/ + * typedef heap_swapper - Prototype for functions which swap two array + * elements. + * @item1: The first element to swap. + * @item2: The second element to swap. + */ typedef void heap_swapper(void *item1, void *item2); -/** +/* * A heap array can be any array of fixed-length elements in which the heap * invariant can be established. In a max-heap, every child of a node must be * at least as large as its children. Once that invariant is established in an * array by calling build_heap(), all the other heap operations may be used on * that array. - **/ + */ struct heap { - /** the 1-based array of heap elements (nodes) */ + /* the 1-based array of heap elements (nodes) */ byte *array; - /** the function to use to compare two elements */ + /* the function to use to compare two elements */ heap_comparator *comparator; - /** the function to use to swap two elements */ + /* the function to use to swap two elements */ heap_swapper *swapper; - /** the maximum number of elements that can be stored */ + /* the maximum number of elements that can be stored */ size_t capacity; - /** the size of every element (in bytes) */ + /* the size of every element (in bytes) */ size_t element_size; - /** the current number of elements in the heap */ + /* the current number of elements in the heap */ size_t count; }; -/** - * Initialize an binary heap by wrapping it around an array of elements. - * - * The heap will not own the array it wraps. Use build_heap() subsequently to - * arrange any elements contained in the array into a valid heap. - * - * @param heap The heap to initialize - * @param comparator The function to use to compare two heap elements - * @param swapper The function to use to swap two heap elements - * @param array The array of elements (not modified by this call) - * @param capacity The maximum number of elements which fit in the array - * @param element_size The size of every array element, in bytes - **/ void initialize_heap(struct heap *heap, heap_comparator *comparator, heap_swapper *swapper, void *array, size_t capacity, size_t element_size); -/** - * Build a max-heap in place in an array (heapify it) by re-ordering the - * elements to establish the heap invariant. Before calling this function, - * first copy the elements to be arranged into a heap into the array that was - * passed to initialize_heap(). This operation has O(N) time complexity in the - * number of elements in the array. - * - * @param heap The heap to build - * @param count The number of elements in the array to build into a heap - **/ void build_heap(struct heap *heap, size_t count); /** - * Check whether the heap is currently empty. + * is_heap_empty() - Check whether the heap is currently empty. + * @heap: The heap to query. * - * @param heap The heap to query - * - * @return true if there are no elements in the heap - **/ + * Return: true if there are no elements in the heap. + */ static inline bool is_heap_empty(const struct heap *heap) { return (heap->count == 0); } -/** - * Remove the largest element from the top of the heap and restore the heap - * invariant on the remaining elements. This operation has O(log2(N)) time - * complexity. 
- * - * @param [in] heap The heap to modify - * @param [out] element_ptr A pointer to receive the largest element (may be - * NULL if the caller just wishes to discard it) - * - * @return false if the heap was empty, so no element was removed - **/ bool pop_max_heap_element(struct heap *heap, void *element_ptr); -/** - * Sort the elements contained in a heap. - * - * This function re-orders the elements contained in the heap to a sorted - * array in-place by repeatedly popping the maximum element off the heap and - * moving it to the spot vacated at the end of the heap array. When the - * function returns, the heap will be empty and the array will contain the - * elements in sorted order, from heap minimum to heap maximum. The sort is - * unstable--relative ordering of equal keys is not preserved. This operation - * has O(N*log2(N)) time complexity. - * - * @param heap The heap containing the elements to sort - * - * @return the number of elements that were sorted - **/ size_t sort_heap(struct heap *heap); -/** - * Gets the next sorted heap element and returns a pointer to it, in O(log2(N)) - * time. - * - * @param heap The heap to sort one more step - * - * @return a pointer to the element sorted, or NULL if already fully sorted. - **/ void *sort_next_heap_element(struct heap *heap); #endif /* HEAP_H */ diff --git a/vdo/histogram.c b/vdo/histogram.c deleted file mode 100644 index 9034b015..00000000 --- a/vdo/histogram.c +++ /dev/null @@ -1,881 +0,0 @@ -/* - * Copyright Red Hat - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/sulfur/src/c++/vdo/kernel/histogram.c#9 $ - */ - -#include - -#include "memoryAlloc.h" -#include "typeDefs.h" -#include "permassert.h" - -#include "histogram.h" -#include "logger.h" -#include "numUtils.h" - -/* - * Set NO_BUCKETS to streamline the histogram code by reducing it to - * tracking just minimum, maximum, mean, etc. Only one bucket counter - * (the final one for "bigger" values) will be used, no range checking - * is needed to find the right bucket, and no histogram will be - * reported. With newer compilers, the histogram output code will be - * optimized out. - */ -enum { - NO_BUCKETS = 1 -}; - -/* - * Support histogramming in the VDO code. - * - * This is not a complete and general histogram package. It follows the XP - * practice of implementing the "customer" requirements, and no more. We can - * support other requirements after we know what they are. - * - * The code was originally borrowed from UDS, and includes both linear and - * logarithmic histograms. VDO only uses the logarithmic histograms. - * - * All samples are uint64_t values. - * - * A unit conversion option is supported internally to allow sample values to - * be supplied in "jiffies" and results to be reported via /sys in - * milliseconds. 
Depending on the system configuration, this could mean a - * factor of four (a bucket for values of 1 jiffy is reported as 4-7 - * milliseconds). In theory it could be a non-integer ratio (including less - * than one), but as the x86-64 platforms we've encountered appear to use 1 or - * 4 milliseconds per jiffy, we don't support non-integer values yet. - * - * All internal processing uses the values as passed to enter_histogram_sample. - * Conversions only affect the values seen or input through the /sys interface, - * including possibly rounding a "limit" value entered. - */ - -struct histogram { - // These fields are ordered so that enter_histogram_sample touches - // only the first cache line. - atomic64_t *counters; // Counter for each bucket - uint64_t limit; // We want to know how many samples are - // larger - atomic64_t sum; // Sum of all the samples - atomic64_t count; // Number of samples - atomic64_t minimum; // Minimum value - atomic64_t maximum; // Maximum value - atomic64_t unacceptable; // Number of samples that exceed the limit - int num_buckets; // The number of buckets - bool log_flag; // True if the y scale should be logarithmic - // These fields are used only when reporting results. - const char *label; // Histogram label - const char *counted_items; // Name for things being counted - const char *metric; // Term for value used to divide into buckets - const char *sample_units; // Unit for measuring metric; NULL for count - unsigned int conversion_factor; // Converts input units to reporting - // units - struct kobject kobj; -}; - -/* - * Fixed table defining the top value for each bucket of a logarithmic - * histogram. We arbitrarily limit the histogram to 12 orders of magnitude. - */ -enum { MAX_LOG_SIZE = 12 }; -static const uint64_t bottom_value[1 + 10 * MAX_LOG_SIZE] = { - // 0 to 10 - The first 10 buckets are linear - 0, - 1, - 2, - 3, - 4, - 5, - 6, - 7, - 8, - 9, - 10, - // 10 to 100 - From this point on, the Nth entry of the table is - // floor(exp10((double) N/10.0)). 
- 12, - 15, - 19, - 25, - 31, - 39, - 50, - 63, - 79, - 100, - // 100 to 1K - 125, - 158, - 199, - 251, - 316, - 398, - 501, - 630, - 794, - 1000, - // 1K to 10K - 1258, - 1584, - 1995, - 2511, - 3162, - 3981, - 5011, - 6309, - 7943, - 10000, - // 10K to 100K - 12589, - 15848, - 19952, - 25118, - 31622, - 39810, - 50118, - 63095, - 79432, - 100000, - // 100K to 1M - 125892, - 158489, - 199526, - 251188, - 316227, - 398107, - 501187, - 630957, - 794328, - 1000000, - // 1M to 10M - 1258925, - 1584893, - 1995262, - 2511886, - 3162277, - 3981071, - 5011872, - 6309573, - 7943282, - 10000000, - // 10M to 100M - 12589254, - 15848931, - 19952623, - 25118864, - 31622776, - 39810717, - 50118723, - 63095734, - 79432823, - 100000000, - // 100M to 1G - 125892541, - 158489319, - 199526231, - 251188643, - 316227766, - 398107170, - 501187233, - 630957344, - 794328234, - 1000000000, - // 1G to 10G - 1258925411L, - 1584893192L, - 1995262314L, - 2511886431L, - 3162277660L, - 3981071705L, - 5011872336L, - 6309573444L, - 7943282347L, - 10000000000L, - // 10G to 100G - 12589254117L, - 15848931924L, - 19952623149L, - 25118864315L, - 31622776601L, - 39810717055L, - 50118723362L, - 63095734448L, - 79432823472L, - 100000000000L, - // 100G to 1T - 125892541179L, - 158489319246L, - 199526231496L, - 251188643150L, - 316227766016L, - 398107170553L, - 501187233627L, - 630957344480L, - 794328234724L, - 1000000000000L, -}; - -/***********************************************************************/ -static unsigned int divide_rounding_to_nearest(uint64_t number, - uint64_t divisor) -{ - number += divisor / 2; - return number / divisor; -} - -/***********************************************************************/ -static int max_bucket(struct histogram *h) -{ - int max = h->num_buckets; - - while ((max >= 0) && (atomic64_read(&h->counters[max]) == 0)) { - max--; - } - // max == -1 means that there were no samples - return max; -} - -/***********************************************************************/ - -struct histogram_attribute { - struct attribute attr; - ssize_t (*show)(struct histogram *h, char *buf); - ssize_t (*store)(struct histogram *h, const char *buf, size_t length); -}; - -/***********************************************************************/ -static void histogram_kobj_release(struct kobject *kobj) -{ - struct histogram *h = container_of(kobj, struct histogram, kobj); - - UDS_FREE(h->counters); - UDS_FREE(h); -} - -/***********************************************************************/ -static ssize_t histogram_show(struct kobject *kobj, - struct attribute *attr, - char *buf) -{ - struct histogram_attribute *ha = - container_of(attr, struct histogram_attribute, attr); - struct histogram *h = container_of(kobj, struct histogram, kobj); - if (ha->show == NULL) { - return -EINVAL; - } - - return ha->show(h, buf); -} - -/***********************************************************************/ -static ssize_t histogram_store(struct kobject *kobj, - struct attribute *attr, - const char *buf, - size_t length) -{ - struct histogram_attribute *ha = - container_of(attr, struct histogram_attribute, attr); - struct histogram *h = container_of(kobj, struct histogram, kobj); - - if (ha->show == NULL) { - return -EINVAL; - } - - return ha->store(h, buf, length); -} - -/***********************************************************************/ -static ssize_t histogram_show_count(struct histogram *h, char *buf) -{ - int64_t count = atomic64_read(&h->count); - - return sprintf(buf, "%lld\n", count); -} - 
-/***********************************************************************/ -static ssize_t histogram_show_histogram(struct histogram *h, char *buffer) -{ - /* - * We're given one page in which to write. The caller logs a complaint - * if we report that we've written too much, so we'll truncate to - * PAGE_SIZE-1. - */ - ssize_t buffer_size = PAGE_SIZE; - bool bars = true; - ssize_t length = 0; - int max = max_bucket(h); - uint64_t total = 0; - int i; - - // If max is -1, we'll fall through to reporting the total of zero. - - enum { BAR_SIZE = 50 }; - char bar[BAR_SIZE + 2]; - - bar[0] = ' '; - memset(bar + 1, '=', BAR_SIZE); - bar[BAR_SIZE + 1] = '\0'; - - for (i = 0; i <= max; i++) { - total += atomic64_read(&h->counters[i]); - } - - length += scnprintf(buffer, - buffer_size, - "%s Histogram - number of %s by %s", - h->label, - h->counted_items, - h->metric); - if (length >= (buffer_size - 1)) { - return buffer_size - 1; - } - if (h->sample_units != NULL) { - length += scnprintf(buffer + length, - buffer_size - length, - " (%s)", - h->sample_units); - if (length >= (buffer_size - 1)) { - return buffer_size - 1; - } - } - length += scnprintf(buffer + length, buffer_size - length, "\n"); - if (length >= (buffer_size - 1)) { - return buffer_size - 1; - } - for (i = 0; i <= max; i++) { - uint64_t value = atomic64_read(&h->counters[i]); - - unsigned int bar_length; - - if (bars && (total != 0)) { - // +1 for the space at the beginning - bar_length = (divide_rounding_to_nearest( - value * BAR_SIZE, total) + - 1); - if (bar_length == 1) { - // Don't bother printing just the initial space. - bar_length = 0; - } - } else { - // 0 means skip the space and the bar - bar_length = 0; - } - - if (h->log_flag) { - if (i == h->num_buckets) { - length += scnprintf(buffer + length, - buffer_size - length, - "%-16s", - "Bigger"); - } else { - unsigned int lower = - h->conversion_factor * bottom_value[i]; - unsigned int upper = - h->conversion_factor * - bottom_value[i + 1] - - 1; - length += scnprintf(buffer + length, - buffer_size - length, - "%6u - %7u", - lower, - upper); - } - } else { - if (i == h->num_buckets) { - length += scnprintf(buffer + length, - buffer_size - length, - "%6s", - "Bigger"); - } else { - length += scnprintf(buffer + length, - buffer_size - length, - "%6d", - i); - } - } - if (length >= (buffer_size - 1)) { - return buffer_size - 1; - } - length += scnprintf(buffer + length, - buffer_size - length, - " : %12llu%.*s\n", - value, - bar_length, - bar); - if (length >= (buffer_size - 1)) { - return buffer_size - 1; - } - } - - length += scnprintf(buffer + length, - buffer_size - length, - "total %llu\n", - total); - return min(buffer_size - 1, length); -} - -/***********************************************************************/ -static ssize_t histogram_show_maximum(struct histogram *h, char *buf) -{ - // Maximum is initialized to 0. - unsigned long value = atomic64_read(&h->maximum); - - return sprintf(buf, "%lu\n", h->conversion_factor * value); -} - -/***********************************************************************/ -static ssize_t histogram_show_minimum(struct histogram *h, char *buf) -{ - // Minimum is initialized to -1. - unsigned long value = - ((atomic64_read(&h->count) > 0) ? 
atomic64_read(&h->minimum) : - 0); - return sprintf(buf, "%lu\n", h->conversion_factor * value); -} - -/***********************************************************************/ -static ssize_t histogram_show_limit(struct histogram *h, char *buf) -{ - // Display the limit in the reporting units - return sprintf(buf, - "%u\n", - (unsigned int) (h->conversion_factor * h->limit)); -} - -/***********************************************************************/ -static ssize_t histogram_store_limit(struct histogram *h, - const char *buf, - size_t length) -{ - unsigned int value; - - if ((length > 12) || (sscanf(buf, "%u", &value) != 1)) { - return -EINVAL; - } - /* - * Convert input from reporting units (e.g., milliseconds) to internal - * recording units (e.g., jiffies). - * - * compute_bucket_count could also be called "divide_rounding_up". - */ - h->limit = compute_bucket_count(value, h->conversion_factor); - atomic64_set(&h->unacceptable, 0); - return length; -} - -/***********************************************************************/ -static ssize_t histogram_show_mean(struct histogram *h, char *buf) -{ - unsigned long sum_times1000_in_reporting_units; - unsigned int mean_times1000; - uint64_t count = atomic64_read(&h->count); - - if (count == 0) { - return sprintf(buf, "0/0\n"); - } - // Compute mean, scaled up by 1000, in reporting units - sum_times1000_in_reporting_units = - h->conversion_factor * atomic64_read(&h->sum) * 1000; - mean_times1000 = divide_rounding_to_nearest( - sum_times1000_in_reporting_units, count); - // Print mean with fractional part - return sprintf(buf, - "%u.%03u\n", - mean_times1000 / 1000, - mean_times1000 % 1000); -} - -/***********************************************************************/ -static ssize_t histogram_show_unacceptable(struct histogram *h, char *buf) -{ - int64_t count = atomic64_read(&h->unacceptable); - - return sprintf(buf, "%lld\n", count); -} - -/***********************************************************************/ -static ssize_t histogram_show_label(struct histogram *h, char *buf) -{ - return sprintf(buf, "%s\n", h->label); -} - -/***********************************************************************/ -static ssize_t histogram_show_unit(struct histogram *h, char *buf) -{ - if (h->sample_units != NULL) { - return sprintf(buf, "%s\n", h->sample_units); - } else { - *buf = 0; - return 0; - } -} - -/***********************************************************************/ - -static struct sysfs_ops histogram_sysfs_ops = { - .show = histogram_show, - .store = histogram_store, -}; - -static struct histogram_attribute count_attribute = { - .attr = { - .name = "count", - .mode = 0444, - }, - .show = histogram_show_count, -}; - -static struct histogram_attribute histogram_attribute = { - .attr = { - .name = "histogram", - .mode = 0444, - }, - .show = histogram_show_histogram, -}; - -static struct histogram_attribute label_attribute = { - .attr = { - .name = "label", - .mode = 0444, - }, - .show = histogram_show_label, -}; - -static struct histogram_attribute maximum_attribute = { - .attr = { - .name = "maximum", - .mode = 0444, - }, - .show = histogram_show_maximum, -}; - -static struct histogram_attribute minimum_attribute = { - .attr = { - .name = "minimum", - .mode = 0444, - }, - .show = histogram_show_minimum, -}; - -static struct histogram_attribute limit_attribute = { - .attr = { - .name = "limit", - .mode = 0644, - }, - .show = histogram_show_limit, - .store = histogram_store_limit, -}; - -static struct histogram_attribute 
mean_attribute = { - .attr = { - .name = "mean", - .mode = 0444, - }, - .show = histogram_show_mean, -}; - -static struct histogram_attribute unacceptable_attribute = { - .attr = { - .name = "unacceptable", - .mode = 0444, - }, - .show = histogram_show_unacceptable, -}; - -static struct histogram_attribute unit_attribute = { - .attr = { - .name = "unit", - .mode = 0444, - }, - .show = histogram_show_unit, -}; - -// "Real" histogram plotting. -static struct attribute *histogram_attributes[] = { - &count_attribute.attr, - &histogram_attribute.attr, - &label_attribute.attr, - &limit_attribute.attr, - &maximum_attribute.attr, - &mean_attribute.attr, - &minimum_attribute.attr, - &unacceptable_attribute.attr, - &unit_attribute.attr, - NULL, -}; - -static struct kobj_type histogram_kobj_type = { - .release = histogram_kobj_release, - .sysfs_ops = &histogram_sysfs_ops, - .default_attrs = histogram_attributes, -}; - -static struct attribute *bucketless_histogram_attributes[] = { - &count_attribute.attr, - &label_attribute.attr, - &maximum_attribute.attr, - &mean_attribute.attr, - &minimum_attribute.attr, - &unit_attribute.attr, - NULL, -}; - -static struct kobj_type bucketless_histogram_kobj_type = { - .release = histogram_kobj_release, - .sysfs_ops = &histogram_sysfs_ops, - .default_attrs = bucketless_histogram_attributes, -}; - -/***********************************************************************/ -static struct histogram *make_histogram(struct kobject *parent, - const char *name, - const char *label, - const char *counted_items, - const char *metric, - const char *sample_units, - int num_buckets, - unsigned long conversion_factor, - bool log_flag) -{ - struct histogram *h; - - if (UDS_ALLOCATE(1, struct histogram, "histogram", &h) != UDS_SUCCESS) { - return NULL; - } - - if (NO_BUCKETS) { - num_buckets = 0; // plus 1 for "bigger" bucket - } - - if (num_buckets <= 10) { - /* - * The first buckets in a "logarithmic" histogram are still - * linear, but the bucket-search mechanism is a wee bit slower - * than for linear, so change the type. - */ - log_flag = false; - } - - h->label = label; - h->counted_items = counted_items; - h->metric = metric; - h->sample_units = sample_units; - h->log_flag = log_flag; - h->num_buckets = num_buckets; - h->conversion_factor = conversion_factor; - atomic64_set(&h->minimum, -1UL); - - if (UDS_ALLOCATE(h->num_buckets + 1, - atomic64_t, - "histogram counters", - &h->counters) != UDS_SUCCESS) { - histogram_kobj_release(&h->kobj); - return NULL; - } - - kobject_init(&h->kobj, - ((num_buckets > 0) ? &histogram_kobj_type : - &bucketless_histogram_kobj_type)); - if (kobject_add(&h->kobj, parent, name) != 0) { - histogram_kobj_release(&h->kobj); - return NULL; - } - return h; -} - -/***********************************************************************/ -struct histogram *make_linear_histogram(struct kobject *parent, - const char *name, - const char *init_label, - const char *counted_items, - const char *metric, - const char *sample_units, - int size) -{ - return make_histogram(parent, - name, - init_label, - counted_items, - metric, - sample_units, - size, - 1, - false); -} - -/** - * Intermediate routine for creating logarithmic histograms. - * - * Limits the histogram size, and computes the bucket count from the - * orders-of-magnitude count. - * - * @param parent The parent kobject. - * @param name The short name of the histogram. This label is - * used for the sysfs node. - * @param init_label The label for the sampled data. 
This label is - * used when we plot the data. - * @param counted_items A name (plural) for the things being counted. - * @param metric The measure being used to divide samples into - * buckets. - * @param sample_units The units (plural) for the metric, or NULL if it's - * a simple counter. - * @param log_size The number of buckets. There are buckets for a - * range of sizes up to 10^log_size, and an extra - * bucket for larger samples. - * @param conversion_factor Unit conversion factor for reporting. - * - * @return the histogram - **/ -static struct histogram * -make_logarithmic_histogram_with_conversion_factor(struct kobject *parent, - const char *name, - const char *init_label, - const char *counted_items, - const char *metric, - const char *sample_units, - int log_size, - uint64_t conversion_factor) -{ - if (log_size > MAX_LOG_SIZE) { - log_size = MAX_LOG_SIZE; - } - return make_histogram(parent, - name, - init_label, - counted_items, - metric, - sample_units, - 10 * log_size, - conversion_factor, - true); -} - -/***********************************************************************/ -struct histogram *make_logarithmic_histogram(struct kobject *parent, - const char *name, - const char *init_label, - const char *counted_items, - const char *metric, - const char *sample_units, - int log_size) -{ - return make_logarithmic_histogram_with_conversion_factor(parent, - name, - init_label, - counted_items, - metric, - sample_units, - log_size, - 1); -} - -/***********************************************************************/ -struct histogram *make_logarithmic_jiffies_histogram(struct kobject *parent, - const char *name, - const char *init_label, - const char *counted_items, - const char *metric, - int log_size) -{ - /* - * If these fail, we have a jiffy duration that is not an integral - * number of milliseconds, and the unit conversion code needs updating. - */ - STATIC_ASSERT(HZ <= MSEC_PER_SEC); - STATIC_ASSERT((MSEC_PER_SEC % HZ) == 0); - return make_logarithmic_histogram_with_conversion_factor( - parent, - name, - init_label, - counted_items, - metric, - "milliseconds", - log_size, - jiffies_to_msecs(1)); -} - -/***********************************************************************/ -void enter_histogram_sample(struct histogram *h, uint64_t sample) -{ - uint64_t old_minimum, old_maximum; - int bucket; - - if (h == NULL) { - return; - } - - if (h->log_flag) { - int lo = 0; - int hi = h->num_buckets; - - while (lo < hi) { - int middle = (lo + hi) / 2; - - if (sample < bottom_value[middle + 1]) { - hi = middle; - } else { - lo = middle + 1; - } - } - bucket = lo; - } else { - bucket = sample < h->num_buckets ? sample : h->num_buckets; - } - atomic64_inc(&h->counters[bucket]); - atomic64_inc(&h->count); - atomic64_add(sample, &h->sum); - if ((h->limit > 0) && (sample > h->limit)) { - atomic64_inc(&h->unacceptable); - } - - /* - * Theoretically this could loop a lot; in practice it should rarely - * do more than a single read, with no memory barrier, from a cache - * line we've already referenced above. 
- */ - old_maximum = atomic64_read(&h->maximum); - - while (old_maximum < sample) { - uint64_t read_value = - atomic64_cmpxchg(&h->maximum, old_maximum, sample); - if (read_value == old_maximum) { - break; - } - old_maximum = read_value; - } - - old_minimum = atomic64_read(&h->minimum); - - while (old_minimum > sample) { - uint64_t read_value = - atomic64_cmpxchg(&h->minimum, old_minimum, sample); - if (read_value == old_minimum) { - break; - } - old_minimum = read_value; - } -} - -/***********************************************************************/ -void free_histogram(struct histogram *histogram) -{ - if (histogram != NULL) { - kobject_put(&histogram->kobj); - } -} diff --git a/vdo/histogram.h b/vdo/histogram.h deleted file mode 100644 index 49d146e4..00000000 --- a/vdo/histogram.h +++ /dev/null @@ -1,132 +0,0 @@ -/* - * Copyright Red Hat - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/sulfur/src/c++/vdo/kernel/histogram.h#4 $ - */ - -#ifndef HISTOGRAM_H -#define HISTOGRAM_H - -#include - -/** - * Allocate and initialize a histogram that uses linearly sized buckets. - * - * The histogram label reported via /sys is constructed from several - * of the values passed here; it will be something like "Init Label - * Histogram - number of counted_items grouped by metric - * (sample_units)", e.g., "Flush Forwarding Histogram - number of - * flushes grouped by latency (milliseconds)". Thus counted_items and - * sample_units should be plural. - * - * The sample_units string will also be reported separately via another - * /sys entry to aid in programmatic processing of the results, so the - * strings used should be consistent (e.g., always "milliseconds" and - * not "ms" for milliseconds). - * - * @param parent The parent kobject. - * @param name The short name of the histogram. This label is used - * for the sysfs node. - * @param init_label The label for the sampled data. This label is used - * when we plot the data. - * @param counted_items A name (plural) for the things being counted. - * @param metric The measure being used to divide samples into buckets. - * @param sample_units The unit (plural) for the metric, or NULL if it's a - * simple counter. - * @param size The number of buckets. There are buckets for every - * value from 0 up to size (but not including) size. - * There is an extra bucket for larger samples. - * - * @return the histogram - **/ -struct histogram *make_linear_histogram(struct kobject *parent, - const char *name, - const char *init_label, - const char *counted_items, - const char *metric, - const char *sample_units, - int size); - -/** - * Allocate and initialize a histogram that uses logarithmically sized - * buckets. - * - * @param parent The parent kobject. - * @param name The short name of the histogram. This label is used - * for the sysfs node. 
- * @param init_label The label for the sampled data. This label is used - * when we plot the data. - * @param counted_items A name (plural) for the things being counted. - * @param metric The measure being used to divide samples into buckets. - * @param sample_units The unit (plural) for the metric, or NULL if it's a - * simple counter. - * @param log_size The number of buckets. There are buckets for a range - * of sizes up to 10^log_size, and an extra bucket for - * larger samples. - * - * @return the histogram - **/ -struct histogram *make_logarithmic_histogram(struct kobject *parent, - const char *name, - const char *init_label, - const char *counted_items, - const char *metric, - const char *sample_units, - int log_size); - -/** - * Allocate and initialize a histogram that uses logarithmically sized - * buckets. Values are entered that count in jiffies, and they are - * reported in milliseconds. - * - * @param parent The parent kobject. - * @param name The short name of the histogram. This label is used - * for the sysfs node. - * @param init_label The label for the sampled data. This label is used - * when we plot the data. - * @param counted_items A name (plural) for the things being counted. - * @param metric The measure being used to divide samples into buckets. - * @param log_size The number of buckets. There are buckets for a range - * of sizes up to 10^log_size, and an extra bucket for - * larger samples. - * - * @return the histogram - **/ -struct histogram *make_logarithmic_jiffies_histogram(struct kobject *parent, - const char *name, - const char *init_label, - const char *counted_items, - const char *metric, - int log_size); - -/** - * Enter a sample into a histogram. - * - * @param h The histogram (may be NULL) - * @param sample The sample - **/ -void enter_histogram_sample(struct histogram *h, uint64_t sample); - -/** - * Free a histogram. - * - * @param histogram The histogram to free - **/ -void free_histogram(struct histogram *histogram); - -#endif /* HISTOGRAM_H */ diff --git a/vdo/index-layout.c b/vdo/index-layout.c new file mode 100644 index 00000000..a5a1df42 --- /dev/null +++ b/vdo/index-layout.c @@ -0,0 +1,2421 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright Red Hat + */ + +#include "index-layout.h" + +#include + +#include "buffer.h" +#include "compiler.h" +#include "config.h" +#include "logger.h" +#include "memory-alloc.h" +#include "numeric.h" +#include "open-chapter.h" +#include "random.h" +#include "time-utils.h" +#include "volume-index-ops.h" + +/* + * The UDS layout on storage media is divided into a number of fixed-size + * regions, the sizes of which are computed when the index is created. Every + * header and region begins on 4K block boundary. Save regions are further + * sub-divided into regions of their own. + * + * Each region has a kind and an instance number. Some kinds only have one + * instance and therefore use RL_SOLE_INSTANCE (-1) as the instance number. + * The RL_KIND_INDEX used to use instances to represent sub-indices; now, + * however there is only ever one sub-index and therefore one instance. A save + * region holds a clean shutdown. The instances determine which available save + * slot is used. The RL_KIND_VOLUME_INDEX uses instances to record which zone + * is being saved. + * + * Every region header has a type and version. 
+ * + * +-+-+---------+--------+--------+-+ + * | | | I N D E X 0 101, 0 | | + * |H|C+---------+--------+--------+S| + * |D|f| Volume | Save | Save |e| + * |R|g| Region | Region | Region |a| + * | | | 201, -1 | 202, 0 | 202, 1 |l| + * +-+-+--------+---------+--------+-+ + * + * The header contains the encoded region layout table as well as some index + * configuration data. The sub-index region and its subdivisions are maintained + * in the same table. + * + * There are two save regions to preserve the old state in case saving the new + * state is incomplete. They are used in alternation. Each save region is + * further divided into sub-regions. + * + * +-+-----+------+------+-----+-----+ + * |H| IPM | MI | MI | | OC | + * |D| | zone | zone | ... | | + * |R| 301 | 302 | 302 | | 303 | + * | | -1 | 0 | 1 | | -1 | + * +-+-----+------+------+-----+-----+ + * + * The header contains the encoded region layout table as well as index state + * data for that save. Each save also has a unique nonce. + */ + +enum { + MAGIC_SIZE = 32, + NONCE_INFO_SIZE = 32, + MAX_SAVES = 2, +}; + +enum region_kind { + RL_KIND_EMPTY = 0, /* uninitialized or scrapped */ + RL_KIND_HEADER = 1, + RL_KIND_CONFIG = 100, + RL_KIND_INDEX = 101, + RL_KIND_SEAL = 102, + RL_KIND_VOLUME = 201, + RL_KIND_SAVE = 202, + RL_KIND_INDEX_PAGE_MAP = 301, + RL_KIND_VOLUME_INDEX = 302, + RL_KIND_OPEN_CHAPTER = 303, +}; + +/* Some region types are historical and are no longer used. */ +enum region_type { + RH_TYPE_FREE = 0, /* unused */ + RH_TYPE_SUPER = 1, + RH_TYPE_SAVE = 2, + RH_TYPE_CHECKPOINT = 3, /* unused */ + RH_TYPE_UNSAVED = 4, +}; + +enum { + RL_SOLE_INSTANCE = 65535, +}; + +/* + * Super block version 2 is the first released version. + * + * Super block version 3 is the normal version used from RHEL 8.2 onwards. + * + * Super block versions 4 through 6 were incremental development versions and + * are not supported. + * + * Super block version 7 is used for volumes which have been reduced in size by + * one chapter in order to make room to prepend LVM metadata to a volume + * originally created without lvm. This allows the index to retain most its + * deduplication records. 
+ */ +enum { + SUPER_VERSION_MINIMUM = 3, + SUPER_VERSION_CURRENT = 3, + SUPER_VERSION_MAXIMUM = 7, +}; + +static const byte LAYOUT_MAGIC[MAGIC_SIZE] = "*ALBIREO*SINGLE*FILE*LAYOUT*001*"; +static const uint64_t REGION_MAGIC = 0x416c6252676e3031; /* 'AlbRgn01' */ + +struct region_header { + uint64_t magic; + uint64_t region_blocks; + uint16_t type; + /* Currently always version 1 */ + uint16_t version; + uint16_t region_count; + uint16_t payload; +}; + +struct layout_region { + uint64_t start_block; + uint64_t block_count; + uint32_t __unused; + uint16_t kind; + uint16_t instance; +}; + +struct region_table { + struct region_header header; + struct layout_region regions[]; +}; + +struct index_save_data { + uint64_t timestamp; + uint64_t nonce; + /* Currently always version 1 */ + uint32_t version; + uint32_t unused__; +}; + +struct index_state_version { + int32_t signature; + int32_t version_id; +}; + +static const struct index_state_version INDEX_STATE_VERSION_301 = { + .signature = -1, + .version_id = 301, +}; + +struct index_state_data301 { + struct index_state_version version; + uint64_t newest_chapter; + uint64_t oldest_chapter; + uint64_t last_save; + uint32_t unused; + uint32_t padding; +}; + +struct index_save_layout { + unsigned int zone_count; + struct layout_region index_save; + struct layout_region header; + struct layout_region index_page_map; + struct layout_region free_space; + struct layout_region volume_index_zones[MAX_ZONES]; + struct layout_region open_chapter; + struct index_save_data save_data; + struct index_state_data301 state_data; +}; + +struct sub_index_layout { + uint64_t nonce; + struct layout_region sub_index; + struct layout_region volume; + struct index_save_layout *saves; +}; + +struct super_block_data { + byte magic_label[MAGIC_SIZE]; + byte nonce_info[NONCE_INFO_SIZE]; + uint64_t nonce; + uint32_t version; + uint32_t block_size; + uint16_t index_count; + uint16_t max_saves; + /* Padding reflects a blank field on permanent storage */ + byte padding[4]; + uint64_t open_chapter_blocks; + uint64_t page_map_blocks; + uint64_t volume_offset; + uint64_t start_offset; +}; + +struct index_layout { + struct io_factory *factory; + size_t factory_size; + off_t offset; + struct super_block_data super; + struct layout_region header; + struct layout_region config; + struct sub_index_layout index; + struct layout_region seal; + uint64_t total_blocks; +}; + +struct save_layout_sizes { + unsigned int save_count; + size_t block_size; + uint64_t volume_blocks; + uint64_t volume_index_blocks; + uint64_t page_map_blocks; + uint64_t open_chapter_blocks; + uint64_t save_blocks; + uint64_t sub_index_blocks; + uint64_t total_blocks; + size_t total_size; +}; + +static INLINE bool is_converted_super_block(struct super_block_data *super) +{ + return (super->version == 7); +} + +static int __must_check compute_sizes(const struct configuration *config, + struct save_layout_sizes *sls) +{ + int result; + struct geometry *geometry = config->geometry; + + if ((geometry->bytes_per_page % UDS_BLOCK_SIZE) != 0) { + return uds_log_error_strerror(UDS_INCORRECT_ALIGNMENT, + "page size not a multiple of block size"); + } + + memset(sls, 0, sizeof(*sls)); + + sls->save_count = MAX_SAVES; + sls->block_size = UDS_BLOCK_SIZE; + sls->volume_blocks = geometry->bytes_per_volume / sls->block_size; + + result = compute_volume_index_save_blocks(config, + sls->block_size, + &sls->volume_index_blocks); + if (result != UDS_SUCCESS) { + return uds_log_error_strerror(result, + "cannot compute index save size"); 
+ } + + sls->page_map_blocks = + DIV_ROUND_UP(compute_index_page_map_save_size(geometry), + sls->block_size); + sls->open_chapter_blocks = + DIV_ROUND_UP(compute_saved_open_chapter_size(geometry), + sls->block_size); + sls->save_blocks = + 1 + (sls->volume_index_blocks + sls->page_map_blocks + + sls->open_chapter_blocks); + sls->sub_index_blocks = + sls->volume_blocks + (sls->save_count * sls->save_blocks); + sls->total_blocks = 3 + sls->sub_index_blocks; + sls->total_size = sls->total_blocks * sls->block_size; + + return UDS_SUCCESS; +} + +int uds_compute_index_size(const struct uds_parameters *parameters, + uint64_t *index_size) +{ + int result; + struct configuration *index_config; + struct save_layout_sizes sizes; + + if (index_size == NULL) { + uds_log_error("Missing output size pointer"); + return -EINVAL; + } + + result = make_configuration(parameters, &index_config); + if (result != UDS_SUCCESS) { + uds_log_error_strerror(result, "cannot compute index size"); + return uds_map_to_system_error(result); + } + + result = compute_sizes(index_config, &sizes); + free_configuration(index_config); + if (result != UDS_SUCCESS) { + return uds_map_to_system_error(result); + } + + *index_size = sizes.total_size; + return UDS_SUCCESS; +} + +/* Create unique data using the current time and a pseudorandom number. */ +static void create_unique_nonce_data(byte *buffer) +{ + ktime_t now = current_time_ns(CLOCK_REALTIME); + uint32_t rand = random_in_range(1, (1 << 30) - 1); + size_t offset = 0; + + memcpy(buffer + offset, &now, sizeof(now)); + offset += sizeof(now); + memcpy(buffer + offset, &rand, sizeof(rand)); + offset += sizeof(rand); + while (offset < NONCE_INFO_SIZE) { + size_t len = min(NONCE_INFO_SIZE - offset, offset); + + memcpy(buffer + offset, buffer, len); + offset += len; + } +} + +static uint64_t hash_stuff(uint64_t start, const void *data, size_t len) +{ + uint32_t seed = start ^ (start >> 27); + byte hash_buffer[16]; + + murmurhash3_128(data, len, seed, hash_buffer); + return get_unaligned_le64(hash_buffer + 4); +} + +/* Generate a primary nonce from the provided data. */ +static uint64_t generate_primary_nonce(const void *data, size_t len) +{ + return hash_stuff(0xa1b1e0fc, data, len); +} + +/* + * Deterministically generate a secondary nonce from an existing nonce and some + * arbitrary data by hashing the original nonce and the data to produce a new + * nonce. 
+ */ +static uint64_t +generate_secondary_nonce(uint64_t nonce, const void *data, size_t len) +{ + return hash_stuff(nonce + 1, data, len); +} + +static int __must_check +open_layout_reader(struct index_layout *layout, + struct layout_region *lr, + off_t offset, + struct buffered_reader **reader_ptr) +{ + off_t start = (lr->start_block + offset) * layout->super.block_size; + size_t size = lr->block_count * layout->super.block_size; + + return open_uds_buffered_reader(layout->factory, + start, + size, + reader_ptr); +} + +static int open_region_reader(struct index_layout *layout, + struct layout_region *region, + struct buffered_reader **reader_ptr) +{ + return open_layout_reader(layout, + region, + -layout->super.start_offset, + reader_ptr); +} + +static int __must_check +open_layout_writer(struct index_layout *layout, + struct layout_region *lr, + off_t offset, + struct buffered_writer **writer_ptr) +{ + off_t start = (lr->start_block + offset) * layout->super.block_size; + size_t size = lr->block_count * layout->super.block_size; + + return open_uds_buffered_writer(layout->factory, + start, + size, + writer_ptr); +} + +static int open_region_writer(struct index_layout *layout, + struct layout_region *region, + struct buffered_writer **writer_ptr) +{ + return open_layout_writer(layout, + region, + -layout->super.start_offset, + writer_ptr); +} + +static int __must_check +decode_region_header(struct buffer *buffer, struct region_header *header) +{ + int result; + + result = get_uint64_le_from_buffer(buffer, &header->magic); + if (result != UDS_SUCCESS) { + return result; + } + + result = get_uint64_le_from_buffer(buffer, &header->region_blocks); + if (result != UDS_SUCCESS) { + return result; + } + + result = get_uint16_le_from_buffer(buffer, &header->type); + if (result != UDS_SUCCESS) { + return result; + } + + result = get_uint16_le_from_buffer(buffer, &header->version); + if (result != UDS_SUCCESS) { + return result; + } + + result = get_uint16_le_from_buffer(buffer, &header->region_count); + if (result != UDS_SUCCESS) { + return result; + } + + return get_uint16_le_from_buffer(buffer, &header->payload); +} + +static int __must_check +decode_layout_region(struct buffer *buffer, struct layout_region *region) +{ + int result; + + result = get_uint64_le_from_buffer(buffer, ®ion->start_block); + if (result != UDS_SUCCESS) { + return result; + } + + result = get_uint64_le_from_buffer(buffer, ®ion->block_count); + if (result != UDS_SUCCESS) { + return result; + } + + result = skip_forward(buffer, sizeof(uint32_t)); + if (result != UDS_SUCCESS) { + return result; + } + + result = get_uint16_le_from_buffer(buffer, ®ion->kind); + if (result != UDS_SUCCESS) { + return result; + } + + return get_uint16_le_from_buffer(buffer, ®ion->instance); +} + +static int __must_check load_region_table(struct buffered_reader *reader, + struct region_table **table_ptr) +{ + int result; + unsigned int i; + struct region_header header; + struct region_table *table; + struct buffer *buffer; + + result = make_buffer(sizeof(struct region_header), &buffer); + if (result != UDS_SUCCESS) { + return result; + } + + result = read_from_buffered_reader(reader, + get_buffer_contents(buffer), + buffer_length(buffer)); + if (result != UDS_SUCCESS) { + free_buffer(UDS_FORGET(buffer)); + return uds_log_error_strerror(result, + "cannot read region table header"); + } + + result = reset_buffer_end(buffer, buffer_length(buffer)); + if (result != UDS_SUCCESS) { + free_buffer(UDS_FORGET(buffer)); + return result; + } + + result = 
decode_region_header(buffer, &header); + free_buffer(UDS_FORGET(buffer)); + if (result != UDS_SUCCESS) { + return result; + } + + if (header.magic != REGION_MAGIC) { + return UDS_NO_INDEX; + } + + if (header.version != 1) { + return uds_log_error_strerror(UDS_UNSUPPORTED_VERSION, + "unknown region table version %hu", + header.version); + } + + result = UDS_ALLOCATE_EXTENDED(struct region_table, + header.region_count, + struct layout_region, + "single file layout region table", + &table); + if (result != UDS_SUCCESS) { + return result; + } + + table->header = header; + result = make_buffer(header.region_count * sizeof(struct layout_region), + &buffer); + if (result != UDS_SUCCESS) { + UDS_FREE(table); + return result; + } + + result = read_from_buffered_reader(reader, + get_buffer_contents(buffer), + buffer_length(buffer)); + if (result != UDS_SUCCESS) { + UDS_FREE(table); + free_buffer(UDS_FORGET(buffer)); + return uds_log_error_strerror(UDS_CORRUPT_DATA, + "cannot read region table layouts"); + } + + result = reset_buffer_end(buffer, buffer_length(buffer)); + if (result != UDS_SUCCESS) { + UDS_FREE(table); + free_buffer(UDS_FORGET(buffer)); + return result; + } + + for (i = 0; i < header.region_count; i++) { + result = decode_layout_region(buffer, &table->regions[i]); + if (result != UDS_SUCCESS) { + UDS_FREE(table); + free_buffer(UDS_FORGET(buffer)); + return result; + } + } + + free_buffer(UDS_FORGET(buffer)); + *table_ptr = table; + return UDS_SUCCESS; +} + +static int __must_check +decode_super_block_data(struct buffer *buffer, struct super_block_data *super) +{ + int result; + + result = get_bytes_from_buffer(buffer, MAGIC_SIZE, super->magic_label); + if (result != UDS_SUCCESS) { + return result; + } + + result = get_bytes_from_buffer(buffer, + NONCE_INFO_SIZE, + super->nonce_info); + if (result != UDS_SUCCESS) { + return result; + } + + result = get_uint64_le_from_buffer(buffer, &super->nonce); + if (result != UDS_SUCCESS) { + return result; + } + + result = get_uint32_le_from_buffer(buffer, &super->version); + if (result != UDS_SUCCESS) { + return result; + } + + result = get_uint32_le_from_buffer(buffer, &super->block_size); + if (result != UDS_SUCCESS) { + return result; + } + + result = get_uint16_le_from_buffer(buffer, &super->index_count); + if (result != UDS_SUCCESS) { + return result; + } + + result = get_uint16_le_from_buffer(buffer, &super->max_saves); + if (result != UDS_SUCCESS) { + return result; + } + + result = skip_forward(buffer, 4); /* aligment */ + if (result != UDS_SUCCESS) { + return result; + } + + result = get_uint64_le_from_buffer(buffer, &super->open_chapter_blocks); + if (result != UDS_SUCCESS) { + return result; + } + + result = get_uint64_le_from_buffer(buffer, &super->page_map_blocks); + if (result != UDS_SUCCESS) { + return result; + } + + if (is_converted_super_block(super)) { + result = get_uint64_le_from_buffer(buffer, + &super->volume_offset); + if (result != UDS_SUCCESS) { + return result; + } + + result = get_uint64_le_from_buffer(buffer, + &super->start_offset); + if (result != UDS_SUCCESS) { + return result; + } + } else { + super->volume_offset = 0; + super->start_offset = 0; + } + + return UDS_SUCCESS; +} + +static int __must_check read_super_block_data(struct buffered_reader *reader, + struct index_layout *layout, + size_t saved_size) +{ + int result; + struct super_block_data *super = &layout->super; + struct buffer *buffer; + + if (sizeof(super->magic_label) != MAGIC_SIZE) { + return uds_log_error_strerror(UDS_CORRUPT_DATA, + "super block 
magic label size incorrect"); + } + + result = make_buffer(saved_size, &buffer); + if (result != UDS_SUCCESS) { + return result; + } + + result = read_from_buffered_reader(reader, + get_buffer_contents(buffer), + buffer_length(buffer)); + if (result != UDS_SUCCESS) { + free_buffer(UDS_FORGET(buffer)); + return uds_log_error_strerror(result, + "cannot read region table header"); + } + + result = reset_buffer_end(buffer, buffer_length(buffer)); + if (result != UDS_SUCCESS) { + free_buffer(UDS_FORGET(buffer)); + return result; + } + + result = decode_super_block_data(buffer, super); + free_buffer(UDS_FORGET(buffer)); + if (result != UDS_SUCCESS) { + return uds_log_error_strerror(result, + "cannot read super block data"); + } + + if (memcmp(super->magic_label, LAYOUT_MAGIC, MAGIC_SIZE) != 0) { + return uds_log_error_strerror(UDS_CORRUPT_DATA, + "unknown superblock magic label"); + } + + if ((super->version < SUPER_VERSION_MINIMUM) || + (super->version == 4) || + (super->version == 5) || + (super->version == 6) || + (super->version > SUPER_VERSION_MAXIMUM)) { + return uds_log_error_strerror(UDS_UNSUPPORTED_VERSION, + "unknown superblock version number %u", + super->version); + } + + if (super->volume_offset < super->start_offset) { + return uds_log_error_strerror(UDS_CORRUPT_DATA, + "inconsistent offsets (start %llu, volume %llu)", + (unsigned long long) super->start_offset, + (unsigned long long) super->volume_offset); + } + + /* Sub-indexes are no longer used but the layout retains this field. */ + if (super->index_count != 1) { + return uds_log_error_strerror(UDS_CORRUPT_DATA, + "invalid subindex count %u", + super->index_count); + } + + if (generate_primary_nonce(super->nonce_info, + sizeof(super->nonce_info)) != super->nonce) { + return uds_log_error_strerror(UDS_CORRUPT_DATA, + "inconsistent superblock nonce"); + } + + return UDS_SUCCESS; +} + +static void define_sub_index_nonce(struct index_layout *layout) +{ + struct sub_index_nonce_data { + uint64_t offset; + uint16_t index_id; + }; + struct sub_index_layout *sil = &layout->index; + uint64_t primary_nonce = layout->super.nonce; + byte buffer[sizeof(struct sub_index_nonce_data)] = { 0 }; + size_t offset = 0; + + encode_uint64_le(buffer, &offset, sil->sub_index.start_block); + encode_uint16_le(buffer, &offset, 0); + sil->nonce = generate_secondary_nonce(primary_nonce, + buffer, + sizeof(buffer)); + if (sil->nonce == 0) { + sil->nonce = generate_secondary_nonce(~primary_nonce + 1, + buffer, + sizeof(buffer)); + } +} + +static int __must_check verify_region(struct layout_region *lr, + uint64_t start_block, + enum region_kind kind, + unsigned int instance) +{ + if (lr->start_block != start_block) { + return uds_log_error_strerror(UDS_CORRUPT_DATA, + "incorrect layout region offset"); + } + + if ((lr->kind != kind)) { + return uds_log_error_strerror(UDS_CORRUPT_DATA, + "incorrect layout region kind"); + } + + if (lr->instance != instance) { + return uds_log_error_strerror(UDS_CORRUPT_DATA, + "incorrect layout region instance"); + } + + return UDS_SUCCESS; +} + +static int __must_check verify_sub_index(struct index_layout *layout, + uint64_t start_block, + struct region_table *table) +{ + int result; + unsigned int i; + struct sub_index_layout *sil = &layout->index; + uint64_t next_block = start_block; + + sil->sub_index = table->regions[2]; + result = verify_region(&sil->sub_index, + next_block, + RL_KIND_INDEX, + 0); + if (result != UDS_SUCCESS) { + return result; + } + + define_sub_index_nonce(layout); + + sil->volume = table->regions[3]; 
+ result = verify_region(&sil->volume, + next_block, + RL_KIND_VOLUME, + RL_SOLE_INSTANCE); + if (result != UDS_SUCCESS) { + return result; + } + + next_block += sil->volume.block_count + layout->super.volume_offset; + + for (i = 0; i < layout->super.max_saves; i++) { + sil->saves[i].index_save = table->regions[i + 4]; + result = verify_region(&sil->saves[i].index_save, + next_block, + RL_KIND_SAVE, + i); + if (result != UDS_SUCCESS) { + return result; + } + + next_block += sil->saves[i].index_save.block_count; + } + + next_block -= layout->super.volume_offset; + if (next_block != start_block + sil->sub_index.block_count) { + return uds_log_error_strerror(UDS_CORRUPT_DATA, + "sub index region does not span all saves"); + } + + return UDS_SUCCESS; +} + +static int __must_check reconstitute_layout(struct index_layout *layout, + struct region_table *table, + uint64_t first_block) +{ + int result; + uint64_t next_block = first_block; + + result = UDS_ALLOCATE(layout->super.max_saves, + struct index_save_layout, + __func__, + &layout->index.saves); + if (result != UDS_SUCCESS) { + return result; + } + + layout->total_blocks = table->header.region_blocks; + + layout->header = table->regions[0]; + result = verify_region(&layout->header, + next_block++, + RL_KIND_HEADER, + RL_SOLE_INSTANCE); + if (result != UDS_SUCCESS) { + return result; + } + + layout->config = table->regions[1]; + result = verify_region(&layout->config, + next_block++, + RL_KIND_CONFIG, + RL_SOLE_INSTANCE); + if (result != UDS_SUCCESS) { + return result; + } + + result = verify_sub_index(layout, next_block, table); + if (result != UDS_SUCCESS) { + return result; + } + next_block += layout->index.sub_index.block_count; + + layout->seal = table->regions[table->header.region_count - 1]; + result = verify_region(&layout->seal, + next_block + layout->super.volume_offset, + RL_KIND_SEAL, + RL_SOLE_INSTANCE); + if (result != UDS_SUCCESS) { + return result; + } + + if (++next_block != (first_block + layout->total_blocks)) { + return uds_log_error_strerror(UDS_CORRUPT_DATA, + "layout table does not span total blocks"); + } + + return UDS_SUCCESS; +} + +static int __must_check load_super_block(struct index_layout *layout, + size_t block_size, + uint64_t first_block, + struct buffered_reader *reader) +{ + int result; + struct region_table *table = NULL; + struct super_block_data *super = &layout->super; + + result = load_region_table(reader, &table); + if (result != UDS_SUCCESS) { + return result; + } + + if (table->header.type != RH_TYPE_SUPER) { + UDS_FREE(table); + return uds_log_error_strerror(UDS_CORRUPT_DATA, + "not a superblock region table"); + } + + result = read_super_block_data(reader, layout, table->header.payload); + if (result != UDS_SUCCESS) { + UDS_FREE(table); + return uds_log_error_strerror(result, + "unknown superblock format"); + } + + if (super->block_size != block_size) { + UDS_FREE(table); + return uds_log_error_strerror(UDS_CORRUPT_DATA, + "superblock saved block_size %u differs from supplied block_size %zu", + super->block_size, + block_size); + } + + first_block -= (super->volume_offset - super->start_offset); + result = reconstitute_layout(layout, table, first_block); + UDS_FREE(table); + return result; +} + +static int __must_check +decode_index_save_data(struct buffer *buffer, + struct index_save_data *save_data) +{ + int result; + + result = get_uint64_le_from_buffer(buffer, &save_data->timestamp); + if (result != UDS_SUCCESS) { + return result; + } + result = get_uint64_le_from_buffer(buffer, 
&save_data->nonce); + if (result != UDS_SUCCESS) { + return result; + } + result = get_uint32_le_from_buffer(buffer, &save_data->version); + if (result != UDS_SUCCESS) { + return result; + } + return skip_forward(buffer, sizeof(uint32_t)); +} + +static int decode_index_state_data(struct buffer *buffer, + struct index_state_data301 *state_data) +{ + int result; + struct index_state_version file_version; + + result = get_int32_le_from_buffer(buffer, &file_version.signature); + if (result != UDS_SUCCESS) { + return result; + } + + result = get_int32_le_from_buffer(buffer, &file_version.version_id); + if (result != UDS_SUCCESS) { + return result; + } + + if ((file_version.signature != INDEX_STATE_VERSION_301.signature) || + (file_version.version_id != INDEX_STATE_VERSION_301.version_id)) { + return uds_log_error_strerror(UDS_UNSUPPORTED_VERSION, + "index state version %d,%d is unsupported", + file_version.signature, + file_version.version_id); + } + + result = get_uint64_le_from_buffer(buffer, + &state_data->newest_chapter); + if (result != UDS_SUCCESS) { + return result; + } + + result = get_uint64_le_from_buffer(buffer, + &state_data->oldest_chapter); + if (result != UDS_SUCCESS) { + return result; + } + + result = get_uint64_le_from_buffer(buffer, &state_data->last_save); + if (result != UDS_SUCCESS) { + return result; + } + + result = skip_forward(buffer, sizeof(uint32_t)); + if (result != UDS_SUCCESS) { + return result; + } + + return skip_forward(buffer, sizeof(uint32_t)); +} + +static int __must_check read_index_save_data(struct buffered_reader *reader, + struct index_save_layout *isl, + size_t saved_size) +{ + int result; + struct buffer *buffer = NULL; + uint16_t payload_size = (sizeof(struct index_save_data) + + sizeof(struct index_state_data301)); + + if (saved_size != payload_size) { + return uds_log_error_strerror(UDS_CORRUPT_DATA, + "unexpected index save data size %zu", + saved_size); + } + + result = make_buffer(payload_size, &buffer); + if (result != UDS_SUCCESS) { + return result; + } + + result = read_from_buffered_reader(reader, + get_buffer_contents(buffer), + buffer_length(buffer)); + if (result != UDS_SUCCESS) { + free_buffer(UDS_FORGET(buffer)); + return uds_log_error_strerror(result, + "cannot read index save data"); + } + + result = reset_buffer_end(buffer, buffer_length(buffer)); + if (result != UDS_SUCCESS) { + free_buffer(UDS_FORGET(buffer)); + return result; + } + + result = decode_index_save_data(buffer, &isl->save_data); + if (result != UDS_SUCCESS) { + free_buffer(UDS_FORGET(buffer)); + return result; + } + + if (isl->save_data.version > 1) { + free_buffer(UDS_FORGET(buffer)); + return uds_log_error_strerror(UDS_UNSUPPORTED_VERSION, + "unknown index save version number %u", + isl->save_data.version); + } + + result = decode_index_state_data(buffer, &isl->state_data); + free_buffer(UDS_FORGET(buffer)); + return result; +} + +static void populate_index_save_layout(struct index_save_layout *isl, + struct super_block_data *super) +{ + unsigned int z; + uint64_t free_blocks; + uint64_t volume_index_blocks; + uint64_t next_block = isl->index_save.start_block; + + isl->header = (struct layout_region) { + .start_block = next_block++, + .block_count = 1, + .kind = RL_KIND_HEADER, + .instance = RL_SOLE_INSTANCE, + }; + + isl->index_page_map = (struct layout_region) { + .start_block = next_block, + .block_count = super->page_map_blocks, + .kind = RL_KIND_INDEX_PAGE_MAP, + .instance = RL_SOLE_INSTANCE, + }; + next_block += super->page_map_blocks; + + free_blocks = 
(isl->index_save.block_count - 1 - + super->page_map_blocks - + super->open_chapter_blocks); + volume_index_blocks = free_blocks / isl->zone_count; + for (z = 0; z < isl->zone_count; ++z) { + isl->volume_index_zones[z] = (struct layout_region) { + .start_block = next_block, + .block_count = volume_index_blocks, + .kind = RL_KIND_VOLUME_INDEX, + .instance = z, + }; + + next_block += volume_index_blocks; + free_blocks -= volume_index_blocks; + } + + isl->open_chapter = (struct layout_region) { + .start_block = next_block, + .block_count = super->open_chapter_blocks, + .kind = RL_KIND_OPEN_CHAPTER, + .instance = RL_SOLE_INSTANCE, + }; + + next_block += super->open_chapter_blocks; + + isl->free_space = (struct layout_region) { + .start_block = next_block, + .block_count = free_blocks, + .kind = RL_KIND_EMPTY, + .instance = RL_SOLE_INSTANCE, + }; +} + +static int __must_check reconstruct_index_save(struct index_save_layout *isl, + struct region_table *table) +{ + int result; + unsigned int z; + struct layout_region *last_region; + uint64_t next_block = isl->index_save.start_block; + uint64_t last_block = next_block + isl->index_save.block_count; + + isl->zone_count = table->header.region_count - 3; + + last_region = &table->regions[table->header.region_count - 1]; + if (last_region->kind == RL_KIND_EMPTY) { + isl->free_space = *last_region; + isl->zone_count--; + } else { + isl->free_space = (struct layout_region) { + .start_block = last_block, + .block_count = 0, + .kind = RL_KIND_EMPTY, + .instance = RL_SOLE_INSTANCE, + }; + } + + isl->header = table->regions[0]; + result = verify_region(&isl->header, + next_block++, + RL_KIND_HEADER, + RL_SOLE_INSTANCE); + if (result != UDS_SUCCESS) { + return result; + } + + isl->index_page_map = table->regions[1]; + result = verify_region(&isl->index_page_map, + next_block, + RL_KIND_INDEX_PAGE_MAP, + RL_SOLE_INSTANCE); + if (result != UDS_SUCCESS) { + return result; + } + + next_block += isl->index_page_map.block_count; + + for (z = 0; z < isl->zone_count; z++) { + isl->volume_index_zones[z] = table->regions[z + 2]; + result = verify_region(&isl->volume_index_zones[z], + next_block, + RL_KIND_VOLUME_INDEX, + z); + if (result != UDS_SUCCESS) { + return result; + } + + next_block += isl->volume_index_zones[z].block_count; + } + + isl->open_chapter = table->regions[isl->zone_count + 2]; + result = verify_region(&isl->open_chapter, + next_block, + RL_KIND_OPEN_CHAPTER, + RL_SOLE_INSTANCE); + if (result != UDS_SUCCESS) { + return result; + } + + next_block += isl->open_chapter.block_count; + + result = verify_region(&isl->free_space, + next_block, + RL_KIND_EMPTY, + RL_SOLE_INSTANCE); + if (result != UDS_SUCCESS) { + return result; + } + + next_block += isl->free_space.block_count; + if (next_block != last_block) { + return uds_log_error_strerror(UDS_CORRUPT_DATA, + "index save layout table incomplete"); + } + + return UDS_SUCCESS; +} + +static void reset_index_save_layout(struct index_save_layout *isl, + uint64_t page_map_blocks) +{ + uint64_t free_blocks; + uint64_t next_block = isl->index_save.start_block; + + isl->zone_count = 0; + memset(&isl->save_data, 0, sizeof(isl->save_data)); + + isl->header = (struct layout_region) { + .start_block = next_block++, + .block_count = 1, + .kind = RL_KIND_HEADER, + .instance = RL_SOLE_INSTANCE, + }; + + isl->index_page_map = (struct layout_region) { + .start_block = next_block, + .block_count = page_map_blocks, + .kind = RL_KIND_INDEX_PAGE_MAP, + .instance = RL_SOLE_INSTANCE, + }; + + next_block += page_map_blocks; + + 
free_blocks = isl->index_save.block_count - page_map_blocks - 1; + isl->free_space = (struct layout_region) { + .start_block = next_block, + .block_count = free_blocks, + .kind = RL_KIND_EMPTY, + .instance = RL_SOLE_INSTANCE, + }; +} + +static int __must_check load_index_save(struct index_save_layout *isl, + struct buffered_reader *reader, + unsigned int instance) +{ + int result; + struct region_table *table = NULL; + + result = load_region_table(reader, &table); + if (result != UDS_SUCCESS) { + return uds_log_error_strerror(result, + "cannot read index save %u header", + instance); + } + + if (table->header.region_blocks != isl->index_save.block_count) { + uint64_t region_blocks = table->header.region_blocks; + + UDS_FREE(table); + return uds_log_error_strerror(UDS_CORRUPT_DATA, + "unexpected index save %u region block count %llu", + instance, + (unsigned long long) region_blocks); + } + + if (table->header.type == RH_TYPE_UNSAVED) { + UDS_FREE(table); + reset_index_save_layout(isl, 0); + return UDS_SUCCESS; + } + + + if (table->header.type != RH_TYPE_SAVE) { + UDS_FREE(table); + return uds_log_error_strerror(UDS_CORRUPT_DATA, + "unexpected index save %u header type %u", + instance, + table->header.type); + } + + result = read_index_save_data(reader, isl, table->header.payload); + if (result != UDS_SUCCESS) { + UDS_FREE(table); + return uds_log_error_strerror(result, + "unknown index save %u data format", + instance); + } + + result = reconstruct_index_save(isl, table); + UDS_FREE(table); + if (result != UDS_SUCCESS) { + return uds_log_error_strerror(result, + "cannot reconstruct index save %u", + instance); + } + + return UDS_SUCCESS; +} + +static int __must_check load_sub_index_regions(struct index_layout *layout) +{ + int result; + unsigned int j; + struct index_save_layout *isl; + struct buffered_reader *reader; + + for (j = 0; j < layout->super.max_saves; ++j) { + isl = &layout->index.saves[j]; + result = open_region_reader(layout, + &isl->index_save, + &reader); + + if (result != UDS_SUCCESS) { + uds_log_error_strerror(result, + "cannot get reader for index 0 save %u", + j); + return result; + } + + result = load_index_save(isl, reader, j); + free_buffered_reader(reader); + if (result != UDS_SUCCESS) { + /* Another save slot might be valid. 
*/ + reset_index_save_layout(isl, 0); + continue; + } + } + + return UDS_SUCCESS; +} + +static int __must_check +verify_uds_index_config(struct index_layout *layout, + struct configuration *config) +{ + int result; + struct buffered_reader *reader = NULL; + uint64_t offset = layout->super.volume_offset - + layout->super.start_offset; + + result = open_layout_reader(layout, + &layout->config, + offset, + &reader); + if (result != UDS_SUCCESS) { + return uds_log_error_strerror(result, + "failed to open config reader"); + } + + result = validate_config_contents(reader, config); + if (result != UDS_SUCCESS) { + free_buffered_reader(reader); + return uds_log_error_strerror(result, + "failed to read config region"); + } + free_buffered_reader(reader); + return UDS_SUCCESS; +} + +static int load_index_layout(struct index_layout *layout, + struct configuration *config) +{ + int result; + struct buffered_reader *reader; + + result = open_uds_buffered_reader(layout->factory, + layout->offset, + UDS_BLOCK_SIZE, + &reader); + if (result != UDS_SUCCESS) { + return uds_log_error_strerror(result, + "unable to read superblock"); + } + + result = load_super_block(layout, + UDS_BLOCK_SIZE, + layout->offset / UDS_BLOCK_SIZE, + reader); + free_buffered_reader(reader); + if (result != UDS_SUCCESS) { + return result; + } + + result = load_sub_index_regions(layout); + if (result != UDS_SUCCESS) { + return result; + } + + return verify_uds_index_config(layout, config); +} + +static void generate_super_block_data(struct save_layout_sizes *sls, + struct super_block_data *super) +{ + memset(super, 0, sizeof(*super)); + memcpy(super->magic_label, LAYOUT_MAGIC, MAGIC_SIZE); + create_unique_nonce_data(super->nonce_info); + + super->nonce = generate_primary_nonce(super->nonce_info, + sizeof(super->nonce_info)); + super->version = SUPER_VERSION_CURRENT; + super->block_size = sls->block_size; + super->index_count = 1; + super->max_saves = sls->save_count; + super->open_chapter_blocks = sls->open_chapter_blocks; + super->page_map_blocks = sls->page_map_blocks; + super->volume_offset = 0; + super->start_offset = 0; +} + +static void setup_sub_index(struct index_layout *layout, + uint64_t start_block, + struct save_layout_sizes *sls) +{ + struct sub_index_layout *sil = &layout->index; + uint64_t next_block = start_block; + unsigned int i; + + sil->sub_index = (struct layout_region) { + .start_block = start_block, + .block_count = sls->sub_index_blocks, + .kind = RL_KIND_INDEX, + .instance = 0, + }; + + sil->volume = (struct layout_region) { + .start_block = next_block, + .block_count = sls->volume_blocks, + .kind = RL_KIND_VOLUME, + .instance = RL_SOLE_INSTANCE, + }; + + next_block += sls->volume_blocks; + + for (i = 0; i < sls->save_count; ++i) { + sil->saves[i].index_save = (struct layout_region) { + .start_block = next_block, + .block_count = sls->save_blocks, + .kind = RL_KIND_SAVE, + .instance = i, + }; + + next_block += sls->save_blocks; + } + + define_sub_index_nonce(layout); +} + +static void initialize_layout(struct index_layout *layout, + struct save_layout_sizes *sls) +{ + uint64_t next_block = layout->offset / sls->block_size; + + layout->total_blocks = sls->total_blocks; + generate_super_block_data(sls, &layout->super); + layout->header = (struct layout_region) { + .start_block = next_block++, + .block_count = 1, + .kind = RL_KIND_HEADER, + .instance = RL_SOLE_INSTANCE, + }; + + layout->config = (struct layout_region) { + .start_block = next_block++, + .block_count = 1, + .kind = RL_KIND_CONFIG, + .instance = 
RL_SOLE_INSTANCE, + }; + + setup_sub_index(layout, next_block, sls); + next_block += sls->sub_index_blocks; + + layout->seal = (struct layout_region) { + .start_block = next_block, + .block_count = 1, + .kind = RL_KIND_SEAL, + .instance = RL_SOLE_INSTANCE, + }; +} + +static int __must_check +make_layout_region_table(struct index_layout *layout, + struct region_table **table_ptr) +{ + int result; + unsigned int i; + /* Regions: header, config, index, volume, saves, seal */ + uint16_t region_count = 5 + layout->super.max_saves; + uint16_t payload; + struct region_table *table; + struct layout_region *lr; + + result = UDS_ALLOCATE_EXTENDED(struct region_table, + region_count, + struct layout_region, + "layout region table", + &table); + if (result != UDS_SUCCESS) { + return result; + } + + lr = &table->regions[0]; + *lr++ = layout->header; + *lr++ = layout->config; + *lr++ = layout->index.sub_index; + *lr++ = layout->index.volume; + + for (i = 0; i < layout->super.max_saves; i++) { + *lr++ = layout->index.saves[i].index_save; + } + + *lr++ = layout->seal; + + if (is_converted_super_block(&layout->super)) { + payload = sizeof(struct super_block_data); + } else { + payload = (sizeof(struct super_block_data) - + sizeof(layout->super.volume_offset) - + sizeof(layout->super.start_offset)); + } + + table->header = (struct region_header) { + .magic = REGION_MAGIC, + .region_blocks = layout->total_blocks, + .type = RH_TYPE_SUPER, + .version = 1, + .region_count = region_count, + .payload = payload, + }; + + *table_ptr = table; + return UDS_SUCCESS; +} + +static int __must_check +encode_index_save_data(struct buffer *buffer, + struct index_save_data *save_data) +{ + int result; + + result = put_uint64_le_into_buffer(buffer, save_data->timestamp); + if (result != UDS_SUCCESS) { + return result; + } + + result = put_uint64_le_into_buffer(buffer, save_data->nonce); + if (result != UDS_SUCCESS) { + return result; + } + + result = put_uint32_le_into_buffer(buffer, save_data->version); + if (result != UDS_SUCCESS) { + return result; + } + + return zero_bytes(buffer, sizeof(uint32_t)); +} + +static int __must_check +encode_index_state_data(struct buffer *buffer, + struct index_state_data301 *state_data) +{ + int result; + + result = put_uint32_le_into_buffer(buffer, + INDEX_STATE_VERSION_301.signature); + if (result != UDS_SUCCESS) { + return result; + } + + result = put_uint32_le_into_buffer(buffer, + INDEX_STATE_VERSION_301.version_id); + if (result != UDS_SUCCESS) { + return result; + } + + result = put_uint64_le_into_buffer(buffer, state_data->newest_chapter); + if (result != UDS_SUCCESS) { + return result; + } + + result = put_uint64_le_into_buffer(buffer, state_data->oldest_chapter); + if (result != UDS_SUCCESS) { + return result; + } + + result = put_uint64_le_into_buffer(buffer, state_data->last_save); + if (result != UDS_SUCCESS) { + return result; + } + + result = zero_bytes(buffer, sizeof(uint32_t)); + if (result != UDS_SUCCESS) { + return result; + } + + return zero_bytes(buffer, sizeof(uint32_t)); +} + +static int __must_check +encode_region_header(struct buffer *buffer, struct region_header *header) +{ + int result; + + result = put_uint64_le_into_buffer(buffer, REGION_MAGIC); + if (result != UDS_SUCCESS) { + return result; + } + + result = put_uint64_le_into_buffer(buffer, header->region_blocks); + if (result != UDS_SUCCESS) { + return result; + } + + result = put_uint16_le_into_buffer(buffer, header->type); + if (result != UDS_SUCCESS) { + return result; + } + + result = 
put_uint16_le_into_buffer(buffer, header->version); + if (result != UDS_SUCCESS) { + return result; + } + + result = put_uint16_le_into_buffer(buffer, header->region_count); + if (result != UDS_SUCCESS) { + return result; + } + + return put_uint16_le_into_buffer(buffer, header->payload); +} + +static int __must_check +encode_layout_region(struct buffer *buffer, struct layout_region *region) +{ + int result; + + result = put_uint64_le_into_buffer(buffer, region->start_block); + if (result != UDS_SUCCESS) { + return result; + } + + result = put_uint64_le_into_buffer(buffer, region->block_count); + if (result != UDS_SUCCESS) { + return result; + } + + result = zero_bytes(buffer, 4); + if (result != UDS_SUCCESS) { + return result; + } + + result = put_uint16_le_into_buffer(buffer, region->kind); + if (result != UDS_SUCCESS) { + return result; + } + + return put_uint16_le_into_buffer(buffer, region->instance); +} + +static int __must_check encode_super_block_data(struct buffer *buffer, + struct super_block_data *super) +{ + int result; + + result = put_bytes(buffer, MAGIC_SIZE, &super->magic_label); + if (result != UDS_SUCCESS) { + return result; + } + + result = put_bytes(buffer, NONCE_INFO_SIZE, &super->nonce_info); + if (result != UDS_SUCCESS) { + return result; + } + + result = put_uint64_le_into_buffer(buffer, super->nonce); + if (result != UDS_SUCCESS) { + return result; + } + + result = put_uint32_le_into_buffer(buffer, super->version); + if (result != UDS_SUCCESS) { + return result; + } + + result = put_uint32_le_into_buffer(buffer, super->block_size); + if (result != UDS_SUCCESS) { + return result; + } + + result = put_uint16_le_into_buffer(buffer, super->index_count); + if (result != UDS_SUCCESS) { + return result; + } + + result = put_uint16_le_into_buffer(buffer, super->max_saves); + if (result != UDS_SUCCESS) { + return result; + } + + result = zero_bytes(buffer, 4); + if (result != UDS_SUCCESS) { + return result; + } + + result = put_uint64_le_into_buffer(buffer, super->open_chapter_blocks); + if (result != UDS_SUCCESS) { + return result; + } + + result = put_uint64_le_into_buffer(buffer, super->page_map_blocks); + if (result != UDS_SUCCESS) { + return result; + } + + if (is_converted_super_block(super)) { + result = put_uint64_le_into_buffer(buffer, + super->volume_offset); + if (result != UDS_SUCCESS) { + return result; + } + + result = put_uint64_le_into_buffer(buffer, + super->start_offset); + if (result != UDS_SUCCESS) { + return result; + } + } + + return UDS_SUCCESS; +} + +static int __must_check +make_index_save_region_table(struct index_save_layout *isl, + struct region_table **table_ptr) +{ + int result; + unsigned int z; + struct region_table *table; + struct layout_region *lr; + uint16_t region_count; + size_t payload; + size_t type; + + if (isl->zone_count > 0) { + /* + * Normal save regions: header, page map, volume index zones, + * open chapter, and possibly free space. + */ + region_count = 3 + isl->zone_count; + if (isl->free_space.block_count > 0) { + region_count++; + } + + payload = sizeof(isl->save_data) + sizeof(isl->state_data); + type = RH_TYPE_SAVE; + } else { + /* Empty save regions: header, page map, free space. 
*/ + region_count = 3; + payload = sizeof(isl->save_data); + type = RH_TYPE_UNSAVED; + } + + result = UDS_ALLOCATE_EXTENDED(struct region_table, + region_count, + struct layout_region, + "layout region table for ISL", + &table); + if (result != UDS_SUCCESS) { + return result; + } + + lr = &table->regions[0]; + *lr++ = isl->header; + *lr++ = isl->index_page_map; + for (z = 0; z < isl->zone_count; ++z) { + *lr++ = isl->volume_index_zones[z]; + } + + if (isl->zone_count > 0) { + *lr++ = isl->open_chapter; + } + + if (isl->free_space.block_count > 0) { + *lr++ = isl->free_space; + } + + table->header = (struct region_header) { + .magic = REGION_MAGIC, + .region_blocks = isl->index_save.block_count, + .type = type, + .version = 1, + .region_count = region_count, + .payload = payload, + }; + + *table_ptr = table; + return UDS_SUCCESS; +} + +static int __must_check +write_index_save_header(struct index_save_layout *isl, + struct region_table *table, + struct buffered_writer *writer) +{ + unsigned int i; + struct buffer *buffer; + int result; + size_t table_size = sizeof(struct region_table) + + table->header.region_count * sizeof(struct layout_region); + + result = make_buffer(table_size + table->header.payload, &buffer); + if (result != UDS_SUCCESS) { + return result; + } + + result = encode_region_header(buffer, &table->header); + if (result != UDS_SUCCESS) { + free_buffer(UDS_FORGET(buffer)); + return result; + } + + for (i = 0; i < table->header.region_count; i++) { + result = encode_layout_region(buffer, &table->regions[i]); + if (result != UDS_SUCCESS) { + free_buffer(UDS_FORGET(buffer)); + return result; + } + } + + result = encode_index_save_data(buffer, &isl->save_data); + if (result != UDS_SUCCESS) { + free_buffer(UDS_FORGET(buffer)); + return result; + } + + if (isl->zone_count > 0) { + result = encode_index_state_data(buffer, &isl->state_data); + if (result != UDS_SUCCESS) { + free_buffer(UDS_FORGET(buffer)); + return result; + } + } + + result = write_to_buffered_writer(writer, + get_buffer_contents(buffer), + content_length(buffer)); + free_buffer(UDS_FORGET(buffer)); + if (result != UDS_SUCCESS) { + return result; + } + + return flush_buffered_writer(writer); +} + +static int write_index_save_layout(struct index_layout *layout, + struct index_save_layout *isl) +{ + struct region_table *table; + struct buffered_writer *writer = NULL; + int result = make_index_save_region_table(isl, &table); + + if (result != UDS_SUCCESS) { + return result; + } + + result = open_region_writer(layout, &isl->header, &writer); + if (result != UDS_SUCCESS) { + UDS_FREE(table); + return result; + } + + result = write_index_save_header(isl, table, writer); + UDS_FREE(table); + free_buffered_writer(writer); + + return result; +} + +static int __must_check write_layout_header(struct index_layout *layout, + struct region_table *table, + struct buffered_writer *writer) +{ + int result; + unsigned int i; + size_t table_size = sizeof(struct region_table) + + table->header.region_count * sizeof(struct layout_region); + struct buffer *buffer; + + result = make_buffer(table_size + table->header.payload, &buffer); + if (result != UDS_SUCCESS) { + return result; + } + + result = encode_region_header(buffer, &table->header); + if (result != UDS_SUCCESS) { + free_buffer(UDS_FORGET(buffer)); + return result; + } + + for (i = 0; i < table->header.region_count; i++) { + result = encode_layout_region(buffer, &table->regions[i]); + if (result != UDS_SUCCESS) { + free_buffer(UDS_FORGET(buffer)); + return result; + } + } + + 
result = encode_super_block_data(buffer, &layout->super); + if (result != UDS_SUCCESS) { + free_buffer(UDS_FORGET(buffer)); + return result; + } + + result = write_to_buffered_writer(writer, + get_buffer_contents(buffer), + content_length(buffer)); + free_buffer(UDS_FORGET(buffer)); + if (result != UDS_SUCCESS) { + return result; + } + + return flush_buffered_writer(writer); +} + +static int __must_check save_layout(struct index_layout *layout, off_t offset) +{ + int result; + struct buffered_writer *writer = NULL; + struct region_table *table; + + result = make_layout_region_table(layout, &table); + if (result != UDS_SUCCESS) { + return result; + } + + result = open_layout_writer(layout, &layout->header, offset, &writer); + if (result != UDS_SUCCESS) { + UDS_FREE(table); + return result; + } + + result = write_layout_header(layout, table, writer); + UDS_FREE(table); + free_buffered_writer(writer); + + return result; +} + +static int __must_check +write_uds_index_config(struct index_layout *layout, + struct configuration *config, + off_t offset) +{ + int result; + struct buffered_writer *writer = NULL; + + result = open_layout_writer(layout, &layout->config, offset, &writer); + if (result != UDS_SUCCESS) { + return uds_log_error_strerror(result, + "failed to open config region"); + } + + result = write_config_contents(writer, config, layout->super.version); + if (result != UDS_SUCCESS) { + free_buffered_writer(writer); + return uds_log_error_strerror(result, + "failed to write config region"); + } + result = flush_buffered_writer(writer); + if (result != UDS_SUCCESS) { + free_buffered_writer(writer); + return uds_log_error_strerror(result, + "cannot flush config writer"); + } + free_buffered_writer(writer); + return UDS_SUCCESS; +} + +static int create_index_layout(struct index_layout *layout, + struct configuration *config) +{ + int result; + struct save_layout_sizes sizes; + + result = compute_sizes(config, &sizes); + if (result != UDS_SUCCESS) { + return result; + } + + result = UDS_ALLOCATE(sizes.save_count, + struct index_save_layout, + __func__, + &layout->index.saves); + if (result != UDS_SUCCESS) { + return result; + } + + initialize_layout(layout, &sizes); + + result = discard_index_state_data(layout); + if (result != UDS_SUCCESS) { + return result; + } + + result = save_layout(layout, 0); + if (result != UDS_SUCCESS) { + return result; + } + + return write_uds_index_config(layout, config, 0); +} + +static int create_layout_factory(struct index_layout *layout, + const struct configuration *config, + bool new_layout) +{ + int result; + size_t writable_size; + struct io_factory *factory = NULL; + + result = make_uds_io_factory(config->name, &factory); + if (result != UDS_SUCCESS) { + return result; + } + + writable_size = get_uds_writable_size(factory) & -UDS_BLOCK_SIZE; + if (writable_size < config->size + config->offset) { + put_uds_io_factory(factory); + uds_log_error("index storage (%zu) is smaller than the requested size %zu", + writable_size, + config->size + config->offset); + return -ENOSPC; + } + + layout->factory = factory; + layout->factory_size = + (config->size > 0) ? 
config->size : writable_size; + layout->offset = config->offset; + return UDS_SUCCESS; +} + +int make_uds_index_layout(struct configuration *config, + bool new_layout, + struct index_layout **layout_ptr) +{ + int result; + struct index_layout *layout = NULL; + struct save_layout_sizes sizes; + + result = compute_sizes(config, &sizes); + if (result != UDS_SUCCESS) { + return result; + } + + result = UDS_ALLOCATE(1, struct index_layout, __func__, &layout); + if (result != UDS_SUCCESS) { + return result; + } + + result = create_layout_factory(layout, config, new_layout); + if (result != UDS_SUCCESS) { + free_uds_index_layout(layout); + return result; + } + + if (layout->factory_size < sizes.total_size) { + uds_log_error("index storage (%zu) is smaller than the required size %llu", + layout->factory_size, + (unsigned long long) sizes.total_size); + free_uds_index_layout(layout); + return -ENOSPC; + } + + if (new_layout) { + result = create_index_layout(layout, config); + } else { + result = load_index_layout(layout, config); + } + if (result != UDS_SUCCESS) { + free_uds_index_layout(layout); + return result; + } + + *layout_ptr = layout; + return UDS_SUCCESS; +} + +void free_uds_index_layout(struct index_layout *layout) +{ + if (layout == NULL) { + return; + } + + UDS_FREE(layout->index.saves); + if (layout->factory != NULL) { + put_uds_io_factory(layout->factory); + } + + UDS_FREE(layout); +} + +int replace_index_layout_storage(struct index_layout *layout, + const char *name) +{ + return replace_uds_storage(layout->factory, name); +} + +/* Obtain a dm_bufio_client for the volume region. */ +int open_uds_volume_bufio(struct index_layout *layout, + size_t block_size, + unsigned int reserved_buffers, + struct dm_bufio_client **client_ptr) +{ + off_t offset = (layout->index.volume.start_block + + layout->super.volume_offset - + layout->super.start_offset) * + layout->super.block_size; + + return make_uds_bufio(layout->factory, + offset, + block_size, + reserved_buffers, + client_ptr); +} + +uint64_t get_uds_volume_nonce(struct index_layout *layout) +{ + return layout->index.nonce; +} + +static uint64_t generate_index_save_nonce(uint64_t volume_nonce, + struct index_save_layout *isl) +{ + struct save_nonce_data { + struct index_save_data data; + uint64_t offset; + } nonce_data; + byte buffer[sizeof(nonce_data)]; + size_t offset = 0; + + encode_uint64_le(buffer, &offset, isl->save_data.timestamp); + encode_uint64_le(buffer, &offset, 0); + encode_uint32_le(buffer, &offset, isl->save_data.version); + encode_uint32_le(buffer, &offset, 0U); + encode_uint64_le(buffer, &offset, isl->index_save.start_block); + ASSERT_LOG_ONLY(offset == sizeof(nonce_data), + "%zu bytes encoded of %zu expected", + offset, + sizeof(nonce_data)); + return generate_secondary_nonce(volume_nonce, buffer, sizeof(buffer)); +} + +static uint64_t validate_index_save_layout(struct index_save_layout *isl, + uint64_t volume_nonce) +{ + if ((isl->zone_count == 0) || (isl->save_data.timestamp == 0)) { + return 0; + } + if (isl->save_data.nonce != + generate_index_save_nonce(volume_nonce, isl)) { + return 0; + } + + return isl->save_data.timestamp; +} + +static void select_oldest_index_save_layout(struct sub_index_layout *sil, + unsigned int max_saves, + struct index_save_layout **isl_ptr) +{ + struct index_save_layout *oldest = NULL; + uint64_t save_time = 0; + uint64_t oldest_time = 0; + struct index_save_layout *isl; + + for (isl = sil->saves; isl < sil->saves + max_saves; ++isl) { + save_time = validate_index_save_layout(isl, 
sil->nonce); + if (oldest == NULL || save_time < oldest_time) { + oldest = isl; + oldest_time = save_time; + } + } + + *isl_ptr = oldest; +} + +static int __must_check +select_latest_index_save_layout(struct sub_index_layout *sil, + unsigned int max_saves, + struct index_save_layout **isl_ptr) +{ + struct index_save_layout *latest = NULL; + uint64_t save_time = 0; + uint64_t latest_time = 0; + struct index_save_layout *isl; + + for (isl = sil->saves; isl < sil->saves + max_saves; ++isl) { + save_time = validate_index_save_layout(isl, sil->nonce); + if (save_time > latest_time) { + latest = isl; + latest_time = save_time; + } + } + + if (latest == NULL) { + uds_log_error("No valid index save found"); + return UDS_INDEX_NOT_SAVED_CLEANLY; + } + + *isl_ptr = latest; + return UDS_SUCCESS; +} + +static void instantiate_index_save_layout(struct index_save_layout *isl, + struct super_block_data *super, + uint64_t volume_nonce, + unsigned int zone_count) +{ + isl->zone_count = zone_count; + populate_index_save_layout(isl, super); + + memset(&isl->save_data, 0, sizeof(isl->save_data)); + isl->save_data.timestamp = + ktime_to_ms(current_time_ns(CLOCK_REALTIME)); + isl->save_data.version = 1; + isl->save_data.nonce = generate_index_save_nonce(volume_nonce, isl); +} + +static int __must_check +invalidate_old_save(struct index_layout *layout, struct index_save_layout *isl) +{ + reset_index_save_layout(isl, layout->super.page_map_blocks); + return write_index_save_layout(layout, isl); +} + +static int setup_uds_index_save_slot(struct index_layout *layout, + unsigned int zone_count, + struct index_save_layout **isl_ptr) +{ + int result; + struct sub_index_layout *sil = &layout->index; + struct index_save_layout *isl; + + select_oldest_index_save_layout(sil, layout->super.max_saves, &isl); + + result = invalidate_old_save(layout, isl); + if (result != UDS_SUCCESS) { + return result; + } + + instantiate_index_save_layout(isl, + &layout->super, + sil->nonce, + zone_count); + + *isl_ptr = isl; + return UDS_SUCCESS; +} + +static int find_latest_uds_index_save_slot(struct index_layout *layout, + struct index_save_layout **isl) +{ + return select_latest_index_save_layout(&layout->index, + layout->super.max_saves, + isl); +} + +static void cancel_uds_index_save(struct index_save_layout *isl) +{ + memset(&isl->save_data, 0, sizeof(isl->save_data)); + memset(&isl->state_data, 0, sizeof(isl->state_data)); + isl->zone_count = 0; +} + +int load_index_state(struct index_layout *layout, struct uds_index *index) +{ + int result; + unsigned int zone; + struct index_save_layout *isl; + struct buffered_reader *readers[MAX_ZONES]; + + result = find_latest_uds_index_save_slot(layout, &isl); + if (result != UDS_SUCCESS) { + return result; + } + + index->newest_virtual_chapter = isl->state_data.newest_chapter; + index->oldest_virtual_chapter = isl->state_data.oldest_chapter; + index->last_save = isl->state_data.last_save; + + result = open_region_reader(layout, &isl->open_chapter, &readers[0]); + if (result != UDS_SUCCESS) { + return result; + } + + result = load_open_chapters(index, readers[0]); + free_buffered_reader(readers[0]); + if (result != UDS_SUCCESS) { + return result; + } + + for (zone = 0; zone < isl->zone_count; zone++) { + result = open_region_reader(layout, + &isl->volume_index_zones[zone], + &readers[zone]); + if (result != UDS_SUCCESS) { + for (; zone > 0; zone--) { + free_buffered_reader(readers[zone - 1]); + } + + return result; + } + } + + result = load_volume_index(index->volume_index, + readers, + 
isl->zone_count); + for (zone = 0; zone < isl->zone_count; zone++) { + free_buffered_reader(readers[zone]); + } + if (result != UDS_SUCCESS) { + return result; + } + + result = open_region_reader(layout, &isl->index_page_map, &readers[0]); + if (result != UDS_SUCCESS) { + return result; + } + + result = read_index_page_map(index->volume->index_page_map, + readers[0]); + free_buffered_reader(readers[0]); + + return result; +} + +int save_index_state(struct index_layout *layout, struct uds_index *index) +{ + int result; + unsigned int zone; + struct index_save_layout *isl; + struct buffered_writer *writers[MAX_ZONES]; + + result = setup_uds_index_save_slot(layout, index->zone_count, &isl); + if (result != UDS_SUCCESS) { + return result; + } + + isl->state_data = (struct index_state_data301) { + .newest_chapter = index->newest_virtual_chapter, + .oldest_chapter = index->oldest_virtual_chapter, + .last_save = index->last_save, + }; + + result = open_region_writer(layout, &isl->open_chapter, &writers[0]); + if (result != UDS_SUCCESS) { + cancel_uds_index_save(isl); + return result; + } + + result = save_open_chapters(index, writers[0]); + free_buffered_writer(writers[0]); + if (result != UDS_SUCCESS) { + cancel_uds_index_save(isl); + return result; + } + + for (zone = 0; zone < index->zone_count; zone++) { + result = open_region_writer(layout, + &isl->volume_index_zones[zone], + &writers[zone]); + if (result != UDS_SUCCESS) { + for (; zone > 0; zone--) { + free_buffered_writer(writers[zone - 1]); + } + + cancel_uds_index_save(isl); + return result; + } + } + + result = save_volume_index(index->volume_index, + writers, + index->zone_count); + for (zone = 0; zone < index->zone_count; zone++) { + free_buffered_writer(writers[zone]); + } + if (result != UDS_SUCCESS) { + cancel_uds_index_save(isl); + return result; + } + + result = open_region_writer(layout, &isl->index_page_map, &writers[0]); + if (result != UDS_SUCCESS) { + cancel_uds_index_save(isl); + return result; + } + + result = write_index_page_map(index->volume->index_page_map, + writers[0]); + free_buffered_writer(writers[0]); + if (result != UDS_SUCCESS) { + cancel_uds_index_save(isl); + return result; + } + + return write_index_save_layout(layout, isl); +} + +int discard_index_state_data(struct index_layout *layout) +{ + unsigned int i; + int result; + int saved_result = UDS_SUCCESS; + struct sub_index_layout *sil = &layout->index; + + for (i = 0; i < layout->super.max_saves; ++i) { + result = invalidate_old_save(layout, &sil->saves[i]); + if (result != UDS_SUCCESS) { + saved_result = result; + } + } + + if (saved_result != UDS_SUCCESS) { + return uds_log_error_strerror(result, + "%s: cannot destroy all index saves", + __func__); + } + + return UDS_SUCCESS; +} + +int discard_open_chapter(struct index_layout *layout) +{ + int result; + struct index_save_layout *isl; + struct buffered_writer *writer; + + result = find_latest_uds_index_save_slot(layout, &isl); + if (result != UDS_SUCCESS) { + return result; + } + + result = open_region_writer(layout, &isl->open_chapter, &writer); + if (result != UDS_SUCCESS) { + return result; + } + + result = write_zeros_to_buffered_writer(writer, + UDS_BLOCK_SIZE); + if (result != UDS_SUCCESS) { + free_buffered_writer(writer); + return result; + } + + result = flush_buffered_writer(writer); + free_buffered_writer(writer); + return result; +} diff --git a/vdo/index-layout.h b/vdo/index-layout.h new file mode 100644 index 00000000..700df5b1 --- /dev/null +++ b/vdo/index-layout.h @@ -0,0 +1,42 @@ +/* 
SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright Red Hat + */ + +#ifndef INDEX_LAYOUT_H +#define INDEX_LAYOUT_H + +#include "buffer.h" +#include "config.h" +#include "io-factory.h" +#include "uds.h" + +struct index_layout; + +int __must_check make_uds_index_layout(struct configuration *config, + bool new_layout, + struct index_layout **layout_ptr); + +void free_uds_index_layout(struct index_layout *layout); + +int __must_check replace_index_layout_storage(struct index_layout *layout, + const char *name); + +int __must_check load_index_state(struct index_layout *layout, + struct uds_index *index); + +int __must_check save_index_state(struct index_layout *layout, + struct uds_index *index); + +int discard_index_state_data(struct index_layout *layout); + +int __must_check discard_open_chapter(struct index_layout *layout); + +uint64_t __must_check get_uds_volume_nonce(struct index_layout *layout); + +int __must_check open_uds_volume_bufio(struct index_layout *layout, + size_t block_size, + unsigned int reserved_buffers, + struct dm_bufio_client **client_ptr); + +#endif /* INDEX_LAYOUT_H */ diff --git a/vdo/index-page-map.c b/vdo/index-page-map.c new file mode 100644 index 00000000..e33af4e1 --- /dev/null +++ b/vdo/index-page-map.c @@ -0,0 +1,231 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright Red Hat + */ + +#include "index-page-map.h" + +#include "buffer.h" +#include "compiler.h" +#include "errors.h" +#include "hash-utils.h" +#include "logger.h" +#include "memory-alloc.h" +#include "permassert.h" +#include "string-utils.h" +#include "uds-threads.h" +#include "uds.h" + +/* + * Each volume maintains an index page map which records how the chapter delta + * lists are distributed among the index pages for that chapter. + * + * The map is conceptually a two-dimensional array indexed by chapter number + * and index page number within the chapter. Each entry contains the number + * of the last delta list on that index page. In order to save memory, the + * information for the last page in each chapter is not recorded, as it is + * known from the geometry. 
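+ *
+ * For example (illustrative values only): with four index pages per chapter,
+ * only three entries are recorded for each chapter. If those entries are 63,
+ * 127, and 191, then index page 0 covers delta lists 0-63, page 1 covers
+ * 64-127, page 2 covers 128-191, and the unrecorded final page covers
+ * everything from 192 up to the last delta list in the chapter; a name whose
+ * delta list is 100 therefore lands on index page 1.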
+ */ + +static const byte PAGE_MAP_MAGIC[] = "ALBIPM02"; + +enum { + PAGE_MAP_MAGIC_LENGTH = sizeof(PAGE_MAP_MAGIC) - 1, +}; + +static INLINE size_t get_entry_count(const struct geometry *geometry) +{ + return (geometry->chapters_per_volume * + (geometry->index_pages_per_chapter - 1)); +} + +int make_index_page_map(const struct geometry *geometry, + struct index_page_map **map_ptr) +{ + int result; + struct index_page_map *map; + + result = UDS_ALLOCATE(1, struct index_page_map, "page map", &map); + if (result != UDS_SUCCESS) { + return result; + } + + map->geometry = geometry; + map->entries_per_chapter = geometry->index_pages_per_chapter - 1; + result = UDS_ALLOCATE(get_entry_count(geometry), + uint16_t, + "Index Page Map Entries", + &map->entries); + if (result != UDS_SUCCESS) { + free_index_page_map(map); + return result; + } + + *map_ptr = map; + return UDS_SUCCESS; +} + +void free_index_page_map(struct index_page_map *map) +{ + if (map != NULL) { + UDS_FREE(map->entries); + UDS_FREE(map); + } +} + +void update_index_page_map(struct index_page_map *map, + uint64_t virtual_chapter_number, + unsigned int chapter_number, + unsigned int index_page_number, + unsigned int delta_list_number) +{ + size_t slot; + + map->last_update = virtual_chapter_number; + if (index_page_number == map->entries_per_chapter) { + return; + } + + slot = (chapter_number * map->entries_per_chapter) + index_page_number; + map->entries[slot] = delta_list_number; +} + +unsigned int find_index_page_number(const struct index_page_map *map, + const struct uds_chunk_name *name, + unsigned int chapter_number) +{ + unsigned int delta_list_number = + hash_to_chapter_delta_list(name, map->geometry); + unsigned int slot = chapter_number * map->entries_per_chapter; + unsigned int page; + + for (page = 0; page < map->entries_per_chapter; page++) { + if (delta_list_number <= map->entries[slot + page]) { + break; + } + } + + return page; +} + +void get_list_number_bounds(const struct index_page_map *map, + unsigned int chapter_number, + unsigned int index_page_number, + unsigned int *lowest_list, + unsigned int *highest_list) +{ + unsigned int slot = chapter_number * map->entries_per_chapter; + + *lowest_list = ((index_page_number == 0) ? + 0 : + map->entries[slot + index_page_number - 1] + 1); + *highest_list = ((index_page_number < map->entries_per_chapter) ? 
+ map->entries[slot + index_page_number] : + map->geometry->delta_lists_per_chapter - 1); +} + +uint64_t compute_index_page_map_save_size(const struct geometry *geometry) +{ + return (PAGE_MAP_MAGIC_LENGTH + sizeof(uint64_t) + + sizeof(uint16_t) * get_entry_count(geometry)); +} + +int write_index_page_map(struct index_page_map *map, + struct buffered_writer *writer) +{ + int result; + struct buffer *buffer; + + result = make_buffer(compute_index_page_map_save_size(map->geometry), + &buffer); + if (result != UDS_SUCCESS) { + return result; + } + + result = put_bytes(buffer, PAGE_MAP_MAGIC_LENGTH, PAGE_MAP_MAGIC); + if (result != UDS_SUCCESS) { + free_buffer(UDS_FORGET(buffer)); + return result; + } + + result = put_uint64_le_into_buffer(buffer, map->last_update); + if (result != UDS_SUCCESS) { + free_buffer(UDS_FORGET(buffer)); + return result; + } + + result = put_uint16_les_into_buffer(buffer, + get_entry_count(map->geometry), + map->entries); + if (result != UDS_SUCCESS) { + free_buffer(UDS_FORGET(buffer)); + return result; + } + + result = write_to_buffered_writer(writer, + get_buffer_contents(buffer), + content_length(buffer)); + free_buffer(UDS_FORGET(buffer)); + if (result != UDS_SUCCESS) { + return result; + } + + return flush_buffered_writer(writer); +} + +int read_index_page_map(struct index_page_map *map, + struct buffered_reader *reader) +{ + int result; + struct buffer *buffer; + byte magic[PAGE_MAP_MAGIC_LENGTH]; + + result = make_buffer(compute_index_page_map_save_size(map->geometry), + &buffer); + if (result != UDS_SUCCESS) { + return result; + } + + result = read_from_buffered_reader(reader, + get_buffer_contents(buffer), + buffer_length(buffer)); + if (result != UDS_SUCCESS) { + free_buffer(UDS_FORGET(buffer)); + return result; + } + + result = reset_buffer_end(buffer, buffer_length(buffer)); + if (result != UDS_SUCCESS) { + free_buffer(UDS_FORGET(buffer)); + return result; + } + + result = get_bytes_from_buffer(buffer, PAGE_MAP_MAGIC_LENGTH, &magic); + if (result != UDS_SUCCESS) { + free_buffer(UDS_FORGET(buffer)); + return result; + } + + if (memcmp(magic, PAGE_MAP_MAGIC, PAGE_MAP_MAGIC_LENGTH) != 0) { + free_buffer(UDS_FORGET(buffer)); + return UDS_CORRUPT_DATA; + } + + result = get_uint64_le_from_buffer(buffer, &map->last_update); + if (result != UDS_SUCCESS) { + free_buffer(UDS_FORGET(buffer)); + return result; + } + + result = get_uint16_les_from_buffer(buffer, + get_entry_count(map->geometry), + map->entries); + free_buffer(UDS_FORGET(buffer)); + if (result != UDS_SUCCESS) { + return result; + } + + uds_log_debug("read index page map, last update %llu", + (unsigned long long) map->last_update); + return UDS_SUCCESS; +} diff --git a/vdo/index-page-map.h b/vdo/index-page-map.h new file mode 100644 index 00000000..e86a16f6 --- /dev/null +++ b/vdo/index-page-map.h @@ -0,0 +1,51 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright Red Hat + */ + +#ifndef INDEX_PAGE_MAP_H +#define INDEX_PAGE_MAP_H 1 + +#include "buffered-reader.h" +#include "buffered-writer.h" +#include "common.h" +#include "geometry.h" + +struct index_page_map { + const struct geometry *geometry; + uint64_t last_update; + unsigned int entries_per_chapter; + uint16_t *entries; +}; + +int __must_check make_index_page_map(const struct geometry *geometry, + struct index_page_map **map_ptr); + +void free_index_page_map(struct index_page_map *map); + +int __must_check read_index_page_map(struct index_page_map *map, + struct buffered_reader *reader); + +int __must_check write_index_page_map(struct 
index_page_map *map, + struct buffered_writer *writer); + +void update_index_page_map(struct index_page_map *map, + uint64_t virtual_chapter_number, + unsigned int chapter_number, + unsigned int index_page_number, + unsigned int delta_list_number); + +unsigned int __must_check +find_index_page_number(const struct index_page_map *map, + const struct uds_chunk_name *name, + unsigned int chapter_number); + +void get_list_number_bounds(const struct index_page_map *map, + unsigned int chapter_number, + unsigned int index_page_number, + unsigned int *lowest_list, + unsigned int *highest_list); + +uint64_t compute_index_page_map_save_size(const struct geometry *geometry); + +#endif /* INDEX_PAGE_MAP_H */ diff --git a/vdo/index-session.c b/vdo/index-session.c new file mode 100644 index 00000000..10ca318a --- /dev/null +++ b/vdo/index-session.c @@ -0,0 +1,892 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright Red Hat + */ + +#include "index-session.h" + +#include + +#include "index.h" +#include "index-layout.h" +#include "logger.h" +#include "memory-alloc.h" +#include "request-queue.h" +#include "time-utils.h" + +/* + * The index session mediates all interactions with a UDS index. Once the + * session is created, it can be used to open, close, suspend, or recreate an + * index. The session contains a lock (the request_mutex) which ensures that + * only one thread can change the state of its index at a time. The state field + * indicates the current state of the index through a set of descriptive + * flags. The request_mutex must be notified whenever a non-transient state + * flag is cleared. The request_mutex is also used to count the number of + * requests currently in progress so that they can be drained when suspending + * or closing the index. + * + * If the index session is suspended shortly after opening an index, it may + * have to suspend during a rebuild. Depending on the size of the index, a + * rebuild may take a significant amount of time, so UDS allows the rebuild to + * be paused in order to suspend the session in a timely manner. When the index + * session is resumed, the rebuild can continue from where it left off. If the + * index session is shut down with a suspended rebuild, the rebuild progress is + * abandoned and the rebuild will start from the beginning the next time the + * index is loaded. The mutex and status fields in the index_load_context are + * used to record the state of any interrupted rebuild. + * + * If any deduplication request fails due to an internal error, the index is + * marked disabled. It will not accept any further requests and can only be + * closed. Closing the index will clear the disabled flag, and the index can + * then be reopened and recovered. + */ + +enum index_session_flag_bit { + IS_FLAG_BIT_START = 8, + /* The session has started loading an index but not completed it. */ + IS_FLAG_BIT_LOADING = IS_FLAG_BIT_START, + /* The session has loaded an index, which can handle requests. */ + IS_FLAG_BIT_LOADED, + /* The session's index has been permanently disabled. */ + IS_FLAG_BIT_DISABLED, + /* The session's index is suspended. */ + IS_FLAG_BIT_SUSPENDED, + /* The session is handling some index state change. */ + IS_FLAG_BIT_WAITING, + /* The session's index is closing and draining requests. */ + IS_FLAG_BIT_CLOSING, + /* The session is being destroyed and is draining requests. 
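+	 * Unlike the closing state, this one is terminal: once the drain
+	 * finishes, the index is saved and freed and the session structure
+	 * itself is released.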
*/ + IS_FLAG_BIT_DESTROYING, +}; + +enum index_session_flag { + IS_FLAG_LOADED = (1 << IS_FLAG_BIT_LOADED), + IS_FLAG_LOADING = (1 << IS_FLAG_BIT_LOADING), + IS_FLAG_DISABLED = (1 << IS_FLAG_BIT_DISABLED), + IS_FLAG_SUSPENDED = (1 << IS_FLAG_BIT_SUSPENDED), + IS_FLAG_WAITING = (1 << IS_FLAG_BIT_WAITING), + IS_FLAG_CLOSING = (1 << IS_FLAG_BIT_CLOSING), + IS_FLAG_DESTROYING = (1 << IS_FLAG_BIT_DESTROYING), +}; + +/* Release a reference to an index session. */ +static void release_index_session(struct uds_index_session *index_session) +{ + uds_lock_mutex(&index_session->request_mutex); + if (--index_session->request_count == 0) { + uds_broadcast_cond(&index_session->request_cond); + } + uds_unlock_mutex(&index_session->request_mutex); +} + +/* + * Acquire a reference to the index session for an asynchronous index request. + * The reference must eventually be released with a corresponding call to + * release_index_session(). + **/ +static int get_index_session(struct uds_index_session *index_session) +{ + unsigned int state; + int result = UDS_SUCCESS; + + uds_lock_mutex(&index_session->request_mutex); + index_session->request_count++; + state = index_session->state; + uds_unlock_mutex(&index_session->request_mutex); + + if (state == IS_FLAG_LOADED) { + return UDS_SUCCESS; + } else if (state & IS_FLAG_DISABLED) { + result = UDS_DISABLED; + } else if ((state & IS_FLAG_LOADING) || + (state & IS_FLAG_SUSPENDED) || + (state & IS_FLAG_WAITING)) { + result = -EBUSY; + } else { + result = UDS_NO_INDEX; + } + + release_index_session(index_session); + return result; +} + +int uds_start_chunk_operation(struct uds_request *request) +{ + size_t internal_size; + int result; + + if (request->callback == NULL) { + uds_log_error("missing required callback"); + return -EINVAL; + } + + switch (request->type) { + case UDS_DELETE: + case UDS_POST: + case UDS_QUERY: + case UDS_QUERY_NO_UPDATE: + case UDS_UPDATE: + break; + default: + uds_log_error("received invalid callback type"); + return -EINVAL; + } + + /* Reset all internal fields before processing. 
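+	 * Everything from the zone_number field onward is private to the
+	 * index and is cleared here so a reused request does not carry stale
+	 * state; the caller-supplied fields before it are left untouched.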
*/ + internal_size = sizeof(struct uds_request) - + offsetof(struct uds_request, zone_number); + // FIXME should be using struct_group for this instead + memset((char *) request + sizeof(*request) - internal_size, + 0, internal_size); + + result = get_index_session(request->session); + if (result != UDS_SUCCESS) { + return result; + } + + request->found = false; + request->unbatched = false; + request->index = request->session->index; + + enqueue_request(request, STAGE_TRIAGE); + return UDS_SUCCESS; +} + +static void enter_callback_stage(struct uds_request *request) +{ + if (request->status != UDS_SUCCESS) { + /* All request errors are considered unrecoverable */ + uds_lock_mutex(&request->session->request_mutex); + request->session->state |= IS_FLAG_DISABLED; + uds_unlock_mutex(&request->session->request_mutex); + } + + uds_request_queue_enqueue(request->session->callback_queue, request); +} + +static INLINE void count_once(uint64_t *count_ptr) +{ + WRITE_ONCE(*count_ptr, READ_ONCE(*count_ptr) + 1); +} + +static void update_session_stats(struct uds_request *request) +{ + struct session_stats *session_stats = &request->session->stats; + + count_once(&session_stats->requests); + + switch (request->type) { + case UDS_POST: + if (request->found) { + count_once(&session_stats->posts_found); + } else { + count_once(&session_stats->posts_not_found); + } + + if (request->location == UDS_LOCATION_IN_OPEN_CHAPTER) { + count_once(&session_stats->posts_found_open_chapter); + } else if (request->location == UDS_LOCATION_IN_DENSE) { + count_once(&session_stats->posts_found_dense); + } else if (request->location == UDS_LOCATION_IN_SPARSE) { + count_once(&session_stats->posts_found_sparse); + } + break; + + case UDS_UPDATE: + if (request->found) { + count_once(&session_stats->updates_found); + } else { + count_once(&session_stats->updates_not_found); + } + break; + + case UDS_DELETE: + if (request->found) { + count_once(&session_stats->deletions_found); + } else { + count_once(&session_stats->deletions_not_found); + } + break; + + case UDS_QUERY: + case UDS_QUERY_NO_UPDATE: + if (request->found) { + count_once(&session_stats->queries_found); + } else { + count_once(&session_stats->queries_not_found); + } + break; + + default: + request->status = ASSERT(false, + "unknown request type: %d", + request->type); + } +} + +static void handle_callbacks(struct uds_request *request) +{ + struct uds_index_session *index_session = request->session; + + if (request->status == UDS_SUCCESS) { + update_session_stats(request); + } + + request->status = uds_map_to_system_error(request->status); + request->callback(request); + release_index_session(index_session); +} + +static int __must_check +make_empty_index_session(struct uds_index_session **index_session_ptr) +{ + int result; + struct uds_index_session *session; + + result = UDS_ALLOCATE(1, struct uds_index_session, __func__, &session); + if (result != UDS_SUCCESS) { + return result; + } + + result = uds_init_mutex(&session->request_mutex); + if (result != UDS_SUCCESS) { + UDS_FREE(session); + return result; + } + + result = uds_init_cond(&session->request_cond); + if (result != UDS_SUCCESS) { + uds_destroy_mutex(&session->request_mutex); + UDS_FREE(session); + return result; + } + + result = uds_init_mutex(&session->load_context.mutex); + if (result != UDS_SUCCESS) { + uds_destroy_cond(&session->request_cond); + uds_destroy_mutex(&session->request_mutex); + UDS_FREE(session); + return result; + } + + result = uds_init_cond(&session->load_context.cond); + if (result 
!= UDS_SUCCESS) { + uds_destroy_mutex(&session->load_context.mutex); + uds_destroy_cond(&session->request_cond); + uds_destroy_mutex(&session->request_mutex); + UDS_FREE(session); + return result; + } + + result = make_uds_request_queue("callbackW", + &handle_callbacks, + &session->callback_queue); + if (result != UDS_SUCCESS) { + uds_destroy_cond(&session->load_context.cond); + uds_destroy_mutex(&session->load_context.mutex); + uds_destroy_cond(&session->request_cond); + uds_destroy_mutex(&session->request_mutex); + UDS_FREE(session); + return result; + } + + *index_session_ptr = session; + return UDS_SUCCESS; +} + +int uds_create_index_session(struct uds_index_session **session) +{ + if (session == NULL) { + uds_log_error("missing session pointer"); + return -EINVAL; + } + + return uds_map_to_system_error(make_empty_index_session(session)); +} + +static int __must_check +start_loading_index_session(struct uds_index_session *index_session) +{ + int result; + + uds_lock_mutex(&index_session->request_mutex); + if (index_session->state & IS_FLAG_SUSPENDED) { + uds_log_info("Index session is suspended"); + result = -EBUSY; + } else if (index_session->state != 0) { + uds_log_info("Index is already loaded"); + result = -EBUSY; + } else { + index_session->state |= IS_FLAG_LOADING; + result = UDS_SUCCESS; + } + uds_unlock_mutex(&index_session->request_mutex); + return result; +} + +static void +finish_loading_index_session(struct uds_index_session *index_session, + int result) +{ + uds_lock_mutex(&index_session->request_mutex); + index_session->state &= ~IS_FLAG_LOADING; + if (result == UDS_SUCCESS) { + index_session->state |= IS_FLAG_LOADED; + } + + uds_broadcast_cond(&index_session->request_cond); + uds_unlock_mutex(&index_session->request_mutex); +} + +static int initialize_index_session(struct uds_index_session *index_session, + enum uds_open_index_type open_type) +{ + int result; + struct configuration *config; + + result = make_configuration(&index_session->parameters, &config); + if (result != UDS_SUCCESS) { + uds_log_error_strerror(result, "Failed to allocate config"); + return result; + } + + memset(&index_session->stats, 0, sizeof(index_session->stats)); + result = make_index(config, + open_type, + &index_session->load_context, + enter_callback_stage, + &index_session->index); + if (result != UDS_SUCCESS) { + uds_log_error_strerror(result, "Failed to make index"); + } else { + log_uds_configuration(config); + } + + free_configuration(config); + return result; +} + +static const char *get_open_type_string(enum uds_open_index_type open_type) +{ + switch (open_type) { + case UDS_CREATE: + return "creating index"; + case UDS_LOAD: + return "loading or rebuilding index"; + case UDS_NO_REBUILD: + return "loading index"; + default: + return "unknown open method"; + } +} + +/* + * Open an index under the given session. This operation will fail if the + * index session is suspended, or if there is already an open index. 
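+ *
+ * A minimal calling sketch (illustrative only; the device name is made up,
+ * other parameters are left at defaults, and error handling is omitted):
+ *
+ *   struct uds_index_session *session;
+ *   struct uds_parameters params = { .name = "/dev/sda3" };
+ *
+ *   uds_create_index_session(&session);
+ *   uds_open_index(UDS_CREATE, &params, session);
+ *   ...
+ *   uds_close_index(session);
+ *   uds_destroy_index_session(session);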
+ */ +int uds_open_index(enum uds_open_index_type open_type, + const struct uds_parameters *parameters, + struct uds_index_session *session) +{ + int result; + + if (parameters == NULL) { + uds_log_error("missing required parameters"); + return -EINVAL; + } + if (parameters->name == NULL) { + uds_log_error("missing required index name"); + return -EINVAL; + } + if (session == NULL) { + uds_log_error("missing required session pointer"); + return -EINVAL; + } + + result = start_loading_index_session(session); + if (result != UDS_SUCCESS) { + return uds_map_to_system_error(result); + } + + if ((session->parameters.name == NULL) || + (strcmp(parameters->name, session->parameters.name) != 0)) { + char *new_name; + + result = uds_duplicate_string(parameters->name, + "device name", + &new_name); + if (result != UDS_SUCCESS) { + finish_loading_index_session(session, result); + return uds_map_to_system_error(result); + } + + uds_free_const(session->parameters.name); + session->parameters = *parameters; + session->parameters.name = new_name; + } else { + const char *old_name = session->parameters.name; + + session->parameters = *parameters; + session->parameters.name = old_name; + } + + uds_log_notice("%s: %s", + get_open_type_string(open_type), + parameters->name); + result = initialize_index_session(session, open_type); + if (result != UDS_SUCCESS) { + uds_log_error_strerror(result, + "Failed %s", + get_open_type_string(open_type)); + } + + finish_loading_index_session(session, result); + return uds_map_to_system_error(result); +} + +static void +wait_for_no_requests_in_progress(struct uds_index_session *index_session) +{ + uds_lock_mutex(&index_session->request_mutex); + while (index_session->request_count > 0) { + uds_wait_cond(&index_session->request_cond, + &index_session->request_mutex); + } + uds_unlock_mutex(&index_session->request_mutex); +} + +static int __must_check uds_save_index(struct uds_index_session *index_session) +{ + wait_for_no_requests_in_progress(index_session); + return save_index(index_session->index); +} + +static void suspend_rebuild(struct uds_index_session *session) +{ + uds_lock_mutex(&session->load_context.mutex); + switch (session->load_context.status) { + case INDEX_OPENING: + session->load_context.status = INDEX_SUSPENDING; + + /* Wait until the index indicates that it is not replaying. */ + while ((session->load_context.status != INDEX_SUSPENDED) && + (session->load_context.status != INDEX_READY)) { + uds_wait_cond(&session->load_context.cond, + &session->load_context.mutex); + } + break; + + case INDEX_READY: + /* Index load does not need to be suspended. */ + break; + + case INDEX_SUSPENDED: + case INDEX_SUSPENDING: + case INDEX_FREEING: + default: + /* These cases should not happen. */ + ASSERT_LOG_ONLY(false, + "Bad load context state %u", + session->load_context.status); + break; + } + uds_unlock_mutex(&session->load_context.mutex); +} + +/* + * Suspend index operation, draining all current index requests and + * preventing new index requests from starting. Optionally saves all index + * data before returning. + */ +int uds_suspend_index_session(struct uds_index_session *session, bool save) +{ + int result = UDS_SUCCESS; + bool no_work = false; + bool rebuilding = false; + + /* Wait for any current index state change to complete. 
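+	 * Only a close in progress is waited out; a state change that has
+	 * already set IS_FLAG_WAITING or IS_FLAG_DESTROYING makes this call
+	 * fail with -EBUSY, and an already-suspended session simply returns
+	 * success.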
*/ + uds_lock_mutex(&session->request_mutex); + while (session->state & IS_FLAG_CLOSING) { + uds_wait_cond(&session->request_cond, &session->request_mutex); + } + + if ((session->state & IS_FLAG_WAITING) || + (session->state & IS_FLAG_DESTROYING)) { + no_work = true; + uds_log_info("Index session is already changing state"); + result = -EBUSY; + } else if (session->state & IS_FLAG_SUSPENDED) { + no_work = true; + } else if (session->state & IS_FLAG_LOADING) { + session->state |= IS_FLAG_WAITING; + rebuilding = true; + } else if (session->state & IS_FLAG_LOADED) { + session->state |= IS_FLAG_WAITING; + } else { + no_work = true; + session->state |= IS_FLAG_SUSPENDED; + uds_broadcast_cond(&session->request_cond); + } + uds_unlock_mutex(&session->request_mutex); + + if (no_work) { + return uds_map_to_system_error(result); + } + + if (rebuilding) { + suspend_rebuild(session); + } else if (save) { + result = uds_save_index(session); + } else { + result = uds_flush_index_session(session); + } + + uds_lock_mutex(&session->request_mutex); + session->state &= ~IS_FLAG_WAITING; + session->state |= IS_FLAG_SUSPENDED; + uds_broadcast_cond(&session->request_cond); + uds_unlock_mutex(&session->request_mutex); + return uds_map_to_system_error(result); +} + +static int replace_device(struct uds_index_session *session, + const char *name) +{ + int result; + char *new_name; + + result = uds_duplicate_string(name, "device name", &new_name); + if (result != UDS_SUCCESS) { + return result; + } + + result = replace_index_storage(session->index, name); + if (result != UDS_SUCCESS) { + UDS_FREE(new_name); + return result; + } + + uds_free_const(session->parameters.name); + session->parameters.name = new_name; + return UDS_SUCCESS; +} + +/* + * Resume index operation after being suspended. If the index is suspended + * and the supplied name is different from the current backing store, the + * index will start using the new backing store. + */ +int uds_resume_index_session(struct uds_index_session *session, + const char *name) +{ + int result = UDS_SUCCESS; + bool no_work = false; + bool resume_replay = false; + + uds_lock_mutex(&session->request_mutex); + if (session->state & IS_FLAG_WAITING) { + uds_log_info("Index session is already changing state"); + no_work = true; + result = -EBUSY; + } else if (!(session->state & IS_FLAG_SUSPENDED)) { + /* If not suspended, just succeed. */ + no_work = true; + result = UDS_SUCCESS; + } else { + session->state |= IS_FLAG_WAITING; + if (session->state & IS_FLAG_LOADING) { + resume_replay = true; + } + } + uds_unlock_mutex(&session->request_mutex); + + if (no_work) { + return result; + } + + if ((name != NULL) && (session->index != NULL) && + (strcmp(name, session->parameters.name) != 0)) { + result = replace_device(session, name); + if (result != UDS_SUCCESS) { + uds_lock_mutex(&session->request_mutex); + session->state &= ~IS_FLAG_WAITING; + uds_broadcast_cond(&session->request_cond); + uds_unlock_mutex(&session->request_mutex); + return uds_map_to_system_error(result); + } + } + + if (resume_replay) { + uds_lock_mutex(&session->load_context.mutex); + switch (session->load_context.status) { + case INDEX_SUSPENDED: + session->load_context.status = INDEX_OPENING; + /* Notify the index to start replaying again. */ + uds_broadcast_cond(&session->load_context.cond); + break; + + case INDEX_READY: + /* There is no index rebuild to resume. */ + break; + + case INDEX_OPENING: + case INDEX_SUSPENDING: + case INDEX_FREEING: + default: + /* These cases should not happen; do nothing. 
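+			 * A rebuild can only be resumed from INDEX_SUSPENDED
+			 * (or skipped when INDEX_READY), so any other status
+			 * here indicates a bookkeeping error; it is logged
+			 * but otherwise ignored.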
*/ + ASSERT_LOG_ONLY(false, + "Bad load context state %u", + session->load_context.status); + break; + } + uds_unlock_mutex(&session->load_context.mutex); + } + + uds_lock_mutex(&session->request_mutex); + session->state &= ~IS_FLAG_WAITING; + session->state &= ~IS_FLAG_SUSPENDED; + uds_broadcast_cond(&session->request_cond); + uds_unlock_mutex(&session->request_mutex); + return UDS_SUCCESS; +} + +static int save_and_free_index(struct uds_index_session *index_session) +{ + int result = UDS_SUCCESS; + bool suspended; + struct uds_index *index = index_session->index; + + if (index == NULL) { + return UDS_SUCCESS; + } + + uds_lock_mutex(&index_session->request_mutex); + suspended = (index_session->state & IS_FLAG_SUSPENDED); + uds_unlock_mutex(&index_session->request_mutex); + + if (!suspended) { + result = save_index(index); + if (result != UDS_SUCCESS) { + uds_log_warning_strerror(result, + "ignoring error from save_index"); + } + } + free_index(index); + index_session->index = NULL; + + /* + * Reset all index state that happens to be in the index + * session, so it doesn't affect any future index. + */ + uds_lock_mutex(&index_session->load_context.mutex); + index_session->load_context.status = INDEX_OPENING; + uds_unlock_mutex(&index_session->load_context.mutex); + + uds_lock_mutex(&index_session->request_mutex); + /* Only the suspend bit will remain relevant. */ + index_session->state &= IS_FLAG_SUSPENDED; + uds_unlock_mutex(&index_session->request_mutex); + + return result; +} + +/* Save and close the current index. */ +int uds_close_index(struct uds_index_session *index_session) +{ + int result = UDS_SUCCESS; + + /* Wait for any current index state change to complete. */ + uds_lock_mutex(&index_session->request_mutex); + while ((index_session->state & IS_FLAG_WAITING) || + (index_session->state & IS_FLAG_CLOSING)) { + uds_wait_cond(&index_session->request_cond, + &index_session->request_mutex); + } + + if (index_session->state & IS_FLAG_SUSPENDED) { + uds_log_info("Index session is suspended"); + result = -EBUSY; + } else if ((index_session->state & IS_FLAG_DESTROYING) || + !(index_session->state & IS_FLAG_LOADED)) { + /* + * The index doesn't exist, hasn't finished loading, or is + * being destroyed. + */ + result = UDS_NO_INDEX; + } else { + index_session->state |= IS_FLAG_CLOSING; + } + uds_unlock_mutex(&index_session->request_mutex); + if (result != UDS_SUCCESS) { + return uds_map_to_system_error(result); + } + + uds_log_debug("Closing index"); + wait_for_no_requests_in_progress(index_session); + result = save_and_free_index(index_session); + uds_log_debug("Closed index"); + + uds_lock_mutex(&index_session->request_mutex); + index_session->state &= ~IS_FLAG_CLOSING; + uds_broadcast_cond(&index_session->request_cond); + uds_unlock_mutex(&index_session->request_mutex); + return uds_map_to_system_error(result); +} + +/* This will save and close an open index before destroying the session. */ +int uds_destroy_index_session(struct uds_index_session *index_session) +{ + int result; + bool load_pending = false; + + uds_log_debug("Destroying index session"); + + /* Wait for any current index state change to complete. 
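+	 * As in uds_close_index(), destruction waits for any in-flight
+	 * suspend, resume, or close to finish before marking the session as
+	 * destroying.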
*/ + uds_lock_mutex(&index_session->request_mutex); + while ((index_session->state & IS_FLAG_WAITING) || + (index_session->state & IS_FLAG_CLOSING)) { + uds_wait_cond(&index_session->request_cond, + &index_session->request_mutex); + } + + if (index_session->state & IS_FLAG_DESTROYING) { + uds_unlock_mutex(&index_session->request_mutex); + uds_log_info("Index session is already closing"); + return -EBUSY; + } + + index_session->state |= IS_FLAG_DESTROYING; + load_pending = ((index_session->state & IS_FLAG_LOADING) && + (index_session->state & IS_FLAG_SUSPENDED)); + uds_unlock_mutex(&index_session->request_mutex); + + if (load_pending) { + /* Tell the index to terminate the rebuild. */ + uds_lock_mutex(&index_session->load_context.mutex); + if (index_session->load_context.status == INDEX_SUSPENDED) { + index_session->load_context.status = INDEX_FREEING; + uds_broadcast_cond(&index_session->load_context.cond); + } + uds_unlock_mutex(&index_session->load_context.mutex); + + /* Wait until the load exits before proceeding. */ + uds_lock_mutex(&index_session->request_mutex); + while (index_session->state & IS_FLAG_LOADING) { + uds_wait_cond(&index_session->request_cond, + &index_session->request_mutex); + } + uds_unlock_mutex(&index_session->request_mutex); + } + + wait_for_no_requests_in_progress(index_session); + result = save_and_free_index(index_session); + uds_free_const(index_session->parameters.name); + uds_request_queue_finish(index_session->callback_queue); + index_session->callback_queue = NULL; + uds_destroy_cond(&index_session->load_context.cond); + uds_destroy_mutex(&index_session->load_context.mutex); + uds_destroy_cond(&index_session->request_cond); + uds_destroy_mutex(&index_session->request_mutex); + uds_log_debug("Destroyed index session"); + UDS_FREE(index_session); + return uds_map_to_system_error(result); +} + +/* Wait until all callbacks for index operations are complete. */ +int uds_flush_index_session(struct uds_index_session *index_session) +{ + wait_for_no_requests_in_progress(index_session); + wait_for_idle_index(index_session->index); + return UDS_SUCCESS; +} + +/* + * Return the most recent parameters used to open an index. The caller is + * responsible for freeing the returned structure. + */ +int uds_get_index_parameters(struct uds_index_session *index_session, + struct uds_parameters **parameters) +{ + int result; + const char *name = index_session->parameters.name; + + if (parameters == NULL) { + uds_log_error("received a NULL parameters pointer"); + return -EINVAL; + } + + if (name != NULL) { + char *name_copy = NULL; + size_t name_length = strlen(name) + 1; + struct uds_parameters *copy; + + result = UDS_ALLOCATE_EXTENDED(struct uds_parameters, + name_length, + char, + __func__, + ©); + if (result != UDS_SUCCESS) { + return uds_map_to_system_error(result); + } + + *copy = index_session->parameters; + name_copy = (char *) copy + sizeof(struct uds_parameters); + memcpy(name_copy, name, name_length); + copy->name = name_copy; + *parameters = copy; + return UDS_SUCCESS; + } + + result = UDS_ALLOCATE(1, struct uds_parameters, __func__, parameters); + if (result == UDS_SUCCESS) { + **parameters = index_session->parameters; + } + + return uds_map_to_system_error(result); +} + +/* Statistics collection is intended to be thread-safe. 
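+ * Each counter is read with READ_ONCE() to pair with the WRITE_ONCE()
+ * updates in count_once(), so each value is individually coherent even
+ * though the set of counters is not sampled atomically.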
*/ +static void collect_stats(const struct uds_index_session *index_session, + struct uds_index_stats *stats) +{ + const struct session_stats *session_stats = &index_session->stats; + + stats->current_time = + ktime_to_seconds(current_time_ns(CLOCK_REALTIME)); + stats->posts_found = READ_ONCE(session_stats->posts_found); + stats->in_memory_posts_found = + READ_ONCE(session_stats->posts_found_open_chapter); + stats->dense_posts_found = READ_ONCE(session_stats->posts_found_dense); + stats->sparse_posts_found = + READ_ONCE(session_stats->posts_found_sparse); + stats->posts_not_found = READ_ONCE(session_stats->posts_not_found); + stats->updates_found = READ_ONCE(session_stats->updates_found); + stats->updates_not_found = READ_ONCE(session_stats->updates_not_found); + stats->deletions_found = READ_ONCE(session_stats->deletions_found); + stats->deletions_not_found = + READ_ONCE(session_stats->deletions_not_found); + stats->queries_found = READ_ONCE(session_stats->queries_found); + stats->queries_not_found = READ_ONCE(session_stats->queries_not_found); + stats->requests = READ_ONCE(session_stats->requests); +} + +int uds_get_index_stats(struct uds_index_session *index_session, + struct uds_index_stats *stats) +{ + if (stats == NULL) { + uds_log_error("received a NULL index stats pointer"); + return -EINVAL; + } + + collect_stats(index_session, stats); + if (index_session->index != NULL) { + get_index_stats(index_session->index, stats); + } else { + stats->entries_indexed = 0; + stats->memory_used = 0; + stats->collisions = 0; + stats->entries_discarded = 0; + } + + return UDS_SUCCESS; +} diff --git a/vdo/index-session.h b/vdo/index-session.h new file mode 100644 index 00000000..cd314b2f --- /dev/null +++ b/vdo/index-session.h @@ -0,0 +1,74 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright Red Hat + */ + +#ifndef INDEX_SESSION_H +#define INDEX_SESSION_H + +#include + +#include "config.h" +#include "cpu.h" +#include "uds-threads.h" +#include "uds.h" + +struct __attribute__((aligned(CACHE_LINE_BYTES))) session_stats { + /* Post requests that found an entry */ + uint64_t posts_found; + /* Post requests found in the open chapter */ + uint64_t posts_found_open_chapter; + /* Post requests found in the dense index */ + uint64_t posts_found_dense; + /* Post requests found in the sparse index */ + uint64_t posts_found_sparse; + /* Post requests that did not find an entry */ + uint64_t posts_not_found; + /* Update requests that found an entry */ + uint64_t updates_found; + /* Update requests that did not find an entry */ + uint64_t updates_not_found; + /* Delete requests that found an entry */ + uint64_t deletions_found; + /* Delete requests that did not find an entry */ + uint64_t deletions_not_found; + /* Query requests that found an entry */ + uint64_t queries_found; + /* Query requests that did not find an entry */ + uint64_t queries_not_found; + /* Total number of requests */ + uint64_t requests; +}; + +enum index_suspend_status { + /* An index load has started but the index is not ready for use. */ + INDEX_OPENING = 0, + /* The index is able to handle requests. */ + INDEX_READY, + /* The index is attempting to suspend a rebuild. */ + INDEX_SUSPENDING, + /* An index rebuild has been suspended. */ + INDEX_SUSPENDED, + /* An index rebuild is being stopped in order to shut down. 
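+	 * A loader that was suspended in this state abandons the rebuild
+	 * rather than waiting to resume it.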
*/ + INDEX_FREEING, +}; + +struct index_load_context { + struct mutex mutex; + struct cond_var cond; + enum index_suspend_status status; +}; + +struct uds_index_session { + unsigned int state; + struct uds_index *index; + struct uds_request_queue *callback_queue; + struct uds_parameters parameters; + struct index_load_context load_context; + struct mutex request_mutex; + struct cond_var request_cond; + int request_count; + struct session_stats stats; +}; + +#endif /* INDEX_SESSION_H */ diff --git a/vdo/index.c b/vdo/index.c new file mode 100644 index 00000000..8f9de9ff --- /dev/null +++ b/vdo/index.c @@ -0,0 +1,1618 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright Red Hat + */ + + +#include "index.h" + +#include "hash-utils.h" +#include "logger.h" +#include "request-queue.h" +#include "sparse-cache.h" + +static const uint64_t NO_LAST_SAVE = UINT64_MAX; + +/* + * When searching for deduplication records, the index first searches the + * volume index, and then searches the chapter index for the relevant + * chapter. If the chapter has been fully committed to storage, the chapter + * pages are loaded into the page cache. If the chapter has not yet been + * committed (either the open chapter or a recently closed one), the index + * searches the in-memory representation of the chapter. Finally, if the volume + * index does not find a record and the index is sparse, the index will search + * the sparse cache. + * + * The index send two kinds of messages to coordinate between zones: chapter + * close messages for the chapter writer, and sparse cache barrier messages for + * the sparse cache. + * + * The chapter writer is responsible for committing chapters of records to + * storage. Since zones can get different numbers of records, some zones may + * fall behind others. Each time a zone fills up its available space in a + * chapter, it informs the chapter writer that the chapter is complete, and + * also informs all other zones that it has closed the chapter. Each other zone + * will then close the chapter immediately, regardless of how full it is, in + * order to minimize skew between zones. Once every zone has closed the + * chapter, the chapter writer will commit that chapter to storage. + * + * The last zone to close the chapter also removes the oldest chapter from the + * volume index. Although that chapter is invalid for zones that have moved on, + * the existence of the open chapter means that those zones will never ask the + * volume index about it. No zone is allowed to get more than one chapter + * ahead of any other. If a zone is so far ahead that it tries to close another + * chapter before the previous one has been closed by all zones, it is forced + * to wait. + * + * The sparse cache relies on having the same set of chapter indexes available + * to all zones. When a request wants to add a chapter to the sparse cache, it + * sends a barrier message to each zone during the triage stage that acts as a + * rendezvous. Once every zone has reached the barrier and paused its + * operations, the cache membership is changed and each zone is then informed + * that it can proceed. More details can be found in the sparse cache + * documentation. + * + * If a sparse cache has only one zone, it will not create a triage queue, but + * it still needs the barrier message to change the sparse cache membership, + * so the index simulates the message by invoking the handler directly. 
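+ *
+ * In outline (matching the handlers below): when a zone fills its open
+ * chapter, it enqueues a UDS_MESSAGE_ANNOUNCE_CHAPTER_CLOSED request for
+ * every other zone, while the triage stage enqueues a
+ * UDS_MESSAGE_SPARSE_CACHE_BARRIER request for every zone when a request
+ * needs a chapter added to the sparse cache.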
+ */ + +struct chapter_writer { + /* The index to which we belong */ + struct uds_index *index; + /* The thread to do the writing */ + struct thread *thread; + /* The lock protecting the following fields */ + struct mutex mutex; + /* The condition signalled on state changes */ + struct cond_var cond; + /* Set to true to stop the thread */ + bool stop; + /* The result from the most recent write */ + int result; + /* The number of bytes allocated by the chapter writer */ + size_t memory_allocated; + /* The number of zones which have submitted a chapter for writing */ + unsigned int zones_to_write; + /* Open chapter index used by close_open_chapter() */ + struct open_chapter_index *open_chapter_index; + /* Collated records used by close_open_chapter() */ + struct uds_chunk_record *collated_records; + /* The chapters to write (one per zone) */ + struct open_chapter_zone *chapters[]; +}; + +static bool is_zone_chapter_sparse(const struct index_zone *zone, + uint64_t virtual_chapter) +{ + return is_chapter_sparse(zone->index->volume->geometry, + zone->oldest_virtual_chapter, + zone->newest_virtual_chapter, + virtual_chapter); +} + +static int launch_zone_message(struct uds_zone_message message, + unsigned int zone, + struct uds_index *index) +{ + int result; + struct uds_request *request; + + result = UDS_ALLOCATE(1, struct uds_request, __func__, &request); + if (result != UDS_SUCCESS) { + return result; + } + + request->index = index; + request->unbatched = true; + request->zone_number = zone; + request->zone_message = message; + + enqueue_request(request, STAGE_MESSAGE); + return UDS_SUCCESS; +} + +static void enqueue_barrier_messages(struct uds_index *index, + uint64_t virtual_chapter) +{ + struct uds_zone_message message = { + .type = UDS_MESSAGE_SPARSE_CACHE_BARRIER, + .virtual_chapter = virtual_chapter, + }; + unsigned int zone; + + for (zone = 0; zone < index->zone_count; zone++) { + int result = launch_zone_message(message, zone, index); + + ASSERT_LOG_ONLY((result == UDS_SUCCESS), + "barrier message allocation"); + } +} + +/* + * Determine whether this request should trigger a sparse cache barrier message + * to change the membership of the sparse cache. If a change in membership is + * desired, the function returns the chapter number to add. + */ +static uint64_t triage_index_request(struct uds_index *index, + struct uds_request *request) +{ + uint64_t virtual_chapter; + struct index_zone *zone; + + virtual_chapter = lookup_volume_index_name(index->volume_index, + &request->chunk_name); + if (virtual_chapter == UINT64_MAX) { + return UINT64_MAX; + } + + zone = index->zones[request->zone_number]; + if (!is_zone_chapter_sparse(zone, virtual_chapter)) { + return UINT64_MAX; + } + + /* + * FIXME: Optimize for a common case by remembering the chapter from + * the most recent barrier message and skipping this chapter if is it + * the same. + */ + + return virtual_chapter; +} + +/* + * Simulate a message to change the sparse cache membership for a single-zone + * sparse index. This allows us to forgo the complicated locking required by a + * multi-zone sparse index. Any other kind of index does nothing here. 
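+ * With only one zone there is nothing to rendezvous with, so the cache
+ * membership is updated directly on this thread instead of going through
+ * the barrier message machinery.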
+ */ +static int simulate_index_zone_barrier_message(struct index_zone *zone, + struct uds_request *request) +{ + uint64_t sparse_virtual_chapter; + + if ((zone->index->zone_count > 1) || + !is_sparse_geometry(zone->index->volume->geometry)) { + return UDS_SUCCESS; + } + + sparse_virtual_chapter = triage_index_request(zone->index, request); + if (sparse_virtual_chapter == UINT64_MAX) { + return UDS_SUCCESS; + } + + return update_sparse_cache(zone, sparse_virtual_chapter); +} + +/* This is the request processing function for the triage queue. */ +static void triage_request(struct uds_request *request) +{ + struct uds_index *index = request->index; + uint64_t sparse_virtual_chapter = triage_index_request(index, request); + + if (sparse_virtual_chapter != UINT64_MAX) { + enqueue_barrier_messages(index, sparse_virtual_chapter); + } + + enqueue_request(request, STAGE_INDEX); +} + +static int finish_previous_chapter(struct uds_index *index, + uint64_t current_chapter_number) +{ + int result; + struct chapter_writer *writer = index->chapter_writer; + + uds_lock_mutex(&writer->mutex); + while (index->newest_virtual_chapter < current_chapter_number) { + uds_wait_cond(&writer->cond, &writer->mutex); + } + result = writer->result; + uds_unlock_mutex(&writer->mutex); + + if (result != UDS_SUCCESS) { + return uds_log_error_strerror(result, + "Writing of previous open chapter failed"); + } + + return UDS_SUCCESS; +} + +static int swap_open_chapter(struct index_zone *zone) +{ + int result; + struct open_chapter_zone *temporary_chapter; + + result = finish_previous_chapter(zone->index, + zone->newest_virtual_chapter); + if (result != UDS_SUCCESS) { + return result; + } + + temporary_chapter = zone->open_chapter; + zone->open_chapter = zone->writing_chapter; + zone->writing_chapter = temporary_chapter; + return UDS_SUCCESS; +} + +static void reap_oldest_chapter(struct index_zone *zone) +{ + set_volume_index_zone_open_chapter(zone->index->volume_index, + zone->id, + zone->newest_virtual_chapter); +} + +/* + * Inform the chapter writer that this zone is done with this chapter. The + * chapter won't start writing until all zones have closed it. 
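+ * The return value is the number of zones that have submitted their copy of
+ * this chapter so far; when it reaches the zone count, the writer thread can
+ * commit the chapter to storage.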
+ */ +static unsigned int start_closing_chapter(struct uds_index *index, + unsigned int zone_number, + struct open_chapter_zone *chapter) +{ + unsigned int finished_zones; + struct chapter_writer *writer = index->chapter_writer; + + uds_lock_mutex(&writer->mutex); + finished_zones = ++writer->zones_to_write; + writer->chapters[zone_number] = chapter; + uds_broadcast_cond(&writer->cond); + uds_unlock_mutex(&writer->mutex); + + return finished_zones; +} + +static int announce_chapter_closed(struct index_zone *zone, + uint64_t closed_chapter) +{ + int result; + unsigned int i; + struct uds_zone_message zone_message = { + .type = UDS_MESSAGE_ANNOUNCE_CHAPTER_CLOSED, + .virtual_chapter = closed_chapter, + }; + + for (i = 0; i < zone->index->zone_count; i++) { + if (zone->id == i) { + continue; + } + + result = launch_zone_message(zone_message, i, zone->index); + if (result != UDS_SUCCESS) { + return result; + } + } + + return UDS_SUCCESS; +} + +static int open_next_chapter(struct index_zone *zone) +{ + int result; + uint64_t closed_chapter; + uint64_t expiring; + unsigned int finished_zones; + unsigned int expire_chapters; + + uds_log_debug("closing chapter %llu of zone %u after %u entries (%u short)", + (unsigned long long) zone->newest_virtual_chapter, + zone->id, + zone->open_chapter->size, + zone->open_chapter->capacity - zone->open_chapter->size); + + result = swap_open_chapter(zone); + if (result != UDS_SUCCESS) { + return result; + } + + closed_chapter = zone->newest_virtual_chapter++; + reap_oldest_chapter(zone); + reset_open_chapter(zone->open_chapter); + + finished_zones = start_closing_chapter(zone->index, + zone->id, + zone->writing_chapter); + if ((finished_zones == 1) && (zone->index->zone_count > 1)) { + result = announce_chapter_closed(zone, closed_chapter); + if (result != UDS_SUCCESS) { + return result; + } + } + + expiring = zone->oldest_virtual_chapter; + expire_chapters = chapters_to_expire(zone->index->volume->geometry, + zone->newest_virtual_chapter); + zone->oldest_virtual_chapter += expire_chapters; + + if (finished_zones < zone->index->zone_count) { + return UDS_SUCCESS; + } + + while ((expire_chapters-- > 0) && (result == UDS_SUCCESS)) { + result = forget_chapter(zone->index->volume, expiring++); + } + + return result; +} + +static int handle_chapter_closed(struct index_zone *zone, + uint64_t virtual_chapter) +{ + if (zone->newest_virtual_chapter == virtual_chapter) { + return open_next_chapter(zone); + } + + return UDS_SUCCESS; +} + +static int dispatch_index_zone_control_request(struct uds_request *request) +{ + struct uds_zone_message *message = &request->zone_message; + struct index_zone *zone = request->index->zones[request->zone_number]; + + switch (message->type) { + case UDS_MESSAGE_SPARSE_CACHE_BARRIER: + return update_sparse_cache(zone, message->virtual_chapter); + + case UDS_MESSAGE_ANNOUNCE_CHAPTER_CLOSED: + return handle_chapter_closed(zone, message->virtual_chapter); + + default: + uds_log_error("invalid message type: %d", message->type); + return UDS_INVALID_ARGUMENT; + } +} + +static void set_request_location(struct uds_request *request, + enum uds_index_region new_location) +{ + request->location = new_location; + request->found = ((new_location == UDS_LOCATION_IN_OPEN_CHAPTER) || + (new_location == UDS_LOCATION_IN_DENSE) || + (new_location == UDS_LOCATION_IN_SPARSE)); +} + +static void set_chapter_location(struct uds_request *request, + const struct index_zone *zone, + uint64_t virtual_chapter) +{ + request->found = true; + if (virtual_chapter == 
zone->newest_virtual_chapter) { + request->location = UDS_LOCATION_IN_OPEN_CHAPTER; + } else if (is_zone_chapter_sparse(zone, virtual_chapter)) { + request->location = UDS_LOCATION_IN_SPARSE; + } else { + request->location = UDS_LOCATION_IN_DENSE; + } +} + +static int search_sparse_cache_in_zone(struct index_zone *zone, + struct uds_request *request, + uint64_t virtual_chapter, + bool *found) +{ + int result; + struct volume *volume; + int record_page_number; + unsigned int chapter; + + result = search_sparse_cache(zone, + &request->chunk_name, + &virtual_chapter, + &record_page_number); + if ((result != UDS_SUCCESS) || (virtual_chapter == UINT64_MAX)) { + return result; + } + + request->virtual_chapter = virtual_chapter; + volume = zone->index->volume; + chapter = map_to_physical_chapter(volume->geometry, virtual_chapter); + return search_cached_record_page(volume, + request, + &request->chunk_name, + chapter, + record_page_number, + &request->old_metadata, + found); +} + +static int get_record_from_zone(struct index_zone *zone, + struct uds_request *request, + bool *found) +{ + struct volume *volume; + + if (request->location == UDS_LOCATION_RECORD_PAGE_LOOKUP) { + *found = true; + return UDS_SUCCESS; + } else if (request->location == UDS_LOCATION_UNAVAILABLE) { + *found = false; + return UDS_SUCCESS; + } + + if (request->virtual_chapter == zone->newest_virtual_chapter) { + search_open_chapter(zone->open_chapter, + &request->chunk_name, + &request->old_metadata, + found); + return UDS_SUCCESS; + } + + if ((zone->newest_virtual_chapter > 0) && + (request->virtual_chapter == (zone->newest_virtual_chapter - 1)) && + (zone->writing_chapter->size > 0)) { + search_open_chapter(zone->writing_chapter, + &request->chunk_name, + &request->old_metadata, + found); + return UDS_SUCCESS; + } + + volume = zone->index->volume; + if (is_zone_chapter_sparse(zone, request->virtual_chapter) && + sparse_cache_contains(volume->sparse_cache, + request->virtual_chapter, + request->zone_number)) { + return search_sparse_cache_in_zone(zone, + request, + request->virtual_chapter, + found); + } + + return search_volume_page_cache(volume, + request, + &request->chunk_name, + request->virtual_chapter, + &request->old_metadata, + found); +} + +static int put_record_in_zone(struct index_zone *zone, + struct uds_request *request, + const struct uds_chunk_data *metadata) +{ + int result; + unsigned int remaining; + + result = put_open_chapter(zone->open_chapter, + &request->chunk_name, + metadata, + &remaining); + if (result != UDS_SUCCESS) { + return result; + } + + if (remaining == 0) { + return open_next_chapter(zone); + } + + return UDS_SUCCESS; +} + +static int search_index_zone(struct index_zone *zone, + struct uds_request *request) +{ + int result; + struct volume_index_record record; + bool overflow_record, found = false; + struct uds_chunk_data *metadata; + uint64_t chapter; + + result = get_volume_index_record(zone->index->volume_index, + &request->chunk_name, + &record); + if (result != UDS_SUCCESS) { + return result; + } + + if (record.is_found) { + if (request->requeued && + request->virtual_chapter != record.virtual_chapter) { + set_request_location(request, UDS_LOCATION_UNKNOWN); + } + + request->virtual_chapter = record.virtual_chapter; + result = get_record_from_zone(zone, request, &found); + if (result != UDS_SUCCESS) { + return result; + } + } + + if (found) { + set_chapter_location(request, zone, record.virtual_chapter); + } + + /* + * If a record has overflowed a chapter index in more than one 
chapter + * (or overflowed in one chapter and collided with an existing record), + * it will exist as a collision record in the volume index, but + * we won't find it in the volume. This case needs special handling. + */ + overflow_record = (record.is_found && record.is_collision && !found); + chapter = zone->newest_virtual_chapter; + if (found || overflow_record) { + if ((request->type == UDS_QUERY_NO_UPDATE) || + ((request->type == UDS_QUERY) && overflow_record)) { + /* There is nothing left to do. */ + return UDS_SUCCESS; + } + + if (record.virtual_chapter != chapter) { + /* + * Update the volume index to reference the new chapter + * for the block. If the record had been deleted or + * dropped from the chapter index, it will be back. + */ + result = set_volume_index_record_chapter(&record, + chapter); + } else if (request->type != UDS_UPDATE) { + /* The record is already in the open chapter. */ + return UDS_SUCCESS; + } + } else { + /* + * The record wasn't in the volume index, so check whether the + * name is in a cached sparse chapter. If we found the name on + * a previous search, use that result instead. + */ + if (request->location == UDS_LOCATION_RECORD_PAGE_LOOKUP) { + found = true; + } else if (request->location == UDS_LOCATION_UNAVAILABLE) { + found = false; + } else if (is_sparse_geometry(zone->index->volume->geometry) && + !is_volume_index_sample(zone->index->volume_index, + &request->chunk_name)) { + result = search_sparse_cache_in_zone(zone, + request, + UINT64_MAX, + &found); + if (result != UDS_SUCCESS) { + return result; + } + } + + if (found) { + set_request_location(request, UDS_LOCATION_IN_SPARSE); + } + + if ((request->type == UDS_QUERY_NO_UPDATE) || + ((request->type == UDS_QUERY) && !found)) { + /* There is nothing left to do. */ + return UDS_SUCCESS; + } + + /* + * Add a new entry to the volume index referencing the open + * chapter. This needs to be done both for new records, and for + * records from cached sparse chapters. + */ + result = put_volume_index_record(&record, chapter); + } + + if (result == UDS_OVERFLOW) { + /* + * The volume index encountered a delta list overflow. The + * condition was already logged. We will go on without adding + * the chunk to the open chapter. + */ + return UDS_SUCCESS; + } + + if (result != UDS_SUCCESS) { + return result; + } + + if (!found || (request->type == UDS_UPDATE)) { + /* This is a new record or we're updating an existing record. */ + metadata = &request->new_metadata; + } else { + /* Move the existing record to the open chapter. */ + metadata = &request->old_metadata; + } + return put_record_in_zone(zone, request, metadata); +} + +static int remove_from_index_zone(struct index_zone *zone, + struct uds_request *request) +{ + int result; + struct volume_index_record record; + + result = get_volume_index_record(zone->index->volume_index, + &request->chunk_name, + &record); + if (result != UDS_SUCCESS) { + return result; + } + + if (!record.is_found) { + return UDS_SUCCESS; + } + + /* + * If the request was requeued, check whether the saved state is still + * valid. + */ + + if (record.is_collision) { + set_chapter_location(request, zone, record.virtual_chapter); + } else { + /* + * Non-collision records are hints, so resolve the name in the + * chapter. 
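+		 * If the name is not actually present in that chapter, the
+		 * volume index hit was for a different name that mapped to
+		 * the same entry, and there is nothing to remove.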
+ */ + bool found; + + if (request->requeued && + request->virtual_chapter != record.virtual_chapter) { + set_request_location(request, UDS_LOCATION_UNKNOWN); + } + + request->virtual_chapter = record.virtual_chapter; + result = get_record_from_zone(zone, request, &found); + if (result != UDS_SUCCESS) { + return result; + } + + if (!found) { + /* There is no record to remove. */ + return UDS_SUCCESS; + } + } + + set_chapter_location(request, zone, record.virtual_chapter); + + /* + * Delete the volume index entry for the named record only. Note that a + * later search might later return stale advice if there is a colliding + * name in the same chapter, but it's a very rare case (1 in 2^21). + */ + result = remove_volume_index_record(&record); + if (result != UDS_SUCCESS) { + return result; + } + + /* + * If the record is in the open chapter, we must remove it or mark it + * deleted to avoid trouble if the record is added again later. + */ + if (request->location == UDS_LOCATION_IN_OPEN_CHAPTER) { + remove_from_open_chapter(zone->open_chapter, + &request->chunk_name); + } + + return UDS_SUCCESS; +} + +static int dispatch_index_request(struct uds_index *index, + struct uds_request *request) +{ + int result; + struct index_zone *zone = index->zones[request->zone_number]; + + if (!request->requeued) { + result = simulate_index_zone_barrier_message(zone, request); + if (result != UDS_SUCCESS) { + return result; + } + } + + switch (request->type) { + case UDS_POST: + case UDS_UPDATE: + case UDS_QUERY: + case UDS_QUERY_NO_UPDATE: + result = search_index_zone(zone, request); + break; + + case UDS_DELETE: + result = remove_from_index_zone(zone, request); + break; + + default: + result = uds_log_warning_strerror(UDS_INVALID_ARGUMENT, + "invalid request type: %d", + request->type); + break; + } + + return result; +} + +/* This is the request processing function invoked by each zone's thread. */ +static void execute_zone_request(struct uds_request *request) +{ + int result; + struct uds_index *index = request->index; + + if (request->zone_message.type != UDS_MESSAGE_NONE) { + result = dispatch_index_zone_control_request(request); + if (result != UDS_SUCCESS) { + uds_log_error_strerror(result, + "error executing message: %d", + request->zone_message.type); + } + + /* Once the message is processed it can be freed. */ + UDS_FREE(UDS_FORGET(request)); + return; + } + + index->need_to_save = true; + if (request->requeued && (request->status != UDS_SUCCESS)) { + set_request_location(request, UDS_LOCATION_UNAVAILABLE); + index->callback(request); + return; + } + + result = dispatch_index_request(index, request); + if (result == UDS_QUEUED) { + /* The request has been requeued so don't let it complete. */ + return; + } + + if (!request->found) { + set_request_location(request, UDS_LOCATION_UNAVAILABLE); + } + + request->status = result; + index->callback(request); +} + +static int initialize_index_queues(struct uds_index *index, + const struct geometry *geometry) +{ + int result; + unsigned int i; + + for (i = 0; i < index->zone_count; i++) { + result = make_uds_request_queue("indexW", + &execute_zone_request, + &index->zone_queues[i]); + if (result != UDS_SUCCESS) { + return result; + } + } + + /* The triage queue is only needed for sparse multi-zone indexes. 
*/ + if ((index->zone_count > 1) && is_sparse_geometry(geometry)) { + result = make_uds_request_queue("triageW", + &triage_request, + &index->triage_queue); + if (result != UDS_SUCCESS) { + return result; + } + } + + return UDS_SUCCESS; +} + +/* This is the driver function for the chapter writer thread. */ +static void close_chapters(void *arg) +{ + int result; + struct chapter_writer *writer = arg; + struct uds_index *index = writer->index; + + uds_log_debug("chapter writer starting"); + uds_lock_mutex(&writer->mutex); + for (;;) { + while (writer->zones_to_write < index->zone_count) { + if (writer->stop && (writer->zones_to_write == 0)) { + /* + * We've been told to stop, and all of the + * zones are in the same open chapter, so we + * can exit now. + */ + uds_unlock_mutex(&writer->mutex); + uds_log_debug("chapter writer stopping"); + return; + } + uds_wait_cond(&writer->cond, &writer->mutex); + } + + /* + * Release the lock while closing a chapter. We probably don't + * need to do this, but it seems safer in principle. It's OK to + * access the chapter and chapter_number fields without the + * lock since those aren't allowed to change until we're done. + */ + uds_unlock_mutex(&writer->mutex); + + if (index->has_saved_open_chapter) { + /* + * Remove the saved open chapter the first time we + * close an open chapter after loading from a clean + * shutdown, or after doing a clean save. The lack of + * the saved open chapter will indicate that a recovery + * is necessary. + */ + index->has_saved_open_chapter = false; + result = discard_open_chapter(index->layout); + if (result == UDS_SUCCESS) { + uds_log_debug("Discarding saved open chapter"); + } + } + + result = close_open_chapter(writer->chapters, + index->zone_count, + index->volume, + writer->open_chapter_index, + writer->collated_records, + index->newest_virtual_chapter); + + uds_lock_mutex(&writer->mutex); + index->newest_virtual_chapter++; + index->oldest_virtual_chapter += + chapters_to_expire(index->volume->geometry, + index->newest_virtual_chapter); + writer->result = result; + writer->zones_to_write = 0; + uds_broadcast_cond(&writer->cond); + } +} + +static void stop_chapter_writer(struct chapter_writer *writer) +{ + struct thread *writer_thread = 0; + + uds_lock_mutex(&writer->mutex); + if (writer->thread != 0) { + writer_thread = writer->thread; + writer->thread = 0; + writer->stop = true; + uds_broadcast_cond(&writer->cond); + } + uds_unlock_mutex(&writer->mutex); + + if (writer_thread != 0) { + uds_join_threads(writer_thread); + } +} + +static void free_chapter_writer(struct chapter_writer *writer) +{ + if (writer == NULL) { + return; + } + + stop_chapter_writer(writer); + uds_destroy_mutex(&writer->mutex); + uds_destroy_cond(&writer->cond); + free_open_chapter_index(writer->open_chapter_index); + UDS_FREE(writer->collated_records); + UDS_FREE(writer); +} + +static int make_chapter_writer(struct uds_index *index, + struct chapter_writer **writer_ptr) +{ + int result; + struct chapter_writer *writer; + size_t collated_records_size = + (sizeof(struct uds_chunk_record) * + (1 + index->volume->geometry->records_per_chapter)); + + result = UDS_ALLOCATE_EXTENDED(struct chapter_writer, + index->zone_count, + struct open_chapter_zone *, + "Chapter Writer", + &writer); + if (result != UDS_SUCCESS) { + return result; + } + + writer->index = index; + result = uds_init_mutex(&writer->mutex); + if (result != UDS_SUCCESS) { + UDS_FREE(writer); + return result; + } + + result = uds_init_cond(&writer->cond); + if (result != UDS_SUCCESS) { + 
uds_destroy_mutex(&writer->mutex); + UDS_FREE(writer); + return result; + } + + result = uds_allocate_cache_aligned(collated_records_size, + "collated records", + &writer->collated_records); + if (result != UDS_SUCCESS) { + free_chapter_writer(writer); + return result; + } + + result = make_open_chapter_index(&writer->open_chapter_index, + index->volume->geometry, + index->volume->nonce); + if (result != UDS_SUCCESS) { + free_chapter_writer(writer); + return result; + } + + writer->memory_allocated = + (sizeof(struct chapter_writer) + + index->zone_count * sizeof(struct open_chapter_zone *) + + collated_records_size + + writer->open_chapter_index->memory_allocated); + + result = uds_create_thread(close_chapters, writer, "writer", + &writer->thread); + if (result != UDS_SUCCESS) { + free_chapter_writer(writer); + return result; + } + + *writer_ptr = writer; + return UDS_SUCCESS; +} + +static int load_index(struct uds_index *index) +{ + int result; + uint64_t last_save_chapter; + + result = load_index_state(index->layout, index); + if (result != UDS_SUCCESS) { + return UDS_INDEX_NOT_SAVED_CLEANLY; + } + + last_save_chapter = + ((index->last_save != NO_LAST_SAVE) ? index->last_save : 0); + + uds_log_info("loaded index from chapter %llu through chapter %llu", + (unsigned long long) index->oldest_virtual_chapter, + (unsigned long long) last_save_chapter); + + return UDS_SUCCESS; +} + +static int rebuild_index_page_map(struct uds_index *index, uint64_t vcn) +{ + int result; + struct delta_index_page *chapter_index_page; + struct geometry *geometry = index->volume->geometry; + unsigned int chapter = map_to_physical_chapter(geometry, vcn); + unsigned int expected_list_number = 0; + unsigned int index_page_number; + unsigned int lowest_delta_list; + unsigned int highest_delta_list; + + for (index_page_number = 0; + index_page_number < geometry->index_pages_per_chapter; + index_page_number++) { + result = get_volume_page(index->volume, + chapter, + index_page_number, + NULL, + &chapter_index_page); + if (result != UDS_SUCCESS) { + return uds_log_error_strerror(result, + "failed to read index page %u in chapter %u", + index_page_number, + chapter); + } + + lowest_delta_list = chapter_index_page->lowest_list_number; + highest_delta_list = chapter_index_page->highest_list_number; + if (lowest_delta_list != expected_list_number) { + return uds_log_error_strerror(UDS_CORRUPT_DATA, + "chapter %u index page %u is corrupt", + chapter, + index_page_number); + } + + update_index_page_map(index->volume->index_page_map, + vcn, + chapter, + index_page_number, + highest_delta_list); + expected_list_number = highest_delta_list + 1; + } + + return UDS_SUCCESS; +} + +static int replay_record(struct uds_index *index, + const struct uds_chunk_name *name, + uint64_t virtual_chapter, + bool will_be_sparse_chapter) +{ + int result; + struct volume_index_record record; + bool update_record; + + if (will_be_sparse_chapter && + !is_volume_index_sample(index->volume_index, name)) { + /* + * This entry will be in a sparse chapter after the rebuild + * completes, and it is not a sample, so just skip over it. + */ + return UDS_SUCCESS; + } + + result = get_volume_index_record(index->volume_index, name, &record); + if (result != UDS_SUCCESS) { + return result; + } + + if (record.is_found) { + if (record.is_collision) { + if (record.virtual_chapter == virtual_chapter) { + /* The record is already correct. 
*/ + return UDS_SUCCESS; + } + + update_record = true; + } else if (record.virtual_chapter == virtual_chapter) { + /* + * There is a volume index entry pointing to the + * current chapter, but we don't know if it is for the + * same name as the one we are currently working on or + * not. For now, we're just going to assume that it + * isn't. This will create one extra collision record + * if there was a deleted record in the current + * chapter. + */ + update_record = false; + } else { + /* + * If we're rebuilding, we don't normally want to go to + * disk to see if the record exists, since we will + * likely have just read the record from disk (i.e. we + * know it's there). The exception to this is when we + * find an entry in the volume index that has a + * different chapter. In this case, we need to search + * that chapter to determine if the volume index entry + * was for the same record or a different one. + */ + result = search_volume_page_cache(index->volume, + NULL, + name, + record.virtual_chapter, + NULL, + &update_record); + if (result != UDS_SUCCESS) { + return result; + } + } + } else { + update_record = false; + } + + if (update_record) { + /* + * Update the volume index to reference the new chapter for the + * block. If the record had been deleted or dropped from the + * chapter index, it will be back. + */ + result = set_volume_index_record_chapter(&record, + virtual_chapter); + } else { + /* + * Add a new entry to the volume index referencing the open + * chapter. This should be done regardless of whether we are a + * brand new record or a sparse record, i.e. one that doesn't + * exist in the index but does on disk, since for a sparse + * record, we would want to un-sparsify if it did exist. + */ + result = put_volume_index_record(&record, virtual_chapter); + } + + if ((result == UDS_DUPLICATE_NAME) || (result == UDS_OVERFLOW)) { + /* The rebuilt index will lose these records. */ + return UDS_SUCCESS; + } + + return result; +} + +static bool check_for_suspend(struct uds_index *index) +{ + bool closing; + + if (index->load_context == NULL) { + return false; + } + + uds_lock_mutex(&index->load_context->mutex); + if (index->load_context->status != INDEX_SUSPENDING) { + uds_unlock_mutex(&index->load_context->mutex); + return false; + } + + /* Notify that we are suspended and wait for the resume. 
*/ + index->load_context->status = INDEX_SUSPENDED; + uds_broadcast_cond(&index->load_context->cond); + + while ((index->load_context->status != INDEX_OPENING) && + (index->load_context->status != INDEX_FREEING)) { + uds_wait_cond(&index->load_context->cond, + &index->load_context->mutex); + } + + closing = (index->load_context->status == INDEX_FREEING); + uds_unlock_mutex(&index->load_context->mutex); + return closing; +} + +static int +replay_chapter(struct uds_index *index, uint64_t virtual, bool sparse) +{ + int result; + unsigned int i; + unsigned int j; + const struct geometry *geometry; + unsigned int physical_chapter; + unsigned int first_page; + + if (check_for_suspend(index)) { + uds_log_info("Replay interrupted by index shutdown at chapter %llu", + (unsigned long long) virtual); + return -EBUSY; + } + + geometry = index->volume->geometry; + physical_chapter = map_to_physical_chapter(geometry, virtual); + first_page = map_to_physical_page(geometry, physical_chapter, 0); + prefetch_volume_pages(&index->volume->volume_store, + first_page, + geometry->pages_per_chapter); + set_volume_index_open_chapter(index->volume_index, virtual); + + result = rebuild_index_page_map(index, virtual); + if (result != UDS_SUCCESS) { + return uds_log_error_strerror(result, + "could not rebuild index page map for chapter %u", + physical_chapter); + } + + for (i = 0; i < geometry->record_pages_per_chapter; i++) { + byte *record_page; + unsigned int record_page_number; + + record_page_number = geometry->index_pages_per_chapter + i; + result = get_volume_page(index->volume, + physical_chapter, + record_page_number, + &record_page, + NULL); + if (result != UDS_SUCCESS) { + return uds_log_error_strerror(result, + "could not get page %d", + record_page_number); + } + + for (j = 0; j < geometry->records_per_page; j++) { + const byte *name_bytes; + struct uds_chunk_name name; + + name_bytes = record_page + (j * BYTES_PER_RECORD); + memcpy(&name.name, name_bytes, UDS_CHUNK_NAME_SIZE); + result = replay_record(index, &name, virtual, sparse); + if (result != UDS_SUCCESS) { + return result; + } + } + } + + return UDS_SUCCESS; +} + +static int replay_volume(struct uds_index *index) +{ + int result; + uint64_t old_map_update; + uint64_t new_map_update; + uint64_t virtual; + uint64_t from_virtual = index->oldest_virtual_chapter; + uint64_t upto_virtual = index->newest_virtual_chapter; + bool will_be_sparse; + + uds_log_info("Replaying volume from chapter %llu through chapter %llu", + (unsigned long long) from_virtual, + (unsigned long long) upto_virtual); + + /* + * The index failed to load, so the volume index is empty. Add records + * to the volume index in order, skipping non-hooks in chapters which + * will be sparse to save time. + * + * Go through each record page of each chapter and add the records back + * to the volume index. This should not cause anything to be written to + * either the open chapter or the on-disk volume. Also skip the on-disk + * chapter corresponding to upto_virtual, as this would have already + * been purged from the volume index when the chapter was opened. + * + * Also, go through each index page for each chapter and rebuild the + * index page map. 
+ */ + old_map_update = index->volume->index_page_map->last_update; + for (virtual = from_virtual; virtual < upto_virtual; ++virtual) { + will_be_sparse = is_chapter_sparse(index->volume->geometry, + from_virtual, + upto_virtual, + virtual); + result = replay_chapter(index, virtual, will_be_sparse); + if (result != UDS_SUCCESS) { + return result; + } + } + + /* Also reap the chapter being replaced by the open chapter. */ + set_volume_index_open_chapter(index->volume_index, upto_virtual); + + new_map_update = index->volume->index_page_map->last_update; + if (new_map_update != old_map_update) { + uds_log_info("replay changed index page map update from %llu to %llu", + (unsigned long long) old_map_update, + (unsigned long long) new_map_update); + } + + return UDS_SUCCESS; +} + +static int rebuild_index(struct uds_index *index) +{ + int result; + uint64_t lowest; + uint64_t highest; + bool is_empty = false; + unsigned int chapters_per_volume = + index->volume->geometry->chapters_per_volume; + + index->volume->lookup_mode = LOOKUP_FOR_REBUILD; + result = find_volume_chapter_boundaries(index->volume, + &lowest, + &highest, + &is_empty); + if (result != UDS_SUCCESS) { + return uds_log_fatal_strerror(result, + "cannot rebuild index: unknown volume chapter boundaries"); + } + + if (lowest > highest) { + uds_log_fatal("cannot rebuild index: no valid chapters exist"); + return UDS_CORRUPT_DATA; + } + + if (is_empty) { + index->newest_virtual_chapter = 0; + index->oldest_virtual_chapter = 0; + set_volume_index_open_chapter(index->volume_index, 0); + index->volume->lookup_mode = LOOKUP_NORMAL; + return UDS_SUCCESS; + } + + if ((highest - lowest) >= chapters_per_volume) { + return uds_log_fatal_strerror(UDS_CORRUPT_DATA, + "cannot rebuild index: volume chapter boundaries too large"); + } + + index->newest_virtual_chapter = highest + 1; + index->oldest_virtual_chapter = lowest; + if (index->newest_virtual_chapter == + (index->oldest_virtual_chapter + chapters_per_volume)) { + /* Skip the chapter shadowed by the open chapter. 
*/ + index->oldest_virtual_chapter++; + } + + result = replay_volume(index); + if (result != UDS_SUCCESS) { + return result; + } + + index->volume->lookup_mode = LOOKUP_NORMAL; + return UDS_SUCCESS; +} + +static void free_index_zone(struct index_zone *zone) +{ + if (zone == NULL) { + return; + } + + free_open_chapter(zone->open_chapter); + free_open_chapter(zone->writing_chapter); + UDS_FREE(zone); +} + +static int make_index_zone(struct uds_index *index, unsigned int zone_number) +{ + int result; + struct index_zone *zone; + + result = UDS_ALLOCATE(1, struct index_zone, "index zone", &zone); + if (result != UDS_SUCCESS) { + return result; + } + + result = make_open_chapter(index->volume->geometry, + index->zone_count, + &zone->open_chapter); + if (result != UDS_SUCCESS) { + free_index_zone(zone); + return result; + } + + result = make_open_chapter(index->volume->geometry, + index->zone_count, + &zone->writing_chapter); + if (result != UDS_SUCCESS) { + free_index_zone(zone); + return result; + } + + zone->index = index; + zone->id = zone_number; + index->zones[zone_number] = zone; + + return UDS_SUCCESS; +} + +int make_index(struct configuration *config, + enum uds_open_index_type open_type, + struct index_load_context *load_context, + index_callback_t callback, + struct uds_index **new_index) +{ + int result; + bool loaded = false; + bool new = (open_type == UDS_CREATE); + struct uds_index *index = NULL; + struct index_zone *zone; + uint64_t nonce; + unsigned int z; + + result = UDS_ALLOCATE_EXTENDED(struct uds_index, + config->zone_count, + struct uds_request_queue *, + "index", + &index); + if (result != UDS_SUCCESS) { + return result; + } + + index->zone_count = config->zone_count; + + result = make_uds_index_layout(config, new, &index->layout); + if (result != UDS_SUCCESS) { + free_index(index); + return result; + } + + result = UDS_ALLOCATE(index->zone_count, + struct index_zone *, + "zones", + &index->zones); + if (result != UDS_SUCCESS) { + free_index(index); + return result; + } + + result = make_volume(config, index->layout, &index->volume); + if (result != UDS_SUCCESS) { + free_index(index); + return result; + } + + index->volume->lookup_mode = LOOKUP_NORMAL; + for (z = 0; z < index->zone_count; z++) { + result = make_index_zone(index, z); + if (result != UDS_SUCCESS) { + free_index(index); + return uds_log_error_strerror(result, + "Could not create index zone"); + } + } + + nonce = get_uds_volume_nonce(index->layout); + result = make_volume_index(config, nonce, &index->volume_index); + if (result != UDS_SUCCESS) { + free_index(index); + return uds_log_error_strerror(result, + "could not make volume index"); + } + + index->load_context = load_context; + index->callback = callback; + + result = initialize_index_queues(index, config->geometry); + if (result != UDS_SUCCESS) { + free_index(index); + return result; + } + + result = make_chapter_writer(index, &index->chapter_writer); + if (result != UDS_SUCCESS) { + free_index(index); + return result; + } + + if (new) { + discard_index_state_data(index->layout); + } else { + result = load_index(index); + switch (result) { + case UDS_SUCCESS: + loaded = true; + break; + case -ENOMEM: + /* We should not try a rebuild for this error. 
*/ + uds_log_error_strerror(result, + "index could not be loaded"); + break; + default: + uds_log_error_strerror(result, + "index could not be loaded"); + if (open_type == UDS_LOAD) { + result = rebuild_index(index); + if (result != UDS_SUCCESS) { + uds_log_error_strerror(result, + "index could not be rebuilt"); + } + } + break; + } + } + + if (result != UDS_SUCCESS) { + free_index(index); + return uds_log_error_strerror(result, + "fatal error in make_index"); + } + + for (z = 0; z < index->zone_count; z++) { + zone = index->zones[z]; + zone->oldest_virtual_chapter = index->oldest_virtual_chapter; + zone->newest_virtual_chapter = index->newest_virtual_chapter; + } + + if (index->load_context != NULL) { + uds_lock_mutex(&index->load_context->mutex); + index->load_context->status = INDEX_READY; + /* + * If we get here, suspend is meaningless, but notify any + * thread trying to suspend us so it doesn't hang. + */ + uds_broadcast_cond(&index->load_context->cond); + uds_unlock_mutex(&index->load_context->mutex); + } + + index->has_saved_open_chapter = loaded; + index->need_to_save = !loaded; + *new_index = index; + return UDS_SUCCESS; +} + +void free_index(struct uds_index *index) +{ + unsigned int i; + + if (index == NULL) { + return; + } + + uds_request_queue_finish(index->triage_queue); + for (i = 0; i < index->zone_count; i++) { + uds_request_queue_finish(index->zone_queues[i]); + } + + free_chapter_writer(index->chapter_writer); + + free_volume_index(index->volume_index); + if (index->zones != NULL) { + for (i = 0; i < index->zone_count; i++) { + free_index_zone(index->zones[i]); + } + UDS_FREE(index->zones); + } + + free_volume(index->volume); + free_uds_index_layout(UDS_FORGET(index->layout)); + UDS_FREE(index); +} + +/* Wait for the chapter writer to complete any outstanding writes. */ +void wait_for_idle_index(struct uds_index *index) +{ + struct chapter_writer *writer = index->chapter_writer; + + uds_lock_mutex(&writer->mutex); + while (writer->zones_to_write > 0) { + uds_wait_cond(&writer->cond, &writer->mutex); + } + uds_unlock_mutex(&writer->mutex); +} + +/* This function assumes that all requests have been drained. */ +int save_index(struct uds_index *index) +{ + int result; + + if (!index->need_to_save) { + return UDS_SUCCESS; + } + + wait_for_idle_index(index); + index->prev_save = index->last_save; + index->last_save = ((index->newest_virtual_chapter == 0) ? + NO_LAST_SAVE : + index->newest_virtual_chapter - 1); + uds_log_info("beginning save (vcn %llu)", + (unsigned long long) index->last_save); + + result = save_index_state(index->layout, index); + if (result != UDS_SUCCESS) { + uds_log_info("save index failed"); + index->last_save = index->prev_save; + } else { + index->has_saved_open_chapter = true; + index->need_to_save = false; + uds_log_info("finished save (vcn %llu)", + (unsigned long long) index->last_save); + } + + return result; +} + +int replace_index_storage(struct uds_index *index, const char *path) +{ + return replace_volume_storage(index->volume, index->layout, path); +} + +/* Accessing statistics should be safe from any thread. 
*/ +void get_index_stats(struct uds_index *index, struct uds_index_stats *counters) +{ + struct volume_index_stats dense_stats; + struct volume_index_stats sparse_stats; + + get_volume_index_stats(index->volume_index, + &dense_stats, + &sparse_stats); + + counters->entries_indexed = + dense_stats.record_count + sparse_stats.record_count; + counters->memory_used = + ((uint64_t) dense_stats.memory_allocated + + (uint64_t) sparse_stats.memory_allocated + + (uint64_t) get_cache_size(index->volume) + + index->chapter_writer->memory_allocated); + counters->collisions = + (dense_stats.collision_count + sparse_stats.collision_count); + counters->entries_discarded = + (dense_stats.discard_count + sparse_stats.discard_count); +} + +void enqueue_request(struct uds_request *request, enum request_stage stage) +{ + struct uds_index *index = request->index; + struct uds_request_queue *queue; + + switch (stage) { + case STAGE_TRIAGE: + if (index->triage_queue != NULL) { + queue = index->triage_queue; + break; + } + + fallthrough; + + case STAGE_INDEX: + request->zone_number = + get_volume_index_zone(index->volume_index, + &request->chunk_name); + fallthrough; + + case STAGE_MESSAGE: + queue = index->zone_queues[request->zone_number]; + break; + + default: + ASSERT_LOG_ONLY(false, "invalid index stage: %d", stage); + return; + } + + uds_request_queue_enqueue(queue, request); +} diff --git a/vdo/index.h b/vdo/index.h new file mode 100644 index 00000000..3f199acd --- /dev/null +++ b/vdo/index.h @@ -0,0 +1,75 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright Red Hat + */ + +#ifndef INDEX_H +#define INDEX_H + +#include "index-layout.h" +#include "index-session.h" +#include "open-chapter.h" +#include "volume.h" +#include "volume-index-ops.h" + + +typedef void (*index_callback_t)(struct uds_request *request); + +struct index_zone { + struct uds_index *index; + struct open_chapter_zone *open_chapter; + struct open_chapter_zone *writing_chapter; + uint64_t oldest_virtual_chapter; + uint64_t newest_virtual_chapter; + unsigned int id; +}; + +struct uds_index { + bool has_saved_open_chapter; + bool need_to_save; + struct index_load_context *load_context; + struct index_layout *layout; + struct volume_index *volume_index; + struct volume *volume; + unsigned int zone_count; + struct index_zone **zones; + + uint64_t oldest_virtual_chapter; + uint64_t newest_virtual_chapter; + + uint64_t last_save; + uint64_t prev_save; + struct chapter_writer *chapter_writer; + + index_callback_t callback; + struct uds_request_queue *triage_queue; + struct uds_request_queue *zone_queues[]; +}; + +enum request_stage { + STAGE_TRIAGE, + STAGE_INDEX, + STAGE_MESSAGE, +}; + +int __must_check make_index(struct configuration *config, + enum uds_open_index_type open_type, + struct index_load_context *load_context, + index_callback_t callback, + struct uds_index **new_index); + +int __must_check save_index(struct uds_index *index); + +void free_index(struct uds_index *index); + +int __must_check replace_index_storage(struct uds_index *index, + const char *path); + +void get_index_stats(struct uds_index *index, + struct uds_index_stats *counters); + +void enqueue_request(struct uds_request *request, enum request_stage stage); + +void wait_for_idle_index(struct uds_index *index); + +#endif /* INDEX_H */ diff --git a/vdo/instanceNumber.c b/vdo/instance-number.c similarity index 54% rename from vdo/instanceNumber.c rename to vdo/instance-number.c index 13b01994..03c77edf 100644 --- a/vdo/instanceNumber.c +++ b/vdo/instance-number.c 
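The request interface declared in index.h is driven by filling in a struct uds_request and handing it to enqueue_request(); a zone thread later reports completion through the index_callback_t supplied to make_index(). The sketch below shows that flow using only the functions declared above and the request fields referenced by the zone code; the configuration argument, the my_callback/post_one_name names, and the simplified request setup and waiting are illustrative assumptions rather than part of this patch.

static void my_callback(struct uds_request *request)
{
	/* Invoked on a zone thread via index->callback when processing ends. */
	if (request->status != UDS_SUCCESS) {
		uds_log_info("request failed: %d", request->status);
	}
}

static int post_one_name(struct configuration *config,
			 const struct uds_chunk_name *name)
{
	int result;
	struct uds_index *index;
	/* Zero-initialized: no zone control message, not requeued. */
	struct uds_request request = { .type = UDS_POST };

	result = make_index(config, UDS_CREATE, NULL, my_callback, &index);
	if (result != UDS_SUCCESS) {
		return result;
	}

	request.chunk_name = *name;
	request.index = index;
	/* STAGE_TRIAGE falls through to a zone queue if no triage queue exists. */
	enqueue_request(&request, STAGE_TRIAGE);

	/* ... wait for my_callback before tearing down ... */
	result = save_index(index);
	free_index(index);
	return result;
}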
@@ -1,31 +1,15 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * Copyright Red Hat - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/sulfur/src/c++/vdo/base/instanceNumber.c#10 $ */ -#include "instanceNumber.h" +#include "instance-number.h" #include #include -#include "memoryAlloc.h" -#include "numUtils.h" +#include "memory-alloc.h" +#include "num-utils.h" #include "permassert.h" /* @@ -56,27 +40,29 @@ static unsigned int instance_count; static unsigned int next_instance; /** - * Return the number of bytes needed to store a bit array of the specified - * capacity in an array of unsigned longs. + * get_bit_array_size() - Return the number of bytes needed to store a + * bit array of the specified capacity in an + * array of unsigned longs. + * @bit_count: The number of bits the array must hold. * - * @param bit_count The number of bits the array must hold - * - * @return the number of bytes needed for the array reperesentation - **/ + * Return: the number of bytes needed for the array reperesentation. + */ static size_t get_bit_array_size(unsigned int bit_count) { - // Round up to a multiple of the word size and convert to a byte count. - return (compute_bucket_count(bit_count, BITS_PER_LONG) * + /* Round up to a multiple of the word size and convert to a byte count. */ + return (DIV_ROUND_UP(bit_count, BITS_PER_LONG) * sizeof(unsigned long)); } /** - * Re-allocate the bitmap word array so there will more instance numbers that - * can be allocated. Since the array is initially NULL, this also initializes - * the array the first time we allocate an instance number. + * grow_bit_array() - Re-allocate the bitmap word array so there will + * more instance numbers that can be allocated. * - * @return UDS_SUCCESS or an error code from the allocation - **/ + * Since the array is initially NULL, this also initializes the array + * the first time we allocate an instance number. + * + * Return: UDS_SUCCESS or an error code from the allocation + */ static int grow_bit_array(void) { unsigned int new_count = max(bit_count + BIT_COUNT_INCREMENT, @@ -96,11 +82,10 @@ static int grow_bit_array(void) return UDS_SUCCESS; } -/**********************************************************************/ -static int allocate_vdo_instance_locked(unsigned int *instance_ptr) +static int vdo_allocate_instance_locked(unsigned int *instance_ptr) { unsigned int instance; - // If there are no unallocated instances, grow the bit array. + /* If there are no unallocated instances, grow the bit array. */ if (instance_count >= bit_count) { int result = grow_bit_array(); @@ -109,13 +94,17 @@ static int allocate_vdo_instance_locked(unsigned int *instance_ptr) } } - // There must be a zero bit somewhere now. Find it, starting just after - // the last instance allocated. + /* + * There must be a zero bit somewhere now. 
Find it, starting just after + * the last instance allocated. + */ instance = find_next_zero_bit(words, bit_count, next_instance); if (instance >= bit_count) { int result; - // Nothing free after next_instance, so wrap around to instance - // zero. + /* + * Nothing free after next_instance, so wrap around to instance + * zero. + */ instance = find_first_zero_bit(words, bit_count); result = ASSERT(instance < bit_count, "impossibly, no zero bit found"); @@ -131,19 +120,28 @@ static int allocate_vdo_instance_locked(unsigned int *instance_ptr) return UDS_SUCCESS; } -/**********************************************************************/ -int allocate_vdo_instance(unsigned int *instance_ptr) +/** + * vdo_allocate_instance() - Allocate an instance number. + * @instance_ptr: An integer to hold the allocated instance number. + * + * Return: UDS_SUCCESS or an error code. + */ +int vdo_allocate_instance(unsigned int *instance_ptr) { int result; + mutex_lock(&instance_number_lock); - result = allocate_vdo_instance_locked(instance_ptr); + result = vdo_allocate_instance_locked(instance_ptr); mutex_unlock(&instance_number_lock); return result; } -/**********************************************************************/ -void release_vdo_instance(unsigned int instance) +/** + * vdo_release_instance() - Release an instance number previously allocated. + * @instance: The instance number to release. + */ +void vdo_release_instance(unsigned int instance) { mutex_lock(&instance_number_lock); if (instance >= bit_count) { @@ -162,14 +160,20 @@ void release_vdo_instance(unsigned int instance) mutex_unlock(&instance_number_lock); } -/**********************************************************************/ -void initialize_vdo_instance_number_tracking(void) +/** + * vdo_initialize_instance_number_tracking() - Initialize the instance-number + * tracking data structures. + */ +void vdo_initialize_instance_number_tracking(void) { mutex_init(&instance_number_lock); } -/**********************************************************************/ -void clean_up_vdo_instance_number_tracking(void) +/** + * vdo_clean_up_instance_number_tracking() - Free up the instance-number + * tracking data structures. + */ +void vdo_clean_up_instance_number_tracking(void) { ASSERT_LOG_ONLY(instance_count == 0, "should have no instance numbers still in use, but have %u", diff --git a/vdo/instance-number.h b/vdo/instance-number.h new file mode 100644 index 00000000..a3921052 --- /dev/null +++ b/vdo/instance-number.h @@ -0,0 +1,17 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright Red Hat + */ + +#ifndef INSTANCE_NUMBER_H +#define INSTANCE_NUMBER_H + +int vdo_allocate_instance(unsigned int *instance_ptr); + +void vdo_release_instance(unsigned int instance); + +void vdo_initialize_instance_number_tracking(void); + +void vdo_clean_up_instance_number_tracking(void); + +#endif /* INSTANCE_NUMBER_H */ diff --git a/vdo/instanceNumber.h b/vdo/instanceNumber.h deleted file mode 100644 index 5f8f621d..00000000 --- a/vdo/instanceNumber.h +++ /dev/null @@ -1,51 +0,0 @@ -/* - * Copyright Red Hat - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. 
- * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/sulfur/src/c++/vdo/base/instanceNumber.h#7 $ - */ - -#ifndef INSTANCE_NUMBER_H -#define INSTANCE_NUMBER_H - -/** - * Allocate an instance number. - * - * @param [out] instance_ptr An integer to hold the allocated instance number - * - * @result UDS_SUCCESS or an error code - **/ -int allocate_vdo_instance(unsigned int *instance_ptr); - -/** - * Release an instance number previously allocated. - * - * @param instance The instance number to release - **/ -void release_vdo_instance(unsigned int instance); - -/** - * Initialize the instance-number tracking data structures. - **/ -void initialize_vdo_instance_number_tracking(void); - -/** - * Free up the instance-number tracking data structures. - **/ -void clean_up_vdo_instance_number_tracking(void); - -#endif // INSTANCE_NUMBER_H diff --git a/vdo/intMap.c b/vdo/int-map.c similarity index 51% rename from vdo/intMap.c rename to vdo/int-map.c index 3c62614e..8a32220d 100644 --- a/vdo/intMap.c +++ b/vdo/int-map.c @@ -1,25 +1,11 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * Copyright Red Hat - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/sulfur/src/c++/vdo/base/intMap.c#11 $ */ /** + * DOC: + * * Hash table implementation of a map from integers to pointers, implemented * using the Hopscotch Hashing algorithm by Herlihy, Shavit, and Tzafrir (see * http://en.wikipedia.org/wiki/Hopscotch_hashing). This implementation does @@ -64,71 +50,84 @@ * neighborhood. * * While individual accesses tend to be very fast, the table resize operations - * are very very expensive. If an upper bound on the latency of adding an + * are very, very expensive. If an upper bound on the latency of adding an * entry to the table is needed, we either need to ensure the table is * pre-sized to be large enough so no resize is ever needed, or we'll need to * develop an approach to incrementally resize the table. 
- **/ - -#include "intMap.h" + */ +#include "int-map.h" #include "errors.h" #include "logger.h" -#include "memoryAlloc.h" +#include "memory-alloc.h" #include "numeric.h" #include "permassert.h" enum { - DEFAULT_CAPACITY = 16, // the number of neighborhoods in a new table - NEIGHBORHOOD = 255, // the number of buckets in each neighborhood - MAX_PROBES = 1024, // limit on the number of probes for a free bucket - NULL_HOP_OFFSET = 0, // the hop offset value terminating the hop list - DEFAULT_LOAD = 75 // a compromise between memory use and performance + DEFAULT_CAPACITY = 16, /* the number of neighborhoods in a new table */ + NEIGHBORHOOD = 255, /* the number of buckets in each neighborhood */ + MAX_PROBES = 1024, /* limit on the number of probes for a free bucket */ + NULL_HOP_OFFSET = 0, /* the hop offset value terminating the hop list */ + DEFAULT_LOAD = 75 /* a compromise between memory use and performance */ }; /** + * struct bucket - hash bucket + * * Buckets are packed together to reduce memory usage and improve cache * efficiency. It would be tempting to encode the hop offsets separately and * maintain alignment of key/value pairs, but it's crucial to keep the hop * fields near the buckets that they use them so they'll tend to share cache * lines. - **/ + */ struct __packed bucket { - uint8_t first_hop; // the biased offset of the first entry in the hop - // list of the neighborhood that hashes to this bucket - uint8_t next_hop; // the biased offset of the next bucket in the - // hop list - - uint64_t key; // the key stored in this bucket - void *value; // the value stored in this bucket (NULL if empty) + /** + * @first_hop: The biased offset of the first entry in the hop list + * of the neighborhood that hashes to this bucket. + */ + uint8_t first_hop; + /** + * @next_hop: The biased offset of the next bucket in the hop list. + */ + uint8_t next_hop; + /** @key: The key stored in this bucket. */ + uint64_t key; + /** @value: The value stored in this bucket (NULL if empty). */ + void *value; }; /** - * The concrete definition of the opaque int_map type. To avoid having to wrap - * the neighborhoods of the last entries back around to the start of the - * bucket array, we allocate a few more buckets at the end of the array - * instead, which is why capacity and bucket_count are different. - **/ + * struct int_map - The concrete definition of the opaque int_map type. + * + * To avoid having to wrap the neighborhoods of the last entries back + * around to the start of the bucket array, we allocate a few more + * buckets at the end of the array instead, which is why capacity and + * bucket_count are different. + */ struct int_map { - size_t size; // the number of entries stored in the map - size_t capacity; // the number of neighborhoods in the map - size_t bucket_count; // the number of buckets in the bucket array - struct bucket *buckets; // the array of hash buckets + /** @size: The number of entries stored in the map. */ + size_t size; + /** @capacity: The number of neighborhoods in the map. */ + size_t capacity; + /* @bucket_count: The number of buckets in the bucket array. */ + size_t bucket_count; + /** @buckets: The array of hash buckets. */ + struct bucket *buckets; }; /** - * This is the Google CityHash 16-byte hash mixing function. - * - * @param input1 the first input value - * @param input2 the second input value + * mix() - The Google CityHash 16-byte hash mixing function. + * @input1: The first input value. + * @input2: The second input value. 
* - * @return a hash of the two inputs - **/ + * Return: A hash of the two inputs. + */ static uint64_t mix(uint64_t input1, uint64_t input2) { static const uint64_t CITY_MULTIPLIER = 0x9ddfea08eb382d69ULL; uint64_t hash = (input1 ^ input2); + hash *= CITY_MULTIPLIER; hash ^= (hash >> 47); hash ^= input2; @@ -139,18 +138,21 @@ static uint64_t mix(uint64_t input1, uint64_t input2) } /** - * Calculate a 64-bit non-cryptographic hash value for the provided 64-bit - * integer key. The implementation is based on Google's CityHash, only - * handling the specific case of an 8-byte input. + * hash_key() - Calculate a 64-bit non-cryptographic hash value for the + * provided 64-bit integer key. + * @key: The mapping key. * - * @param key the mapping key + * The implementation is based on Google's CityHash, only handling the + * specific case of an 8-byte input. * - * @return the hash of the mapping key - **/ + * Return: The hash of the mapping key. + */ static uint64_t hash_key(uint64_t key) { - // Aliasing restrictions forbid us from casting pointer types, so use a - // union to convert a single uint64_t to two uint32_t values. + /* + * Aliasing restrictions forbid us from casting pointer types, so use a + * union to convert a single uint64_t to two uint32_t values. + */ union { uint64_t u64; uint32_t u32[2]; @@ -159,26 +161,38 @@ static uint64_t hash_key(uint64_t key) } /** - * Initialize an int_map. - * - * @param map the map to initialize - * @param capacity the initial capacity of the map + * allocate_buckets() - Initialize an int_map. + * @map: The map to initialize. + * @capacity: The initial capacity of the map. * - * @return UDS_SUCCESS or an error code - **/ + * Return: UDS_SUCCESS or an error code. + */ static int allocate_buckets(struct int_map *map, size_t capacity) { map->size = 0; map->capacity = capacity; - // Allocate NEIGHBORHOOD - 1 extra buckets so the last bucket can have a - // full neighborhood without have to wrap back around to element zero. + /* + * Allocate NEIGHBORHOOD - 1 extra buckets so the last bucket can have a + * full neighborhood without have to wrap back around to element zero. + */ map->bucket_count = capacity + (NEIGHBORHOOD - 1); return UDS_ALLOCATE(map->bucket_count, struct bucket, "struct int_map buckets", &map->buckets); } -/**********************************************************************/ +/** + * make_int_map() - Allocate and initialize an int_map. + * @initial_capacity: The number of entries the map should initially be + * capable of holding (zero tells the map to use its own + * small default). + * @initial_load: The load factor of the map, expressed as an integer + * percentage (typically in the range 50 to 90, with zero + * telling the map to use its own default). + * @map_ptr: Output, a pointer to hold the new int_map. + * + * Return: UDS_SUCCESS or an error code. + */ int make_int_map(size_t initial_capacity, unsigned int initial_load, struct int_map **map_ptr) { @@ -186,7 +200,7 @@ int make_int_map(size_t initial_capacity, unsigned int initial_load, int result; size_t capacity; - // Use the default initial load if the caller did not specify one. + /* Use the default initial load if the caller did not specify one. */ if (initial_load == 0) { initial_load = DEFAULT_LOAD; } @@ -199,11 +213,13 @@ int make_int_map(size_t initial_capacity, unsigned int initial_load, return result; } - // Use the default capacity if the caller did not specify one. + /* Use the default capacity if the caller did not specify one. */ capacity = (initial_capacity > 0) ? 
initial_capacity : DEFAULT_CAPACITY; - // Scale up the capacity by the specified initial load factor. - // (i.e to hold 1000 entries at 80% load we need a capacity of 1250) + /* + * Scale up the capacity by the specified initial load factor. + * (i.e to hold 1000 entries at 80% load we need a capacity of 1250) + */ capacity = capacity * 100 / initial_load; result = allocate_buckets(map, capacity); @@ -216,7 +232,13 @@ int make_int_map(size_t initial_capacity, unsigned int initial_load, return UDS_SUCCESS; } -/**********************************************************************/ +/** + * free_int_map() - Free an int_map. + * @map: The int_map to free. + * + * NOTE: The map does not own the pointer values stored in the map and they + * are not freed by this call. + */ void free_int_map(struct int_map *map) { if (map == NULL) { @@ -227,22 +249,26 @@ void free_int_map(struct int_map *map) UDS_FREE(UDS_FORGET(map)); } -/**********************************************************************/ +/** + * int_map_size() - Get the number of entries stored in an int_map. + * @map: The int_map to query. + * + * Return: The number of entries in the map. + */ size_t int_map_size(const struct int_map *map) { return map->size; } /** - * Convert a biased hop offset within a neighborhood to a pointer to the - * bucket it references. - * - * @param neighborhood the first bucket in the neighborhood - * @param hop_offset the biased hop offset to the desired bucket + * dereference_hop() - Convert a biased hop offset within a neighborhood to + * a pointer to the bucket it references. + * @neighborhood: The first bucket in the neighborhood. + * @hop_offset: The biased hop offset to the desired bucket. * - * @return NULL if hop_offset is zero, otherwise a pointer to - * the bucket in the neighborhood at hop_offset - 1 - **/ + * Return: NULL if hop_offset is zero, otherwise a pointer to + * the bucket in the neighborhood at hop_offset - 1. + */ static struct bucket *dereference_hop(struct bucket *neighborhood, unsigned int hop_offset) { @@ -255,30 +281,35 @@ static struct bucket *dereference_hop(struct bucket *neighborhood, } /** - * Add a bucket into the hop list for the neighborhood, inserting it into the - * list so the hop list remains sorted by hop offset. + * insert_in_hop_list() - Add a bucket into the hop list for the neighborhood. + * @neighborhood: The first bucket in the neighborhood. + * @new_bucket: The bucket to add to the hop list. * - * @param neighborhood the first bucket in the neighborhood - * @param new_bucket the bucket to add to the hop list - **/ + * The bucket is inserted it into the list so the hop list remains sorted by + * hop offset. + */ static void insert_in_hop_list(struct bucket *neighborhood, struct bucket *new_bucket) { - // Zero indicates a NULL hop offset, so bias the hop offset by one. + /* Zero indicates a NULL hop offset, so bias the hop offset by one. */ int hop_offset = 1 + (new_bucket - neighborhood); - // Handle the special case of adding a bucket at the start of the list. + /* Handle the special case of adding a bucket at the start of the list. */ int next_hop = neighborhood->first_hop; + if ((next_hop == NULL_HOP_OFFSET) || (next_hop > hop_offset)) { new_bucket->next_hop = next_hop; neighborhood->first_hop = hop_offset; return; } - // Search the hop list for the insertion point that maintains the sort - // order. + /* + * Search the hop list for the insertion point that maintains the sort + * order. 
+ */ for (;;) { struct bucket *bucket = dereference_hop(neighborhood, next_hop); + next_hop = bucket->next_hop; if ((next_hop == NULL_HOP_OFFSET) || (next_hop > hop_offset)) { @@ -290,15 +321,16 @@ static void insert_in_hop_list(struct bucket *neighborhood, } /** - * Select and return the hash bucket for a given search key. - * - * @param map the map to search - * @param key the mapping key - **/ + * select_bucket() - Select and return the hash bucket for a given search key. + * @map: The map to search. + * @key: The mapping key. + */ static struct bucket *select_bucket(const struct int_map *map, uint64_t key) { - // Calculate a good hash value for the provided key. We want exactly 32 - // bits, so mask the result. + /* + * Calculate a good hash value for the provided key. We want exactly 32 + * bits, so mask the result. + */ uint64_t hash = hash_key(key) & 0xFFFFFFFF; /* @@ -313,19 +345,19 @@ static struct bucket *select_bucket(const struct int_map *map, uint64_t key) } /** - * Search the hop list associated with given hash bucket for a given search - * key. If the key is found, returns a pointer to the entry (bucket or - * collision), otherwise returns NULL. - * - * @param [in] map the map being searched - * @param [in] bucket the map bucket to search for the key - * @param [in] key the mapping key - * @param [out] previous_ptr if not NULL, a pointer in which to - * store the bucket in the list preceding the one - * that had the matching key - * - * @return an entry that matches the key, or NULL if not found - **/ + * search_hop_list() - Search the hop list associated with given hash bucket + * for a given search key. + * @map: The map being searched. + * @bucket: The map bucket to search for the key. + * @key: The mapping key. + * @previous_ptr: Output. if not NULL, a pointer in which to store the bucket + * in the list preceding the one that had the matching key + * + * If the key is found, returns a pointer to the entry (bucket or collision), + * otherwise returns NULL. + * + * Return: An entry that matches the key, or NULL if not found. + */ static struct bucket * search_hop_list(struct int_map *map __attribute__((unused)), struct bucket *bucket, uint64_t key, @@ -333,10 +365,14 @@ search_hop_list(struct int_map *map __attribute__((unused)), { struct bucket *previous = NULL; unsigned int next_hop = bucket->first_hop; + while (next_hop != NULL_HOP_OFFSET) { - // Check the neighboring bucket indexed by the offset for the - // desired key. + /* + * Check the neighboring bucket indexed by the offset for the + * desired key. + */ struct bucket *entry = dereference_hop(bucket, next_hop); + if ((key == entry->key) && (entry->value != NULL)) { if (previous_ptr != NULL) { *previous_ptr = previous; @@ -349,7 +385,15 @@ search_hop_list(struct int_map *map __attribute__((unused)), return NULL; } -/**********************************************************************/ +/** + * int_map_get() - Retrieve the value associated with a given key from + * the int_map. + * @map: The int_map to query. + * @key: The key to look up. + * + * Return: The value associated with the given key, or NULL + * if the key is not mapped to any value. + */ void *int_map_get(struct int_map *map, uint64_t key) { struct bucket *match = @@ -358,21 +402,25 @@ void *int_map_get(struct int_map *map, uint64_t key) } /** - * Increase the number of hash buckets and rehash all the existing entries, - * storing them in the new buckets. + * resize_buckets() - Increase the number of hash buckets. + * @map: The map to resize. 
* - * @param map the map to resize - **/ + * Resizes and rehashes all the existing entries, storing them in the new + * buckets. + * + * Return: UDS_SUCCESS or an error code. + */ static int resize_buckets(struct int_map *map) { int result; size_t i; - // Copy the top-level map data to the stack. + /* Copy the top-level map data to the stack. */ struct int_map old_map = *map; - // Re-initialize the map to be empty and 50% larger. + /* Re-initialize the map to be empty and 50% larger. */ size_t new_capacity = map->capacity / 2 * 3; + uds_log_info("%s: attempting resize from %zu to %zu, current size=%zu", __func__, map->capacity, new_capacity, map->size); result = allocate_buckets(map, new_capacity); @@ -381,51 +429,57 @@ static int resize_buckets(struct int_map *map) return result; } - // Populate the new hash table from the entries in the old bucket array. + /* Populate the new hash table from the entries in the old bucket array. */ for (i = 0; i < old_map.bucket_count; i++) { struct bucket *entry = &old_map.buckets[i]; + if (entry->value == NULL) { continue; } result = int_map_put(map, entry->key, entry->value, true, NULL); if (result != UDS_SUCCESS) { - // Destroy the new partial map and restore the map from - // the stack. + /* + * Destroy the new partial map and restore the map from + * the stack. + */ UDS_FREE(UDS_FORGET(map->buckets)); *map = old_map; return result; } } - // Destroy the old bucket array. + /* Destroy the old bucket array. */ UDS_FREE(UDS_FORGET(old_map.buckets)); return UDS_SUCCESS; } /** - * Probe the bucket array starting at the given bucket for the next empty - * bucket, returning a pointer to it. NULL will be returned if - * the search reaches the end of the bucket array or if the number of linear - * probes exceeds a specified limit. + * find_empty_bucket() - Probe the bucket array starting at the given bucket + * for the next empty bucket, returning a pointer to it. + * @map: The map containing the buckets to search. + * @bucket: The bucket at which to start probing. + * @max_probes: The maximum number of buckets to search. * - * @param map the map containing the buckets to search - * @param bucket the bucket at which to start probing - * @param max_probes the maximum number of buckets to search + * NULL will be returned if the search reaches the end of the bucket array or + * if the number of linear probes exceeds a specified limit. * - * @return the next empty bucket, or NULL if the search failed - **/ + * Return: The next empty bucket, or NULL if the search failed. + */ static struct bucket *find_empty_bucket(struct int_map *map, struct bucket *bucket, unsigned int max_probes) { - // Limit the search to either the nearer of the end of the bucket array - // or a fixed distance beyond the initial bucket. + /* + * Limit the search to either the nearer of the end of the bucket array + * or a fixed distance beyond the initial bucket. + */ ptrdiff_t remaining = &map->buckets[map->bucket_count] - bucket; struct bucket *sentinel = &bucket[min(remaining, (ptrdiff_t) max_probes)]; struct bucket *entry; + for (entry = bucket; entry < sentinel; entry++) { if (entry->value == NULL) { return entry; @@ -435,18 +489,19 @@ static struct bucket *find_empty_bucket(struct int_map *map, } /** - * Move an empty bucket closer to the start of the bucket array. This searches - * the neighborhoods that contain the empty bucket for a non-empty bucket - * closer to the start of the array. If such a bucket is found, this swaps the - * two buckets by moving the entry to the empty bucket. 
- * - * @param map the map containing the bucket - * @param hole the empty bucket to fill with an entry that precedes it in one - * of its enclosing neighborhoods - * - * @return the bucket that was vacated by moving its entry to the provided - * hole, or NULL if no entry could be moved - **/ + * move_empty_bucket() - Move an empty bucket closer to the start of the + * bucket array. + * @map: The map containing the bucket. + * @hole: The empty bucket to fill with an entry that precedes it in one + * of its enclosing neighborhoods. + * + * This searches the neighborhoods that contain the empty bucket for a + * non-empty bucket closer to the start of the array. If such a bucket is + * found, this swaps the two buckets by moving the entry to the empty bucket. + * + * Return: The bucket that was vacated by moving its entry to the provided + * hole, or NULL if no entry could be moved. + */ static struct bucket * move_empty_bucket(struct int_map *map __attribute__((unused)), struct bucket *hole) @@ -459,21 +514,28 @@ move_empty_bucket(struct int_map *map __attribute__((unused)), * than a valid bucket. */ struct bucket *bucket; + for (bucket = &hole[1 - NEIGHBORHOOD]; bucket < hole; bucket++) { - // Find the entry that is nearest to the bucket, which means it - // will be nearest to the hash bucket whose neighborhood is - // full. + /* + * Find the entry that is nearest to the bucket, which means it + * will be nearest to the hash bucket whose neighborhood is + * full. + */ struct bucket *new_hole = dereference_hop(bucket, bucket->first_hop); if (new_hole == NULL) { - // There are no buckets in this neighborhood that are in - // use by this one (they must all be owned by - // overlapping neighborhoods). + /* + * There are no buckets in this neighborhood that are in + * use by this one (they must all be owned by + * overlapping neighborhoods). + */ continue; } - // Skip this bucket if its first entry is actually further away - // than the hole that we're already trying to fill. + /* + * Skip this bucket if its first entry is actually further away + * than the hole that we're already trying to fill. + */ if (hole < new_hole) { continue; } @@ -484,55 +546,62 @@ move_empty_bucket(struct int_map *map __attribute__((unused)), * not all the way into its neighborhood. */ - // The entry that will be the new hole is the first bucket in - // the list, so setting first_hop is all that's needed remove it - // from the list. + /* + * The entry that will be the new hole is the first bucket in + * the list, so setting first_hop is all that's needed remove it + * from the list. + */ bucket->first_hop = new_hole->next_hop; new_hole->next_hop = NULL_HOP_OFFSET; - // Move the entry into the original hole. + /* Move the entry into the original hole. */ hole->key = new_hole->key; hole->value = new_hole->value; new_hole->value = NULL; - // Insert the filled hole into the hop list for the - // neighborhood. + /* + * Insert the filled hole into the hop list for the + * neighborhood. + */ insert_in_hop_list(bucket, hole); return new_hole; } - // We couldn't find an entry to relocate to the hole. + /* We couldn't find an entry to relocate to the hole. */ return NULL; } /** - * Find and update any existing mapping for a given key, returning the value - * associated with the key in the provided pointer. 
- * - * @param [in] map the int_map to attempt to modify - * @param [in] neighborhood the first bucket in the neighborhood that - * would contain the search key - * @param [in] key the key with which to associate the new value - * @param [in] new_value the value to be associated with the key - * @param [in] update whether to overwrite an existing value - * @param [out] old_value_ptr a pointer in which to store the old value - * (unmodified if no mapping was found) - * - * @return true if the map contains a mapping for the key - * false if it does not - **/ + * update_mapping() - Find and update any existing mapping for a given key, + * returning the value associated with the key in the + * provided pointer. + * @map: The int_map to attempt to modify. + * @neighborhood: The first bucket in the neighborhood that would contain the + * search key + * @key: The key with which to associate the new value. + * @new_value: The value to be associated with the key. + * @update: Whether to overwrite an existing value. + * @old_value_ptr: a pointer in which to store the old value (unmodified if no + * mapping was found) + * + * Return: true if the map contains a mapping for the key, false if it does + * not. + */ static bool update_mapping(struct int_map *map, struct bucket *neighborhood, uint64_t key, void *new_value, bool update, void **old_value_ptr) { struct bucket *bucket = search_hop_list(map, neighborhood, key, NULL); + if (bucket == NULL) { - // There is no bucket containing the key in the neighborhood. + /* There is no bucket containing the key in the neighborhood. */ return false; } - // Return the value of the current mapping (if desired) and update the - // mapping with the new value (if desired). + /* + * Return the value of the current mapping (if desired) and update the + * mapping with the new value (if desired). + */ if (old_value_ptr != NULL) { *old_value_ptr = bucket->value; } @@ -543,60 +612,91 @@ static bool update_mapping(struct int_map *map, struct bucket *neighborhood, } /** + * find_or_make_vacancy() - Find an empty bucket. + * @map: The int_map to search or modify. + * @neighborhood: The first bucket in the neighborhood in which + * an empty bucket is needed for a new mapping. + * * Find an empty bucket in a specified neighborhood for a new mapping or * attempt to re-arrange mappings so there is such a bucket. This operation * may fail (returning NULL) if an empty bucket is not available or could not * be relocated to the neighborhood. * - * @param map the int_map to search or modify - * @param neighborhood the first bucket in the neighborhood in which - * an empty bucket is needed for a new mapping - * - * @return a pointer to an empty bucket in the desired neighborhood, or - * NULL if a vacancy could not be found or arranged - **/ + * Return: a pointer to an empty bucket in the desired neighborhood, or + * NULL if a vacancy could not be found or arranged. + */ static struct bucket *find_or_make_vacancy(struct int_map *map, struct bucket *neighborhood) { - // Probe within and beyond the neighborhood for the first empty bucket. + /* Probe within and beyond the neighborhood for the first empty bucket. */ struct bucket *hole = find_empty_bucket(map, neighborhood, MAX_PROBES); - // Keep trying until the empty bucket is in the bucket's neighborhood or - // we are unable to move it any closer by swapping it with a filled - // bucket. 
+ /* + * Keep trying until the empty bucket is in the bucket's neighborhood or + * we are unable to move it any closer by swapping it with a filled + * bucket. + */ while (hole != NULL) { int distance = hole - neighborhood; + if (distance < NEIGHBORHOOD) { - // We've found or relocated an empty bucket close enough - // to the initial hash bucket to be referenced by its - // hop vector. + /* + * We've found or relocated an empty bucket close enough + * to the initial hash bucket to be referenced by its + * hop vector. + */ return hole; } - // The nearest empty bucket isn't within the neighborhood that - // must contain the new entry, so try to swap it with bucket - // that is closer. + /* + * The nearest empty bucket isn't within the neighborhood that + * must contain the new entry, so try to swap it with bucket + * that is closer. + */ hole = move_empty_bucket(map, hole); } return NULL; } -/**********************************************************************/ +/** + * int_map_put() - Try to associate a value with an integer. + * @map: The int_map to attempt to modify. + * @key: The key with which to associate the new value. + * @new_value: The value to be associated with the key. + * @update: Whether to overwrite an existing value. + * @old_value_ptr: A pointer in which to store either the old value (if the + * key was already mapped) or NULL if the map did not contain + * the key; NULL may be provided if the caller does not need + * to know the old value + * + * Try to associate a value (a pointer) with an integer in an int_map. If the + * map already contains a mapping for the provided key, the old value is + * only replaced with the specified value if update is true. In either case + * the old value is returned. If the map does not already contain a value for + * the specified key, the new value is added regardless of the value of update. + * + * Return: UDS_SUCCESS or an error code. + */ int int_map_put(struct int_map *map, uint64_t key, void *new_value, bool update, void **old_value_ptr) { struct bucket *neighborhood, *bucket; + if (new_value == NULL) { return UDS_INVALID_ARGUMENT; } - // Select the bucket at the start of the neighborhood that must contain - // any entry for the provided key. + /* + * Select the bucket at the start of the neighborhood that must contain + * any entry for the provided key. + */ neighborhood = select_bucket(map, key); - // Check whether the neighborhood already contains an entry for the key, - // in which case we optionally update it, returning the old value. + /* + * Check whether the neighborhood already contains an entry for the key, + * in which case we optionally update it, returning the old value. + */ if (update_mapping(map, neighborhood, key, new_value, update, old_value_ptr)) { return UDS_SUCCESS; @@ -617,55 +717,71 @@ int int_map_put(struct int_map *map, uint64_t key, void *new_value, bool update, * maps). */ int result = resize_buckets(map); + if (result != UDS_SUCCESS) { return result; } - // Resizing the map invalidates all pointers to buckets, so - // recalculate the neighborhood pointer. + /* + * Resizing the map invalidates all pointers to buckets, so + * recalculate the neighborhood pointer. + */ neighborhood = select_bucket(map, key); } - // Put the new entry in the empty bucket, adding it to the neighborhood. + /* Put the new entry in the empty bucket, adding it to the neighborhood. 
*/ bucket->key = key; bucket->value = new_value; insert_in_hop_list(neighborhood, bucket); map->size += 1; - // There was no existing entry, so there was no old value to be - // returned. + /* + * There was no existing entry, so there was no old value to be + * returned. + */ if (old_value_ptr != NULL) { *old_value_ptr = NULL; } return UDS_SUCCESS; } -/**********************************************************************/ +/** + * int_map_remove() - Remove the mapping for a given key from the int_map. + * @map: The int_map from which to remove the mapping. + * @key: The key whose mapping is to be removed. + * + * Return: the value that was associated with the key, or + * NULL if it was not mapped. + */ void *int_map_remove(struct int_map *map, uint64_t key) { void *value; - // Select the bucket to search and search it for an existing entry. + /* Select the bucket to search and search it for an existing entry. */ struct bucket *bucket = select_bucket(map, key); struct bucket *previous; struct bucket *victim = search_hop_list(map, bucket, key, &previous); if (victim == NULL) { - // There is no matching entry to remove. + /* There is no matching entry to remove. */ return NULL; } - // We found an entry to remove. Save the mapped value to return later - // and empty the bucket. + /* + * We found an entry to remove. Save the mapped value to return later + * and empty the bucket. + */ map->size -= 1; value = victim->value; victim->value = NULL; victim->key = 0; - // The victim bucket is now empty, but it still needs to be spliced out - // of the hop list. + /* + * The victim bucket is now empty, but it still needs to be spliced out + * of the hop list. + */ if (previous == NULL) { - // The victim is the head of the list, so swing first_hop. + /* The victim is the head of the list, so swing first_hop. */ bucket->first_hop = victim->next_hop; } else { previous->next_hop = victim->next_hop; diff --git a/vdo/int-map.h b/vdo/int-map.h new file mode 100644 index 00000000..42016e9f --- /dev/null +++ b/vdo/int-map.h @@ -0,0 +1,45 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright Red Hat + */ + +#ifndef INT_MAP_H +#define INT_MAP_H + +#include "compiler.h" +#include "type-defs.h" + +/** + * DOC: int_map + * + * An int_map associates pointers (void *) with integer keys (uint64_t). NULL + * pointer values are not supported. + * + * The map is implemented as hash table, which should provide constant-time + * insert, query, and remove operations, although the insert may occasionally + * grow the table, which is linear in the number of entries in the map. The + * table will grow as needed to hold new entries, but will not shrink as + * entries are removed. 
+ */ + +struct int_map; + +int __must_check make_int_map(size_t initial_capacity, + unsigned int initial_load, + struct int_map **map_ptr); + +void free_int_map(struct int_map *map); + +size_t int_map_size(const struct int_map *map); + +void *int_map_get(struct int_map *map, uint64_t key); + +int __must_check int_map_put(struct int_map *map, + uint64_t key, + void *new_value, + bool update, + void **old_value_ptr); + +void *int_map_remove(struct int_map *map, uint64_t key); + +#endif /* INT_MAP_H */ diff --git a/vdo/intMap.h b/vdo/intMap.h deleted file mode 100644 index a5094a62..00000000 --- a/vdo/intMap.h +++ /dev/null @@ -1,123 +0,0 @@ -/* - * Copyright Red Hat - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/sulfur/src/c++/vdo/base/intMap.h#7 $ - */ - -#ifndef INT_MAP_H -#define INT_MAP_H - -#include "common.h" - -/** - * An int_map associates pointers (void *) with integer keys - * (uint64_t). NULL pointer values are not - * supported. - * - * The map is implemented as hash table, which should provide constant-time - * insert, query, and remove operations, although the insert may occasionally - * grow the table, which is linear in the number of entries in the map. The - * table will grow as needed to hold new entries, but will not shrink as - * entries are removed. - **/ - -struct int_map; - -/** - * Allocate and initialize an int_map. - * - * @param [in] initial_capacity the number of entries the map should - * initially be capable of holding (zero tells - * the map to use its own small default) - * @param [in] initial_load the load factor of the map, expressed as an - * integer percentage (typically in the range - * 50 to 90, with zero telling the map to use - * its own default) - * @param [out] map_ptr a pointer to hold the new int_map - * - * @return UDS_SUCCESS or an error code - **/ -int __must_check make_int_map(size_t initial_capacity, - unsigned int initial_load, - struct int_map **map_ptr); - -/** - * Free an int_map. NOTE: The map does not own the pointer values stored in the - * map and they are not freed by this call. - * - * @param map The int_map to free - **/ -void free_int_map(struct int_map *map); - -/** - * Get the number of entries stored in an int_map. - * - * @param map the int_map to query - * - * @return the number of entries in the map - **/ -size_t int_map_size(const struct int_map *map); - -/** - * Retrieve the value associated with a given key from the int_map. - * - * @param map the int_map to query - * @param key the key to look up - * - * @return the value associated with the given key, or NULL - * if the key is not mapped to any value - **/ -void *int_map_get(struct int_map *map, uint64_t key); - -/** - * Try to associate a value (a pointer) with an integer in an int_map. 
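To make this interface concrete, here is a minimal, hypothetical usage sketch (the function name and payload are invented for illustration; it assumes a context where the UDS error codes and uds_log_info() are available, as elsewhere in this patch):

#include "int-map.h"
#include "logger.h"

static int int_map_usage_example(void)
{
	struct int_map *map;
	static int payload;
	void *old_value, *found, *removed;
	/* Zero capacity and load factor select the map's own defaults. */
	int result = make_int_map(0, 0, &map);

	if (result != UDS_SUCCESS) {
		return result;
	}

	/* Keys are uint64_t; values must be non-NULL pointers. */
	result = int_map_put(map, 42, &payload, true, &old_value);
	if (result == UDS_SUCCESS) {
		/* old_value is NULL because the key was not mapped before. */
		found = int_map_get(map, 42);      /* returns &payload */
		removed = int_map_remove(map, 42); /* returns &payload */
		uds_log_info("found=%p removed=%p size=%zu",
			     found, removed, int_map_size(map));
	}

	/* The map never owns the stored pointers; only the map is freed. */
	free_int_map(map);
	return result;
}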
If the - * map already contains a mapping for the provided key, the old value is - * only replaced with the specified value if update is true. In either case - * the old value is returned. If the map does not already contain a value for - * the specified key, the new value is added regardless of the value of update. - * - * @param [in] map the int_map to attempt to modify - * @param [in] key the key with which to associate the new value - * @param [in] new_value the value to be associated with the key - * @param [in] update whether to overwrite an existing value - * @param [out] old_value_ptr a pointer in which to store either the old value - * (if the key was already mapped) or - * NULL if the map did not contain the - * key; NULL may be provided if the - * caller does not need to know the old value - * - * @return UDS_SUCCESS or an error code - **/ -int __must_check int_map_put(struct int_map *map, - uint64_t key, - void *new_value, - bool update, - void **old_value_ptr); - -/** - * Remove the mapping for a given key from the int_map. - * - * @param map the int_map from which to remove the mapping - * @param key the key whose mapping is to be removed - * - * @return the value that was associated with the key, or - * NULL if it was not mapped - **/ -void *int_map_remove(struct int_map *map, uint64_t key); - -#endif /* INT_MAP_H */ diff --git a/uds/ioFactoryLinuxKernel.c b/vdo/io-factory.c similarity index 71% rename from uds/ioFactoryLinuxKernel.c rename to vdo/io-factory.c index 423dfb81..0833f41f 100644 --- a/uds/ioFactoryLinuxKernel.c +++ b/vdo/io-factory.c @@ -1,31 +1,15 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * Copyright Red Hat - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. 
- * - * $Id: //eng/uds-releases/krusty/kernelLinux/uds/ioFactoryLinuxKernel.c#19 $ */ #include #include #include -#include "ioFactory.h" +#include "io-factory.h" #include "logger.h" -#include "memoryAlloc.h" +#include "memory-alloc.h" enum { BLK_FMODE = FMODE_READ | FMODE_WRITE }; @@ -38,13 +22,11 @@ struct io_factory { atomic_t ref_count; }; -/**********************************************************************/ void get_uds_io_factory(struct io_factory *factory) { atomic_inc(&factory->ref_count); } -/**********************************************************************/ static int get_block_device_from_name(const char *name, struct block_device **bdev) { @@ -64,14 +46,13 @@ static int get_block_device_from_name(const char *name, return UDS_SUCCESS; } -/**********************************************************************/ int make_uds_io_factory(const char *path, struct io_factory **factory_ptr) { int result; struct block_device *bdev; struct io_factory *factory; - result = get_block_device_from_name(path, &bdev); + result = get_block_device_from_name(path, &bdev); if (result != UDS_SUCCESS) { return result; } @@ -89,13 +70,12 @@ int make_uds_io_factory(const char *path, struct io_factory **factory_ptr) return UDS_SUCCESS; } -/**********************************************************************/ int replace_uds_storage(struct io_factory *factory, const char *path) { int result; struct block_device *bdev; - result = get_block_device_from_name(path, &bdev); + result = get_block_device_from_name(path, &bdev); if (result != UDS_SUCCESS) { return result; } @@ -105,7 +85,6 @@ int replace_uds_storage(struct io_factory *factory, const char *path) return UDS_SUCCESS; } -/**********************************************************************/ void put_uds_io_factory(struct io_factory *factory) { if (atomic_add_return(-1, &factory->ref_count) <= 0) { @@ -114,13 +93,11 @@ void put_uds_io_factory(struct io_factory *factory) } } -/**********************************************************************/ size_t get_uds_writable_size(struct io_factory *factory) { return i_size_read(factory->bdev->bd_inode); } -/**********************************************************************/ int make_uds_bufio(struct io_factory *factory, off_t offset, size_t block_size, @@ -128,6 +105,7 @@ int make_uds_bufio(struct io_factory *factory, struct dm_bufio_client **client_ptr) { struct dm_bufio_client *client; + if (offset % SECTOR_SIZE != 0) { return uds_log_error_strerror(UDS_INCORRECT_ALIGNMENT, "offset %zd not multiple of %d", @@ -153,7 +131,6 @@ int make_uds_bufio(struct io_factory *factory, return UDS_SUCCESS; } -/**********************************************************************/ int open_uds_buffered_reader(struct io_factory *factory, off_t offset, size_t size, @@ -161,6 +138,7 @@ int open_uds_buffered_reader(struct io_factory *factory, { int result; struct dm_bufio_client *client = NULL; + if (size % UDS_BLOCK_SIZE != 0) { return uds_log_error_strerror( UDS_INCORRECT_ALIGNMENT, @@ -182,7 +160,6 @@ int open_uds_buffered_reader(struct io_factory *factory, return result; } -/**********************************************************************/ int open_uds_buffered_writer(struct io_factory *factory, off_t offset, size_t size, @@ -190,6 +167,7 @@ int open_uds_buffered_writer(struct io_factory *factory, { int result; struct dm_bufio_client *client = NULL; + if (size % UDS_BLOCK_SIZE != 0) { return uds_log_error_strerror(UDS_INCORRECT_ALIGNMENT, "region size %zd is not multiple of %d", diff --git 
a/uds/ioFactory.h b/vdo/io-factory.h similarity index 82% rename from uds/ioFactory.h rename to vdo/io-factory.h index 85e4381b..128754ca 100644 --- a/uds/ioFactory.h +++ b/vdo/io-factory.h @@ -1,29 +1,13 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ /* * Copyright Red Hat - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/uds-releases/krusty/src/uds/ioFactory.h#13 $ */ #ifndef IO_FACTORY_H #define IO_FACTORY_H -#include "bufferedReader.h" -#include "bufferedWriter.h" +#include "buffered-reader.h" +#include "buffered-writer.h" #include /* @@ -145,4 +129,4 @@ int __must_check open_uds_buffered_writer(struct io_factory *factory, size_t size, struct buffered_writer **writer_ptr); -#endif // IO_FACTORY_H +#endif /* IO_FACTORY_H */ diff --git a/vdo/io-submitter.c b/vdo/io-submitter.c new file mode 100644 index 00000000..545169e8 --- /dev/null +++ b/vdo/io-submitter.c @@ -0,0 +1,558 @@ +// SPDX-License-Identifier: GPL-2.0-only + /* + * Copyright Red Hat + */ + +#include "io-submitter.h" + +#include +#include +#include +#include + +#include "memory-alloc.h" +#include "permassert.h" + +#include "atomic-stats.h" +#include "bio.h" +#include "data-vio.h" +#include "logger.h" +#include "types.h" +#include "vdo.h" +#include "vio.h" + +/* + * Submission of bio operations to the underlying storage device will + * go through a separate work queue thread (or more than one) to + * prevent blocking in other threads if the storage device has a full + * queue. The plug structure allows that thread to do better batching + * of requests to make the I/O more efficient. + * + * When multiple worker threads are used, a thread is chosen for a + * I/O operation submission based on the PBN, so a given PBN will + * consistently wind up on the same thread. Flush operations are + * assigned round-robin. + * + * The map (protected by the mutex) collects pending I/O operations so + * that the worker thread can reorder them to try to encourage I/O + * request merging in the request queue underneath. 
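The bio zone for a given vio is chosen before it reaches this file (vio->bio_zone is assigned when the vio is prepared), but the PBN-based assignment described above can be pictured with the arithmetic used by the ioSubmitter.c implementation removed later in this patch. A hypothetical sketch of that mapping:

/*
 * Illustrative only: map a physical block number to a bio queue index.
 * Consecutive runs of rotation_interval blocks land on successive
 * queues, so a given PBN always resolves to the same queue.
 */
static unsigned int example_queue_for_pbn(physical_block_number_t pbn,
					  unsigned int queue_count,
					  unsigned int rotation_interval)
{
	return (pbn % (queue_count * rotation_interval)) / rotation_interval;
}

With four queues and a rotation interval of 64, for instance, PBNs 0-63 map to queue 0, PBNs 64-127 to queue 1, and so on, wrapping around every 256 blocks.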
+ */ +struct bio_queue_data { + struct vdo_work_queue *queue; + struct blk_plug plug; + struct int_map *map; + struct mutex lock; + unsigned int queue_number; +}; + +struct io_submitter { + unsigned int num_bio_queues_used; + unsigned int bio_queue_rotation_interval; + struct bio_queue_data bio_queue_data[]; +}; + +static void start_bio_queue(void *ptr) +{ + struct bio_queue_data *bio_queue_data = (struct bio_queue_data *) ptr; + + blk_start_plug(&bio_queue_data->plug); +} + +static void finish_bio_queue(void *ptr) +{ + struct bio_queue_data *bio_queue_data = (struct bio_queue_data *) ptr; + + blk_finish_plug(&bio_queue_data->plug); +} + +static const struct vdo_work_queue_type bio_queue_type = { + .start = start_bio_queue, + .finish = finish_bio_queue, + .max_priority = BIO_Q_MAX_PRIORITY, + .default_priority = BIO_Q_DATA_PRIORITY, +}; + +/** + * count_all_bios() - Determine which bio counter to use. + * @vio: The vio associated with the bio. + * @bio: The bio to count. + */ +static void count_all_bios(struct vio *vio, struct bio *bio) +{ + struct atomic_statistics *stats = &vdo_from_vio(vio)->stats; + + if (is_data_vio(vio)) { + vdo_count_bios(&stats->bios_out, bio); + return; + } + + vdo_count_bios(&stats->bios_meta, bio); + if (vio->type == VIO_TYPE_RECOVERY_JOURNAL) { + vdo_count_bios(&stats->bios_journal, bio); + } else if (vio->type == VIO_TYPE_BLOCK_MAP) { + vdo_count_bios(&stats->bios_page_cache, bio); + } +} + +/** + * assert_in_bio_zone() - Assert that a vio is in the correct bio zone and + * not in interrupt context. + * @vio: The vio to check. + */ +static void assert_in_bio_zone(struct vio *vio) +{ + ASSERT_LOG_ONLY(!in_interrupt(), "not in interrupt context"); + assert_vio_in_bio_zone(vio); +} + +/** + * send_bio_to_device() - Update stats and tracing info, then submit the + * supplied bio to the OS for processing. + * @vio: The vio associated with the bio. + * @bio: The bio to submit to the OS. + */ +static void send_bio_to_device(struct vio *vio, + struct bio *bio) +{ + struct vdo *vdo = vdo_from_vio(vio); + + assert_in_bio_zone(vio); + atomic64_inc(&vdo->stats.bios_submitted); + count_all_bios(vio, bio); + bio_set_dev(bio, vdo_get_backing_device(vdo)); + submit_bio_noacct(bio); +} + +static sector_t get_bio_sector(struct bio *bio) +{ + return bio->bi_iter.bi_sector; +} + +/** + * process_vio_io() - Submits a vio's bio to the underlying block device. May + * block if the device is busy. This callback should be used + * by vios which did not attempt to merge. + **/ +void process_vio_io(struct vdo_completion *completion) +{ + struct vio *vio = as_vio(completion); + + send_bio_to_device(vio, vio->bio); +} + +/** + * get_bio_list() - Extract the list of bios to submit from a vio. + * @vio: The vio submitting I/O. + * + * The list will always contain at least one entry (the bio for the + * vio on which it is called), but other bios may have been merged + * with it as well. + * + * Return: bio The head of the bio list to submit. 
+ */ +static struct bio *get_bio_list(struct vio *vio) +{ + struct bio *bio; + struct io_submitter *submitter = vdo_from_vio(vio)->io_submitter; + struct bio_queue_data *bio_queue_data + = &(submitter->bio_queue_data[vio->bio_zone]); + + assert_in_bio_zone(vio); + + mutex_lock(&bio_queue_data->lock); + int_map_remove(bio_queue_data->map, + get_bio_sector(vio->bios_merged.head)); + int_map_remove(bio_queue_data->map, + get_bio_sector(vio->bios_merged.tail)); + bio = vio->bios_merged.head; + bio_list_init(&vio->bios_merged); + mutex_unlock(&bio_queue_data->lock); + + return bio; +} + +/** + * process_data_vio_io() - Submit a data_vio's bio to the storage below + * along with any bios that have been merged with it. + * + * Context: This call may block and so should only be called from a + * bio thread. + */ +static void process_data_vio_io(struct vdo_completion *completion) +{ + struct bio *bio, *next; + struct vio *vio = as_vio(completion); + + assert_in_bio_zone(vio); + for (bio = get_bio_list(vio); bio != NULL; bio = next) { + next = bio->bi_next; + bio->bi_next = NULL; + send_bio_to_device((struct vio *) bio->bi_private, bio); + } +} + +/** + * get_mergeable_locked() - Attempt to find an already queued bio that the + * current bio can be merged with. + * @map: The bio map to use for merging. + * @vio: The vio we want to merge. + * @back_merge: Set to true for a back merge, false for a front merge. + * + * There are two types of merging possible, forward and backward, + * which are distinguished by a flag that uses kernel elevator + * terminology. + * + * Return: the vio to merge to, NULL if no merging is possible. + */ +static struct vio *get_mergeable_locked(struct int_map *map, + struct vio *vio, + bool back_merge) +{ + struct bio *bio = vio->bio; + sector_t merge_sector = get_bio_sector(bio); + struct vio *vio_merge; + + if (back_merge) { + merge_sector -= VDO_SECTORS_PER_BLOCK; + } else { + merge_sector += VDO_SECTORS_PER_BLOCK; + } + + vio_merge = int_map_get(map, merge_sector); + + if (vio_merge == NULL) { + return NULL; + } + + if (vio_as_completion(vio)->priority + != vio_as_completion(vio_merge)->priority) { + return NULL; + } + + if (bio_data_dir(bio) != bio_data_dir(vio_merge->bio)) { + return NULL; + } + + if (bio_list_empty(&vio_merge->bios_merged)) { + return NULL; + } + + if (back_merge) { + return ((get_bio_sector(vio_merge->bios_merged.tail) == + merge_sector) ? vio_merge : NULL); + } + + return ((get_bio_sector(vio_merge->bios_merged.head) == + merge_sector) ? vio_merge : NULL); +} + +static int merge_to_prev_tail(struct int_map *bio_map, + struct vio *vio, + struct vio *prev_vio) +{ + int result; + + int_map_remove(bio_map, get_bio_sector(prev_vio->bios_merged.tail)); + bio_list_merge(&prev_vio->bios_merged, &vio->bios_merged); + result = int_map_put(bio_map, + get_bio_sector(prev_vio->bios_merged.head), + prev_vio, true, NULL); + result = int_map_put(bio_map, + get_bio_sector(prev_vio->bios_merged.tail), + prev_vio, true, NULL); + return result; +} + +static int merge_to_next_head(struct int_map *bio_map, + struct vio *vio, + struct vio *next_vio) +{ + int result; + + /* + * Handle "next merge" and "gap fill" cases the same way so as to + * reorder bios in a way that's compatible with using funnel queues + * in work queues. This avoids removing an existing completion. 
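As a concrete illustration (assuming 512-byte sectors, so VDO_SECTORS_PER_BLOCK is 8): a data bio starting at sector 800 can back-merge with a pending vio whose bios_merged tail is the bio at sector 792, the block immediately preceding it, via merge_to_prev_tail(); it can front-merge with a pending vio whose merged list begins at sector 808 via merge_to_next_head(). In both cases the int_map keys for the resulting list's head and tail sectors are re-registered so that later bios can extend the same chain.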
+ */ + int_map_remove(bio_map, get_bio_sector(next_vio->bios_merged.head)); + bio_list_merge_head(&next_vio->bios_merged, &vio->bios_merged); + result = int_map_put(bio_map, + get_bio_sector(next_vio->bios_merged.head), + next_vio, true, NULL); + result = int_map_put(bio_map, + get_bio_sector(next_vio->bios_merged.tail), + next_vio, true, NULL); + return result; +} + +/** + * try_bio_map_merge() - Attempt to merge a vio's bio with other pending I/Os. + * @vio: The vio to merge. + * + * Currently this is only used for data_vios, but is broken out for + * future use with metadata vios. + * + * Return: whether or not the vio was merged. + */ +static bool try_bio_map_merge(struct vio *vio) +{ + int result; + bool merged = true; + struct bio *bio = vio->bio; + struct vio *prev_vio, *next_vio; + struct vdo *vdo = vdo_from_vio(vio); + struct bio_queue_data *bio_queue_data + = &vdo->io_submitter->bio_queue_data[vio->bio_zone]; + + bio->bi_next = NULL; + bio_list_init(&vio->bios_merged); + bio_list_add(&vio->bios_merged, bio); + + mutex_lock(&bio_queue_data->lock); + prev_vio = get_mergeable_locked(bio_queue_data->map, vio, true); + next_vio = get_mergeable_locked(bio_queue_data->map, vio, false); + if (prev_vio == next_vio) { + next_vio = NULL; + } + + if ((prev_vio == NULL) && (next_vio == NULL)) { + /* no merge. just add to bio_queue */ + merged = false; + result = int_map_put(bio_queue_data->map, get_bio_sector(bio), + vio, true, NULL); + } else if (next_vio == NULL) { + /* Only prev. merge to prev's tail */ + result = merge_to_prev_tail(bio_queue_data->map, + vio, + prev_vio); + } else { + /* Only next. merge to next's head */ + result = merge_to_next_head(bio_queue_data->map, + vio, + next_vio); + } + + mutex_unlock(&bio_queue_data->lock); + + /* We don't care about failure of int_map_put in this case. */ + ASSERT_LOG_ONLY(result == UDS_SUCCESS, "bio map insertion succeeds"); + return merged; +} + +/** + * submit_data_vio_io() - Submit I/O for a data_vio. + * @data_vio: the data_vio for which to issue I/O. + * + * If possible, this I/O will be merged other pending I/Os. Otherwise, + * the data_vio will be sent to the appropriate bio zone directly. + */ +void submit_data_vio_io(struct data_vio *data_vio) +{ + if (try_bio_map_merge(data_vio_as_vio(data_vio))) { + return; + } + + launch_data_vio_bio_zone_callback(data_vio, + process_data_vio_io); + +} + +/** + * vdo_submit_metadata_io() - Submit I/O for a metadata vio. + * + * The vio is enqueued on a vdo bio queue so that bio submission (which may + * block) does not block other vdo threads. + * + * That the error handler will run on the correct thread is only true so long + * as the thread calling this function, and the thread set in the endio + * callback are the same, as well as the fact that no error can occur on the + * bio queue. Currently this is true for all callers, but additional care will + * be needed if this ever changes. 
+ + * @vio: the vio for which to issue I/O + * @physical: the physical block number to read or write + * @callback: the bio endio function which will be called after the I/O + * completes + * @error_handler: the handler for submission or I/O errors (may be NULL) + * @operation: the type of I/O to perform + * @data: the buffer to read or write (may be NULL) + **/ +void vdo_submit_metadata_io(struct vio *vio, + physical_block_number_t physical, + bio_end_io_t callback, + vdo_action *error_handler, + unsigned int operation, + char *data) +{ + struct vdo_completion *completion = vio_as_completion(vio); + int result; + const struct admin_state_code *code = + vdo_get_admin_state(completion->vdo); + + + ASSERT_LOG_ONLY(!code->quiescent, + "I/O not allowed in state %s", + code->name); + ASSERT_LOG_ONLY(vio->bio->bi_next == NULL, + "metadata bio has no next bio"); + + vdo_reset_completion(completion); + completion->error_handler = error_handler; + + set_vio_physical(vio, physical); + result = vdo_reset_bio_with_buffer(vio->bio, + data, + vio, + callback, + operation | REQ_META, + vio->physical); + if (result != VDO_SUCCESS) { + continue_vio(vio, result); + return; + } + + vdo_set_completion_callback(completion, + process_vio_io, + get_vio_bio_zone_thread_id(vio)); + vdo_invoke_completion_callback_with_priority(completion, + get_metadata_priority(vio)); +} + +/** + * vdo_make_io_submitter() - Create an io_submitter structure. + * + * @thread_count: Number of bio-submission threads to set up. + * @rotation_interval: Interval to use when rotating between bio-submission + * threads when enqueuing completions. + * @max_requests_active: Number of bios for merge tracking. + * @vdo: The vdo which will use this submitter. + * @io_submitter: , pointer to the new data structure. + * + * Return: VDO_SUCCESS or an error. + */ +int vdo_make_io_submitter(unsigned int thread_count, + unsigned int rotation_interval, + unsigned int max_requests_active, + struct vdo *vdo, + struct io_submitter **io_submitter_ptr) +{ + unsigned int i; + struct io_submitter *io_submitter; + int result = UDS_ALLOCATE_EXTENDED(struct io_submitter, + thread_count, + struct bio_queue_data, + "bio submission data", + &io_submitter); + if (result != UDS_SUCCESS) { + return result; + } + + + io_submitter->bio_queue_rotation_interval = rotation_interval; + + /* Setup for each bio-submission work queue */ + for (i = 0; i < thread_count; i++) { + struct bio_queue_data *bio_queue_data = + &io_submitter->bio_queue_data[i]; + + mutex_init(&bio_queue_data->lock); + /* + * One I/O operation per request, but both first & + * last sector numbers. + * + * If requests are assigned to threads round-robin, + * they should be distributed quite evenly. But if + * they're assigned based on PBN, things can sometimes + * be very uneven. So for now, we'll assume that all + * requests *may* wind up on one thread, and thus all + * in the same map. + */ + result = make_int_map(max_requests_active * 2, 0, + &bio_queue_data->map); + if (result != 0) { + /* + * Clean up the partially initialized bio-queue + * entirely and indicate that initialization failed. 
+ */ + uds_log_error("bio map initialization failed %d", + result); + vdo_cleanup_io_submitter(io_submitter); + vdo_free_io_submitter(io_submitter); + return result; + } + + bio_queue_data->queue_number = i; + result = vdo_make_thread(vdo, + vdo->thread_config->bio_threads[i], + &bio_queue_type, + 1, + (void **) &bio_queue_data); + if (result != VDO_SUCCESS) { + /* + * Clean up the partially initialized bio-queue + * entirely and indicate that initialization failed. + */ + free_int_map(UDS_FORGET(bio_queue_data->map)); + uds_log_error("bio queue initialization failed %d", + result); + vdo_cleanup_io_submitter(io_submitter); + vdo_free_io_submitter(io_submitter); + return result; + } + + bio_queue_data->queue + = vdo->threads[vdo->thread_config->bio_threads[i]].queue; + io_submitter->num_bio_queues_used++; + } + + *io_submitter_ptr = io_submitter; + + return VDO_SUCCESS; +} + +/** + * vdo_cleanup_io_submitter() - Tear down the io_submitter fields as needed + * for a physical layer. + * @io_submitter: The I/O submitter data to tear down (may be NULL). + */ +void vdo_cleanup_io_submitter(struct io_submitter *io_submitter) +{ + int i; + + if (io_submitter == NULL) { + return; + } + + for (i = io_submitter->num_bio_queues_used - 1; i >= 0; i--) { + finish_work_queue(io_submitter->bio_queue_data[i].queue); + } +} + +/** + * vdo_free_io_submitter() - Free the io_submitter fields and structure as + * needed. + * @io_submitter: The I/O submitter data to destroy. + * + * This must be called after vdo_cleanup_io_submitter(). It is used to + * release resources late in the shutdown process to avoid or reduce + * the chance of race conditions. + */ +void vdo_free_io_submitter(struct io_submitter *io_submitter) +{ + int i; + + if (io_submitter == NULL) { + return; + } + + for (i = io_submitter->num_bio_queues_used - 1; i >= 0; i--) { + io_submitter->num_bio_queues_used--; + /* + * vdo_destroy() will free the work queue, so just give up our + * reference to it. + */ + UDS_FORGET(io_submitter->bio_queue_data[i].queue); + free_int_map(UDS_FORGET(io_submitter->bio_queue_data[i].map)); + } + UDS_FREE(io_submitter); +} diff --git a/vdo/io-submitter.h b/vdo/io-submitter.h new file mode 100644 index 00000000..726f61ce --- /dev/null +++ b/vdo/io-submitter.h @@ -0,0 +1,63 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright Red Hat + */ + +#ifndef IO_SUBMITTER_H +#define IO_SUBMITTER_H + +#include + +#include "completion.h" +#include "kernel-types.h" +#include "vio.h" + +int vdo_make_io_submitter(unsigned int thread_count, + unsigned int rotation_interval, + unsigned int max_requests_active, + struct vdo *vdo, + struct io_submitter **io_submitter); + +void vdo_cleanup_io_submitter(struct io_submitter *io_submitter); + +void vdo_free_io_submitter(struct io_submitter *io_submitter); + +void process_vio_io(struct vdo_completion *completion); + +void submit_data_vio_io(struct data_vio *data_vio); + +void vdo_submit_metadata_io(struct vio *vio, + physical_block_number_t physical, + bio_end_io_t callback, + vdo_action *error_handler, + unsigned int operation, + char *data); + +static inline void submit_metadata_vio(struct vio *vio, + physical_block_number_t physical, + bio_end_io_t callback, + vdo_action *error_handler, + unsigned int operation) +{ + vdo_submit_metadata_io(vio, + physical, + callback, + error_handler, + operation, + vio->data); +} + +static inline void submit_flush_vio(struct vio *vio, + bio_end_io_t callback, + vdo_action *error_handler) +{ + /* FIXME: Can we just use REQ_OP_FLUSH? 
*/ + vdo_submit_metadata_io(vio, + 0, + callback, + error_handler, + REQ_OP_WRITE | REQ_PREFLUSH, + NULL); +} + +#endif /* IO_SUBMITTER_H */ diff --git a/vdo/ioSubmitter.c b/vdo/ioSubmitter.c deleted file mode 100644 index 5567a74e..00000000 --- a/vdo/ioSubmitter.c +++ /dev/null @@ -1,634 +0,0 @@ -/* - * Copyright Red Hat - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/sulfur/src/c++/vdo/kernel/ioSubmitter.c#36 $ - */ - -#include "ioSubmitter.h" - -#include - -#include "memoryAlloc.h" -#include "permassert.h" - -#include "atomicStats.h" -#include "bio.h" -#include "dataKVIO.h" -#include "logger.h" -#include "vdoInternal.h" - -/* - * Submission of bio operations to the underlying storage device will - * go through a separate work queue thread (or more than one) to - * prevent blocking in other threads if the storage device has a full - * queue. The plug structure allows that thread to do better batching - * of requests to make the I/O more efficient. - * - * When multiple worker threads are used, a thread is chosen for a - * I/O operation submission based on the PBN, so a given PBN will - * consistently wind up on the same thread. Flush operations are - * assigned round-robin. - * - * The map (protected by the mutex) collects pending I/O operations so - * that the worker thread can reorder them to try to encourage I/O - * request merging in the request queue underneath. 
- */ -struct bio_queue_data { - struct vdo_work_queue *queue; - struct blk_plug plug; - struct int_map *map; - struct mutex lock; - unsigned int queue_number; -}; - -struct io_submitter { - unsigned int num_bio_queues_used; - unsigned int bio_queue_rotation_interval; - unsigned int bio_queue_rotor; - struct bio_queue_data bio_queue_data[]; -}; - -/**********************************************************************/ -static void start_bio_queue(void *ptr) -{ - struct bio_queue_data *bio_queue_data = (struct bio_queue_data *) ptr; - - blk_start_plug(&bio_queue_data->plug); -} - -/**********************************************************************/ -static void finish_bio_queue(void *ptr) -{ - struct bio_queue_data *bio_queue_data = (struct bio_queue_data *) ptr; - - blk_finish_plug(&bio_queue_data->plug); -} - -static const struct vdo_work_queue_type bio_queue_type = { - .start = start_bio_queue, - .finish = finish_bio_queue, - .action_table = { - - { .name = "bio_compressed_data", - .code = BIO_Q_ACTION_COMPRESSED_DATA, - .priority = 0 }, - { .name = "bio_data", - .code = BIO_Q_ACTION_DATA, - .priority = 0 }, - { .name = "bio_flush", - .code = BIO_Q_ACTION_FLUSH, - .priority = 2 }, - { .name = "bio_high", - .code = BIO_Q_ACTION_HIGH, - .priority = 2 }, - { .name = "bio_metadata", - .code = BIO_Q_ACTION_METADATA, - .priority = 1 }, - { .name = "bio_verify", - .code = BIO_Q_ACTION_VERIFY, - .priority = 1 }, - }, -}; - -/** - * Check that we're running normally (i.e., not in an - * interrupt-servicing context) in an io_submitter bio thread. - **/ -static void assert_running_in_bio_queue(void) -{ - ASSERT_LOG_ONLY(!in_interrupt(), "not in interrupt context"); - ASSERT_LOG_ONLY(strnstr(current->comm, "bioQ", TASK_COMM_LEN) != NULL, - "running in bio submission work queue thread"); -} - -/** - * Returns the bio_queue_data pointer associated with the current thread. - * Results are undefined if called from any other thread. - * - * @return the bio_queue_data pointer - **/ -static inline struct bio_queue_data *get_current_bio_queue_data(void) -{ - struct bio_queue_data *bio_queue_data = - (struct bio_queue_data *) get_work_queue_private_data(); - // Does it look like a bio queue thread? - BUG_ON(bio_queue_data == NULL); - BUG_ON(bio_queue_data->queue != get_current_work_queue()); - return bio_queue_data; -} - -/**********************************************************************/ -static inline struct io_submitter * -bio_queue_to_submitter(struct bio_queue_data *bio_queue) -{ - struct bio_queue_data *first_bio_queue = bio_queue - - bio_queue->queue_number; - struct io_submitter *submitter = container_of(first_bio_queue, - struct io_submitter, - bio_queue_data[0]); - return submitter; -} - -/** - * Return the bio thread number handling the specified physical block - * number. -* - * @param io_submitter The I/O submitter data - * @param pbn The physical block number - * - * @return read cache zone number - **/ -static unsigned int bio_queue_number_for_pbn(struct io_submitter *io_submitter, - physical_block_number_t pbn) -{ - unsigned int bio_queue_index = - ((pbn % (io_submitter->num_bio_queues_used * - io_submitter->bio_queue_rotation_interval)) / - io_submitter->bio_queue_rotation_interval); - - return bio_queue_index; -} - -/** - * Check that we're running normally (i.e., not in an - * interrupt-servicing context) in an io_submitter bio thread. Also - * require that the thread we're running on is the correct one for the - * supplied physical block number. 
- * - * @param pbn The PBN that should have been used in thread selection - **/ -static void assert_running_in_bio_queue_for_pbn(physical_block_number_t pbn) -{ - struct bio_queue_data *this_queue; - struct io_submitter *submitter; - unsigned int computed_queue_number; - - assert_running_in_bio_queue(); - - this_queue = get_current_bio_queue_data(); - submitter = bio_queue_to_submitter(this_queue); - computed_queue_number = bio_queue_number_for_pbn(submitter, pbn); - ASSERT_LOG_ONLY(this_queue->queue_number == computed_queue_number, - "running in correct bio queue (%u vs %u) for PBN %llu", - this_queue->queue_number, - computed_queue_number, - pbn); -} - -/** - * Determines which bio counter to use - * - * @param vio the vio associated with the bio - * @param bio the bio to count - */ -static void count_all_bios(struct vio *vio, struct bio *bio) -{ - struct atomic_statistics *stats = &vio->vdo->stats; - if (is_data_vio(vio)) { - vdo_count_bios(&stats->bios_out, bio); - return; - } - - vdo_count_bios(&stats->bios_meta, bio); - if (vio->type == VIO_TYPE_RECOVERY_JOURNAL) { - vdo_count_bios(&stats->bios_journal, bio); - } else if (vio->type == VIO_TYPE_BLOCK_MAP) { - vdo_count_bios(&stats->bios_page_cache, bio); - } -} - -/** - * Update stats and tracing info, then submit the supplied bio to the - * OS for processing. - * - * @param vio The vio associated with the bio - * @param bio The bio to submit to the OS - **/ -static void send_bio_to_device(struct vio *vio, - struct bio *bio) -{ - assert_running_in_bio_queue_for_pbn(vio->physical); - atomic64_inc(&vio->vdo->stats.bios_submitted); - count_all_bios(vio, bio); - - bio_set_dev(bio, get_vdo_backing_device(vio->vdo)); -#if LINUX_VERSION_CODE < KERNEL_VERSION(5,9,0) - generic_make_request(bio); -#else - submit_bio_noacct(bio); -#endif -} - -/**********************************************************************/ -static sector_t get_bio_sector(struct bio *bio) -{ - return bio->bi_iter.bi_sector; -} - -/** - * Submits a bio to the underlying block device. May block if the - * device is busy. - * - * For normal data, vio->bios_merged is the list of all bios collected - * together in this group; all of them get submitted. - * - * @param item The work item in the vio "owning" the head of the - * bio_list to be submitted. - **/ -static void process_bio_map(struct vdo_work_item *item) -{ - struct vio *vio = work_item_as_vio(item); - assert_running_in_bio_queue(); - // XXX Should we call finish_bio_queue for the biomap case on old - // kernels? - if (is_data_vio(vio)) { - // We need to make sure to do two things here: - // 1. Use each bio's vio when submitting. Any other vio is - // not safe - // 2. Detach the bio list from the vio before submitting, - // because it could get reused/free'd up before all bios - // are submitted. - struct bio_queue_data *bio_queue_data = - get_work_queue_private_data(); - struct bio *bio = NULL; - - mutex_lock(&bio_queue_data->lock); - if (!bio_list_empty(&vio->bios_merged)) { - int_map_remove(bio_queue_data->map, - get_bio_sector(vio->bios_merged.head)); - int_map_remove(bio_queue_data->map, - get_bio_sector(vio->bios_merged.tail)); - } - - bio = vio->bios_merged.head; - bio_list_init(&vio->bios_merged); - mutex_unlock(&bio_queue_data->lock); - // Somewhere in the list we'll be submitting the current - // vio, so drop our handle on it now. 
- vio = NULL; - - while (bio != NULL) { - struct vio *vio_bio = bio->bi_private; - struct bio *next = bio->bi_next; - - bio->bi_next = NULL; - send_bio_to_device(vio_bio, - bio); - bio = next; - } - } else { - send_bio_to_device(vio, - vio->bio); - } -} - -/** - * This function will attempt to find an already queued bio that the current - * bio can be merged with. There are two types of merging possible, forward - * and backward, which are distinguished by a flag that uses kernel - * elevator terminology. - * - * @param map The bio map to use for merging - * @param vio The vio we want to merge - * @param back_merge Set to true for a back merge, false for a front merge - * - * @return the vio to merge to, NULL if no merging is possible - */ -static struct vio *get_mergeable_locked(struct int_map *map, - struct vio *vio, - bool back_merge) -{ - struct bio *bio = vio->bio; - sector_t merge_sector = get_bio_sector(bio); - struct vio *vio_merge; - - if (back_merge) { - merge_sector -= VDO_SECTORS_PER_BLOCK; - } else { - merge_sector += VDO_SECTORS_PER_BLOCK; - } - - vio_merge = int_map_get(map, merge_sector); - - if (vio_merge == NULL) { - return NULL; - } - - if (!are_work_item_actions_equal(work_item_from_vio(vio), - work_item_from_vio(vio_merge))) { - return NULL; - } - - if (bio_data_dir(bio) != bio_data_dir(vio_merge->bio)) { - return NULL; - } - - if (bio_list_empty(&vio_merge->bios_merged)) { - return NULL; - } - - if (back_merge) { - if (get_bio_sector(vio_merge->bios_merged.tail) != - merge_sector) { - return NULL; - } - } else if (get_bio_sector(vio_merge->bios_merged.head) != - merge_sector) { - return NULL; - } - - return vio_merge; -} - -/**********************************************************************/ -static inline unsigned int advance_bio_rotor(struct io_submitter *bio_data) -{ - unsigned int index = bio_data->bio_queue_rotor++ % - (bio_data->num_bio_queues_used * - bio_data->bio_queue_rotation_interval); - index /= bio_data->bio_queue_rotation_interval; - return index; -} - -/**********************************************************************/ -static int merge_to_prev_tail(struct int_map *bio_map, - struct vio *vio, - struct vio *prev_vio) -{ - int result; - int_map_remove(bio_map, get_bio_sector(prev_vio->bios_merged.tail)); - bio_list_merge(&prev_vio->bios_merged, &vio->bios_merged); - result = int_map_put(bio_map, - get_bio_sector(prev_vio->bios_merged.head), - prev_vio, true, NULL); - result = int_map_put(bio_map, - get_bio_sector(prev_vio->bios_merged.tail), - prev_vio, true, NULL); - return result; -} - -/**********************************************************************/ -static int merge_to_next_head(struct int_map *bio_map, - struct vio *vio, - struct vio *next_vio) -{ - int result; - - // Handle "next merge" and "gap fill" cases the same way so as to - // reorder bios in a way that's compatible with using funnel queues - // in work queues. This avoids removing an existing work item. 
- int_map_remove(bio_map, get_bio_sector(next_vio->bios_merged.head)); - bio_list_merge_head(&next_vio->bios_merged, &vio->bios_merged); - result = int_map_put(bio_map, - get_bio_sector(next_vio->bios_merged.head), - next_vio, true, NULL); - result = int_map_put(bio_map, - get_bio_sector(next_vio->bios_merged.tail), - next_vio, true, NULL); - return result; -} - -/**********************************************************************/ -static bool try_bio_map_merge(struct bio_queue_data *bio_queue_data, - struct vio *vio, - struct bio *bio) -{ - int result; - bool merged = false; - struct vio *prev_vio, *next_vio; - - mutex_lock(&bio_queue_data->lock); - prev_vio = get_mergeable_locked(bio_queue_data->map, vio, true); - next_vio = get_mergeable_locked(bio_queue_data->map, vio, false); - if (prev_vio == next_vio) { - next_vio = NULL; - } - - if ((prev_vio == NULL) && (next_vio == NULL)) { - // no merge. just add to bio_queue - result = int_map_put(bio_queue_data->map, get_bio_sector(bio), - vio, true, NULL); - // We don't care about failure of int_map_put in this case. - result = result; - mutex_unlock(&bio_queue_data->lock); - } else { - if (next_vio == NULL) { - // Only prev. merge to prev's tail - result = merge_to_prev_tail(bio_queue_data->map, - vio, prev_vio); - } else { - // Only next. merge to next's head - result = merge_to_next_head(bio_queue_data->map, - vio, next_vio); - } - - // We don't care about failure of int_map_put in this case. - result = result; - mutex_unlock(&bio_queue_data->lock); - merged = true; - } - - return merged; -} - -/**********************************************************************/ -static struct bio_queue_data * -bio_queue_data_for_pbn(struct io_submitter *io_submitter, - physical_block_number_t pbn) -{ - unsigned int bio_queue_index = - bio_queue_number_for_pbn(io_submitter, pbn); - return &io_submitter->bio_queue_data[bio_queue_index]; -} - -/**********************************************************************/ -void vdo_submit_bio(struct bio *bio, enum bio_q_action action) -{ - struct vio *vio = bio->bi_private; - struct bio_queue_data *bio_queue_data = - bio_queue_data_for_pbn(vio->vdo->io_submitter, vio->physical); - bool merged = false; - - setup_vio_work(vio, process_bio_map, bio->bi_end_io, action); - - bio->bi_next = NULL; - bio_list_init(&vio->bios_merged); - bio_list_add(&vio->bios_merged, bio); - - /* - * Try to use the bio map to submit this bio earlier if we're already - * sending IO for an adjacent block. If we can't use an existing - * pending bio, enqueue an operation to run in a bio submission thread - * appropriate to the indicated physical block number. 
- */ - - if (is_data_vio(vio)) { - merged = try_bio_map_merge(bio_queue_data, vio, bio); - } - if (!merged) { - enqueue_vio_work(bio_queue_data->queue, vio); - } -} - -/**********************************************************************/ -static int initialize_bio_queue(struct bio_queue_data *bio_queue_data, - const char *thread_name_prefix, - const char *queue_name, - unsigned int queue_number, - struct vdo *vdo) -{ - bio_queue_data->queue_number = queue_number; - - return make_work_queue(thread_name_prefix, - queue_name, - vdo, - bio_queue_data, - &bio_queue_type, - 1, - NULL, - &bio_queue_data->queue); -} - -/**********************************************************************/ -int make_vdo_io_submitter(const char *thread_name_prefix, - unsigned int thread_count, - unsigned int rotation_interval, - unsigned int max_requests_active, - struct vdo *vdo, - struct io_submitter **io_submitter_ptr) -{ - char queue_name[MAX_QUEUE_NAME_LEN]; - unsigned int i; - struct io_submitter *io_submitter; - int result = UDS_ALLOCATE_EXTENDED(struct io_submitter, - thread_count, - struct bio_queue_data, - "bio submission data", - &io_submitter); - if (result != UDS_SUCCESS) { - return result; - } - - - io_submitter->bio_queue_rotation_interval = rotation_interval; - - // Setup for each bio-submission work queue - for (i = 0; i < thread_count; i++) { - struct bio_queue_data *bio_queue_data = - &io_submitter->bio_queue_data[i]; - snprintf(queue_name, sizeof(queue_name), "bioQ%u", i); - - mutex_init(&bio_queue_data->lock); - /* - * One I/O operation per request, but both first & - * last sector numbers. - * - * If requests are assigned to threads round-robin, - * they should be distributed quite evenly. But if - * they're assigned based on PBN, things can sometimes - * be very uneven. So for now, we'll assume that all - * requests *may* wind up on one thread, and thus all - * in the same map. - */ - result = make_int_map(max_requests_active * 2, 0, - &bio_queue_data->map); - if (result != 0) { - // Clean up the partially initialized bio-queue - // entirely and indicate that initialization failed. - uds_log_error("bio map initialization failed %d", - result); - cleanup_vdo_io_submitter(io_submitter); - free_vdo_io_submitter(io_submitter); - return result; - } - - result = initialize_bio_queue(bio_queue_data, - thread_name_prefix, - queue_name, - i, - vdo); - if (result != VDO_SUCCESS) { - // Clean up the partially initialized bio-queue - // entirely and indicate that initialization failed. 
- free_int_map(UDS_FORGET(bio_queue_data->map)); - uds_log_error("bio queue initialization failed %d", - result); - cleanup_vdo_io_submitter(io_submitter); - free_vdo_io_submitter(io_submitter); - return result; - } - - io_submitter->num_bio_queues_used++; - } - - *io_submitter_ptr = io_submitter; - - return VDO_SUCCESS; -} - -/**********************************************************************/ -void cleanup_vdo_io_submitter(struct io_submitter *io_submitter) -{ - int i; - - if (io_submitter == NULL) { - return; - } - - for (i = io_submitter->num_bio_queues_used - 1; i >= 0; i--) { - finish_work_queue(io_submitter->bio_queue_data[i].queue); - } -} - -/**********************************************************************/ -void free_vdo_io_submitter(struct io_submitter *io_submitter) -{ - int i; - - if (io_submitter == NULL) { - return; - } - - for (i = io_submitter->num_bio_queues_used - 1; i >= 0; i--) { - io_submitter->num_bio_queues_used--; - free_work_queue(UDS_FORGET(io_submitter->bio_queue_data[i].queue)); - free_int_map(UDS_FORGET(io_submitter->bio_queue_data[i].map)); - } - UDS_FREE(io_submitter); -} - -/**********************************************************************/ -void vdo_dump_bio_work_queue(struct io_submitter *io_submitter) -{ - int i; - - for (i = 0; i < io_submitter->num_bio_queues_used; i++) { - dump_work_queue(io_submitter->bio_queue_data[i].queue); - } -} - - -/**********************************************************************/ -void vdo_enqueue_bio_work_item(struct io_submitter *io_submitter, - struct vdo_work_item *work_item) -{ - unsigned int bio_queue_index = advance_bio_rotor(io_submitter); - - enqueue_work_queue(io_submitter->bio_queue_data[bio_queue_index].queue, - work_item); -} diff --git a/vdo/ioSubmitter.h b/vdo/ioSubmitter.h deleted file mode 100644 index b3be9d40..00000000 --- a/vdo/ioSubmitter.h +++ /dev/null @@ -1,107 +0,0 @@ -/* - * Copyright Red Hat - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/sulfur/src/c++/vdo/kernel/ioSubmitter.h#14 $ - */ - -#ifndef IOSUBMITTER_H -#define IOSUBMITTER_H - -#include "kernelLayer.h" -#include "kvio.h" - -/** - * Create an io_submitter structure. 
- * - * @param [in] thread_name_prefix The per-device prefix to use in process - * names - * @param [in] thread_count Number of bio-submission threads to set up - * @param [in] rotation_interval Interval to use when rotating between - * bio-submission threads when enqueuing work - * items - * @param [in] max_requests_active Number of bios for merge tracking - * @param [in] vdo The vdo which will use this submitter - * @param [out] io_submitter Pointer to the new data structure - * - * @return VDO_SUCCESS or an error - **/ -int make_vdo_io_submitter(const char *thread_name_prefix, - unsigned int thread_count, - unsigned int rotation_interval, - unsigned int max_requests_active, - struct vdo *vdo, - struct io_submitter **io_submitter); - -/** - * Tear down the io_submitter fields as needed for a physical layer. - * - * @param [in] io_submitter The I/O submitter data to tear down (may be NULL) - **/ -void cleanup_vdo_io_submitter(struct io_submitter *io_submitter); - -/** - * Free the io_submitter fields and structure as needed for a - * physical layer. This must be called after - * cleanup_vdo_io_submitter(). It is used to release resources late in - * the shutdown process to avoid or reduce the chance of race - * conditions. - * - * @param [in] io_submitter The I/O submitter data to destroy - **/ -void free_vdo_io_submitter(struct io_submitter *io_submitter); - -/** - * Dump info to the kernel log about the work queue used by the - * physical layer. For debugging only. - * - * @param [in] io_submitter The I/O submitter data - **/ -void vdo_dump_bio_work_queue(struct io_submitter *io_submitter); - - -/** - * Enqueue a work item to run in the work queue(s) used for bio - * submissions from the physical layer. - * - * Outside of io_submitter, used only for finishing processing of empty - * flush bios by sending them to the storage device. - * - * @param io_submitter The I/O submitter data to update - * @param work_item The new work item to run - **/ -void vdo_enqueue_bio_work_item(struct io_submitter *io_submitter, - struct vdo_work_item *work_item); - -/** - * Submit bio but don't block. - * - * Submits the bio to a helper work queue which sits in a loop - * submitting bios. The worker thread may block if the target device - * is busy, which is why we don't want to do the submission in the - * original calling thread. - * - * The bi_private field of the bio must point to a vio associated - * with the operation. The bi_end_io callback is invoked when the I/O - * operation completes. - * - * @param bio the block I/O operation descriptor to submit - * @param action the action code specifying the priority for the operation - **/ -void vdo_submit_bio(struct bio *bio, enum bio_q_action action); - -#endif // IOSUBMITTER_H diff --git a/vdo/journal-point.h b/vdo/journal-point.h new file mode 100644 index 00000000..7bffaa3f --- /dev/null +++ b/vdo/journal-point.h @@ -0,0 +1,128 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright Red Hat + */ + +#ifndef JOURNAL_POINT_H +#define JOURNAL_POINT_H + +#include "numeric.h" +#include "types.h" + +typedef uint16_t journal_entry_count_t; + +/* + * The absolute position of an entry in a recovery journal or slab journal. + */ +struct journal_point { + sequence_number_t sequence_number; + journal_entry_count_t entry_count; +}; + +/* + * A packed, platform-independent encoding of a struct journal_point. 
+ */ +struct packed_journal_point { + /* + * The packed representation is the little-endian 64-bit representation + * of the low-order 48 bits of the sequence number, shifted up 16 bits, + * or'ed with the 16-bit entry count. + * + * Very long-term, the top 16 bits of the sequence number may not always + * be zero, as this encoding assumes--see BZ 1523240. + */ + __le64 encoded_point; +} __packed; + +/** + * vdo_advance_journal_point() - Move the given journal point forward by one + * entry. + * @point: The journal point to adjust. + * @entries_per_block: The number of entries in one full block. + */ +static inline void +vdo_advance_journal_point(struct journal_point *point, + journal_entry_count_t entries_per_block) +{ + point->entry_count++; + if (point->entry_count == entries_per_block) { + point->sequence_number++; + point->entry_count = 0; + } +} + +/** + * vdo_is_valid_journal_point() - Check whether a journal point is valid. + * @point: The journal point. + * + * Return: true if the journal point is valid. + */ +static inline bool +vdo_is_valid_journal_point(const struct journal_point *point) +{ + return ((point != NULL) && (point->sequence_number > 0)); +} + +/** + * vdo_before_journal_point() - Check whether the first point precedes the + * second point. + * @first: The first journal point. + * @second: The second journal point. + * + * Return: true if the first point precedes the second point. + */ +static inline bool vdo_before_journal_point(const struct journal_point *first, + const struct journal_point *second) +{ + return ((first->sequence_number < second->sequence_number) || + ((first->sequence_number == second->sequence_number) && + (first->entry_count < second->entry_count))); +} + +/** + * vdo_are_equivalent_journal_points() - Check whether the first point is the + * same as the second point. + * @first: The first journal point. + * @second: The second journal point. + * + * Return: true if both points reference the same logical position of an entry + * in the journal. + */ +static inline bool +vdo_are_equivalent_journal_points(const struct journal_point *first, + const struct journal_point *second) +{ + return ((first->sequence_number == second->sequence_number) && + (first->entry_count == second->entry_count)); +} + +/** + * vdo_pack_journal_point() - Encode the journal location represented by a + * journal_point into a packed_journal_point. + * @unpacked: The unpacked input point. + * @packed: The packed output point. + */ +static inline void vdo_pack_journal_point(const struct journal_point *unpacked, + struct packed_journal_point *packed) +{ + packed->encoded_point = __cpu_to_le64((unpacked->sequence_number << 16) + | unpacked->entry_count); +} + +/** + * vdo_unpack_journal_point() - Decode the journal location represented by a + * packed_journal_point into a journal_point. + * @packed: The packed input point. + * @unpacked: The unpacked output point. 
+ */ +static inline void +vdo_unpack_journal_point(const struct packed_journal_point *packed, + struct journal_point *unpacked) +{ + uint64_t native = __le64_to_cpu(packed->encoded_point); + + unpacked->sequence_number = (native >> 16); + unpacked->entry_count = (native & 0xffff); +} + +#endif /* JOURNAL_POINT_H */ diff --git a/vdo/journalPoint.h b/vdo/journalPoint.h deleted file mode 100644 index c2b111a3..00000000 --- a/vdo/journalPoint.h +++ /dev/null @@ -1,147 +0,0 @@ -/* - * Copyright Red Hat - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/sulfur/src/c++/vdo/base/journalPoint.h#7 $ - */ - -#ifndef JOURNAL_POINT_H -#define JOURNAL_POINT_H - -#include "numeric.h" -#include "types.h" - -typedef uint16_t journal_entry_count_t; - -/** - * The absolute position of an entry in a recovery journal or slab journal. - **/ -struct journal_point { - sequence_number_t sequence_number; - journal_entry_count_t entry_count; -}; - -/** - * A packed, platform-independent encoding of a struct journal_point. - **/ -struct packed_journal_point { - /** - * The packed representation is the little-endian 64-bit representation - * of the low-order 48 bits of the sequence number, shifted up 16 bits, - * or'ed with the 16-bit entry count. - * - * Very long-term, the top 16 bits of the sequence number may not always - * be zero, as this encoding assumes--see BZ 1523240. - **/ - __le64 encoded_point; -} __packed; - -/** - * Move the given journal point forward by one entry. - * - * @param point the journal point to adjust - * @param entries_per_block the number of entries in one full block - **/ -static inline void -advance_vdo_journal_point(struct journal_point *point, - journal_entry_count_t entries_per_block) -{ - point->entry_count++; - if (point->entry_count == entries_per_block) { - point->sequence_number++; - point->entry_count = 0; - } -} - -/** - * Check whether a journal point is valid. - * - * @param point the journal point - * - * @return true if the journal point is valid - **/ -static inline bool -is_valid_vdo_journal_point(const struct journal_point *point) -{ - return ((point != NULL) && (point->sequence_number > 0)); -} - -/** - * Check whether the first point precedes the second point. - * - * @param first the first journal point - * @param second the second journal point - - * - * @return true if the first point precedes the second point. - **/ -static inline bool before_vdo_journal_point(const struct journal_point *first, - const struct journal_point *second) -{ - return ((first->sequence_number < second->sequence_number) || - ((first->sequence_number == second->sequence_number) && - (first->entry_count < second->entry_count))); -} - -/** - * Check whether the first point is the same as the second point. 
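/*
 * A minimal illustrative sketch of the packed encoding defined in
 * journal-point.h above: packing shifts the (48-bit) sequence number up 16
 * bits and ors in the entry count, and unpacking reverses it, so a round
 * trip preserves both fields. The example function name is hypothetical.
 */
static inline bool example_journal_point_round_trip(void)
{
	struct journal_point in = {
		.sequence_number = 0x0000123456789abcULL,
		.entry_count = 42,
	};
	struct packed_journal_point packed;
	struct journal_point out;

	/* packed.encoded_point == __cpu_to_le64((0x123456789abc << 16) | 42) */
	vdo_pack_journal_point(&in, &packed);
	vdo_unpack_journal_point(&packed, &out);
	return vdo_are_equivalent_journal_points(&in, &out);
}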
- * - * @param first the first journal point - * @param second the second journal point - * - * @return true if both points reference the same logical - * position of an entry the journal - **/ -static inline bool -are_equivalent_vdo_journal_points(const struct journal_point *first, - const struct journal_point *second) -{ - return ((first->sequence_number == second->sequence_number) && - (first->entry_count == second->entry_count)); -} - -/** - * Encode the journal location represented by a journal_point into a - * packed_journal_point. - * - * @param unpacked The unpacked input point - * @param packed The packed output point - **/ -static inline void pack_vdo_journal_point(const struct journal_point *unpacked, - struct packed_journal_point *packed) -{ - packed->encoded_point = __cpu_to_le64((unpacked->sequence_number << 16) - | unpacked->entry_count); -} - -/** - * Decode the journal location represented by a packed_journal_point into a - * journal_point. - * - * @param packed The packed input point - * @param unpacked The unpacked output point - **/ -static inline void -unpack_vdo_journal_point(const struct packed_journal_point *packed, - struct journal_point *unpacked) -{ - uint64_t native = __le64_to_cpu(packed->encoded_point); - unpacked->sequence_number = (native >> 16); - unpacked->entry_count = (native & 0xffff); -} - -#endif // JOURNAL_POINT_H diff --git a/vdo/kernel-types.h b/vdo/kernel-types.h new file mode 100644 index 00000000..1ebd4a2d --- /dev/null +++ b/vdo/kernel-types.h @@ -0,0 +1,210 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright Red Hat + */ + +#ifndef KERNEL_TYPES_H +#define KERNEL_TYPES_H + +#include "types.h" + +#include + +/** + * typedef compressed_fragment_count_t - A count of compressed fragments. + */ +typedef uint8_t compressed_fragment_count_t; + +/** + * typedef page_size_t - The size of a page. + * + * Must be evenly divisible by block size. + */ +typedef uint32_t page_size_t; + +/** + * typedef thread_count_t - A thread counter. + */ +typedef uint8_t thread_count_t; + +/** + * typedef thread_id_t - A thread ID. + * + * Base-code threads are numbered sequentially starting from 0. + */ +typedef uint8_t thread_id_t; + +/* + * The thread ID returned when the current base code thread ID cannot be found + * or is otherwise undefined. + */ +static const thread_id_t VDO_INVALID_THREAD_ID = (thread_id_t) -1; + +/** + * typedef vio_count_t - A number of vios. + */ +typedef uint16_t vio_count_t; + +/* + * The type of request a data_vio is performing + */ +enum data_vio_operation_bits { + __DATA_VIO_READ, + __DATA_VIO_WRITE, + __DATA_VIO_FUA, +}; + +enum data_vio_operation { + DATA_VIO_UNSPECIFIED_OPERATION, + DATA_VIO_READ = (1 << __DATA_VIO_READ), + DATA_VIO_WRITE = (1 << __DATA_VIO_WRITE), + DATA_VIO_FUA = (1 << __DATA_VIO_FUA), +} __packed; + +#define DATA_VIO_READ_MODIFY_WRITE (DATA_VIO_READ | DATA_VIO_WRITE) +#define DATA_VIO_READ_WRITE_MASK DATA_VIO_READ_MODIFY_WRITE + +/* + * vio types for statistics and instrumentation. + */ +enum vio_type { + VIO_TYPE_UNINITIALIZED = 0, + VIO_TYPE_DATA, + VIO_TYPE_BLOCK_ALLOCATOR, + VIO_TYPE_BLOCK_MAP, + VIO_TYPE_BLOCK_MAP_INTERIOR, + VIO_TYPE_PARTITION_COPY, + VIO_TYPE_RECOVERY_JOURNAL, + VIO_TYPE_SLAB_JOURNAL, + VIO_TYPE_SLAB_SUMMARY, + VIO_TYPE_SUPER_BLOCK, + VIO_TYPE_TEST, +} __packed; + +/** + * vdo_is_data_vio_type() - Check whether a vio_type is for servicing an + * external data request. + * @type: The vio_type to check. 
+ */ +static inline bool vdo_is_data_vio_type(enum vio_type type) +{ + return (type == VIO_TYPE_DATA); +} + +/** + * vdo_is_metadata_vio_type() - Check whether a vio_type is for metadata. + * @type: The vio_type to check. + */ +static inline bool vdo_is_metadata_vio_type(enum vio_type type) +{ + return ((type != VIO_TYPE_UNINITIALIZED) && + !vdo_is_data_vio_type(type)); +} + +enum vdo_completion_priority { + BIO_ACK_Q_ACK_PRIORITY = 0, + BIO_ACK_Q_MAX_PRIORITY = 0, + BIO_Q_COMPRESSED_DATA_PRIORITY = 0, + BIO_Q_DATA_PRIORITY = 0, + BIO_Q_FLUSH_PRIORITY = 2, + BIO_Q_HIGH_PRIORITY = 2, + BIO_Q_METADATA_PRIORITY = 1, + BIO_Q_VERIFY_PRIORITY = 1, + BIO_Q_MAX_PRIORITY = 2, + CPU_Q_COMPLETE_VIO_PRIORITY = 0, + CPU_Q_COMPLETE_READ_PRIORITY = 0, + CPU_Q_COMPRESS_BLOCK_PRIORITY = 0, + CPU_Q_EVENT_REPORTER_PRIORITY = 0, + CPU_Q_HASH_BLOCK_PRIORITY = 0, + CPU_Q_MAX_PRIORITY = 0, + UDS_Q_PRIORITY = 0, + UDS_Q_MAX_PRIORITY = 0, + VDO_DEFAULT_Q_COMPLETION_PRIORITY = 1, + VDO_DEFAULT_Q_FLUSH_PRIORITY = 2, + VDO_DEFAULT_Q_MAP_BIO_PRIORITY = 0, + VDO_DEFAULT_Q_SYNC_PRIORITY = 2, + VDO_DEFAULT_Q_VIO_CALLBACK_PRIORITY = 1, + VDO_DEFAULT_Q_MAX_PRIORITY = 2, + /* The maximum allowable priority */ + VDO_WORK_Q_MAX_PRIORITY = 3, + /* A value which must be out of range for a valid priority */ + VDO_WORK_Q_DEFAULT_PRIORITY = VDO_WORK_Q_MAX_PRIORITY + 1, +}; + +/* + * Priority levels for asynchronous I/O operations performed on a vio. + */ +enum vio_priority { + VIO_PRIORITY_LOW = 0, + VIO_PRIORITY_DATA = VIO_PRIORITY_LOW, + VIO_PRIORITY_COMPRESSED_DATA = VIO_PRIORITY_DATA, + VIO_PRIORITY_METADATA, + VIO_PRIORITY_HIGH, +} __packed; + +enum vdo_zone_type { + VDO_ZONE_TYPE_ADMIN, + VDO_ZONE_TYPE_JOURNAL, + VDO_ZONE_TYPE_LOGICAL, + VDO_ZONE_TYPE_PHYSICAL, +}; + +/* + * Forward declarations of abstract types + */ +struct action_manager; +struct allocation_selector; +struct atomic_bio_stats; +struct block_allocator; +struct block_map; +struct block_map_tree_zone; +struct block_map_zone; +struct data_vio; +struct data_vio_pool; +struct dedupe_context; +struct dedupe_index; +struct device_config; +struct flusher; +struct forest; +struct hash_lock; +struct hash_zone; +struct hash_zones; +struct index_config; +struct input_bin; +struct io_submitter; +struct lbn_lock; +struct lock_counter; +struct logical_zone; +struct logical_zones; +struct pbn_lock; +struct physical_zone; +struct physical_zones; +struct read_only_notifier; +struct recovery_journal; +struct ref_counts; +struct slab_depot; +struct slab_journal; +struct slab_journal_entry; +struct slab_scrubber; +struct slab_summary; +struct slab_summary_zone; +struct thread_config; +struct thread_count_config; +struct vdo; +struct vdo_completion; +struct vdo_flush; +struct vdo_layout; +struct vdo_slab; +struct vdo_statistics; +struct vdo_thread; +struct vdo_work_queue; +struct vio; +struct vio_pool; + +struct zoned_pbn { + physical_block_number_t pbn; + enum block_mapping_state state; + struct physical_zone *zone; +}; + +#endif /* KERNEL_TYPES_H */ diff --git a/vdo/kernelLayer.c b/vdo/kernelLayer.c deleted file mode 100644 index 42b17d94..00000000 --- a/vdo/kernelLayer.c +++ /dev/null @@ -1,801 +0,0 @@ -/* - * Copyright Red Hat - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. 
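/*
 * A small illustrative sketch of how the data_vio_operation bits defined in
 * kernel-types.h above compose: a read-modify-write sets both the read and
 * write bits, and DATA_VIO_FUA can be added without changing that
 * classification. The helper name here is hypothetical.
 */
static inline bool example_is_read_modify_write(enum data_vio_operation op)
{
	/* Mask off FUA (and any other modifier bits) before comparing. */
	return ((op & DATA_VIO_READ_WRITE_MASK) == DATA_VIO_READ_MODIFY_WRITE);
}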
- * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/sulfur/src/c++/vdo/kernel/kernelLayer.c#60 $ - */ - -#include "kernelLayer.h" - -#include -#include -#include -#include -#include -#include - -#include "logger.h" -#include "memoryAlloc.h" -#include "murmur/MurmurHash3.h" -#include "permassert.h" - -#include "adminCompletion.h" -#include "adminState.h" -#include "flush.h" -#include "releaseVersions.h" -#include "statistics.h" -#include "vdo.h" -#include "vdoLoad.h" -#include "vdoResize.h" -#include "vdoResizeLogical.h" -#include "vdoSuspend.h" -#include "volumeGeometry.h" - -#include "bio.h" -#include "dataKVIO.h" -#include "dedupeIndex.h" -#include "deviceConfig.h" -#include "deviceRegistry.h" -#include "instanceNumber.h" -#include "ioSubmitter.h" -#include "kvio.h" -#include "poolSysfs.h" -#include "stringUtils.h" -#include "vdoInit.h" - -static const struct vdo_work_queue_type bio_ack_q_type = { - .action_table = { - { - .name = "bio_ack", - .code = BIO_ACK_Q_ACTION_ACK, - .priority = 0 - }, - }, -}; - -static const struct vdo_work_queue_type cpu_q_type = { - .action_table = { - { - .name = "cpu_complete_vio", - .code = CPU_Q_ACTION_COMPLETE_VIO, - .priority = 0 - }, - { - .name = "cpu_compress_block", - .code = CPU_Q_ACTION_COMPRESS_BLOCK, - .priority = 0 - }, - { - .name = "cpu_hash_block", - .code = CPU_Q_ACTION_HASH_BLOCK, - .priority = 0 - }, - { - .name = "cpu_event_reporter", - .code = CPU_Q_ACTION_EVENT_REPORTER, - .priority = 0 - }, - }, -}; - -/**********************************************************************/ -static void set_kernel_layer_state(struct kernel_layer *layer, - enum kernel_layer_state new_state) -{ - smp_wmb(); - WRITE_ONCE(layer->state, new_state); -} - -/** - * Start processing a new data vio based on the supplied bio, but from within - * a VDO thread context, when we're not allowed to block. Using this path at - * all suggests a bug or erroneous usage, but we special-case it to avoid a - * deadlock that can apparently result. Message will be logged to alert the - * administrator that something has gone wrong, while we attempt to continue - * processing other requests. - * - * If a request permit can be acquired immediately, - * vdo_launch_data_vio_from_bio will be called. (If the bio is a discard - * operation, a permit from the discard limiter will be requested but the call - * will be made with or without it.) If the request permit is not available, - * the bio will be saved on a list to be launched later. Either way, this - * function will not block, and will take responsibility for processing the - * bio. 
- * - * @param vdo The vdo - * @param bio The bio to launch - * @param arrival_jiffies The arrival time of the bio - * - * @return DM_MAPIO_SUBMITTED or a system error code - **/ -static int launch_data_vio_from_vdo_thread(struct vdo *vdo, - struct bio *bio, - uint64_t arrival_jiffies) -{ - bool has_discard_permit; - int result; - - uds_log_warning("vdo_launch_bio called from within a VDO thread!"); - /* - * We're not yet entirely sure what circumstances are causing this - * situation in [ESC-638], but it does appear to be happening and - * causing VDO to deadlock. - * - * Somehow vdo_launch_bio is being called from generic_make_request - * which is being called from the VDO code to pass a flush on down to - * the underlying storage system; we've got 2000 requests in progress, - * so we have to wait for one to complete, but none can complete while - * the bio thread is blocked from passing more I/O requests down. Near - * as we can tell, the flush bio should always have gotten updated to - * point to the storage system, so we shouldn't be calling back into - * VDO unless something's gotten messed up somewhere. - * - * To side-step this case, if the limiter says we're busy *and* we're - * running on one of VDO's own threads, we'll drop the I/O request in a - * special queue for processing as soon as vios become free. - * - * We don't want to do this in general because it leads to unbounded - * buffering, arbitrarily high latencies, inability to push back in a - * way the caller can take advantage of, etc. If someone wants huge - * amounts of buffering on top of VDO, they're welcome to access it - * through the kernel page cache or roll their own. - */ - if (!limiter_poll(&vdo->request_limiter)) { - add_to_vdo_deadlock_queue(&vdo->deadlock_queue, - bio, - arrival_jiffies); - uds_log_warning("queued an I/O request to avoid deadlock!"); - - return DM_MAPIO_SUBMITTED; - } - - has_discard_permit = - ((bio_op(bio) == REQ_OP_DISCARD) && - limiter_poll(&vdo->discard_limiter)); - result = vdo_launch_data_vio_from_bio(vdo, - bio, - arrival_jiffies, - has_discard_permit); - // Succeed or fail, vdo_launch_data_vio_from_bio owns the permit(s) - // now. - if (result != VDO_SUCCESS) { - return result; - } - - return DM_MAPIO_SUBMITTED; -} - -/**********************************************************************/ -int vdo_launch_bio(struct vdo *vdo, struct bio *bio) -{ - int result; - uint64_t arrival_jiffies = jiffies; - struct vdo_work_queue *current_work_queue; - bool has_discard_permit = false; - const struct admin_state_code *code - = get_vdo_admin_state_code(&vdo->admin_state); - - ASSERT_LOG_ONLY(code->normal, - "vdo_launch_bio should not be called while in state %s", - code->name); - - // Count all incoming bios. - vdo_count_bios(&vdo->stats.bios_in, bio); - - - // Handle empty bios. Empty flush bios are not associated with a vio. - if ((bio_op(bio) == REQ_OP_FLUSH) || - ((bio->bi_opf & REQ_PREFLUSH) != 0)) { - launch_vdo_flush(vdo, bio); - return DM_MAPIO_SUBMITTED; - } - - current_work_queue = get_current_work_queue(); - - if ((current_work_queue != NULL) && - (vdo == get_work_queue_owner(current_work_queue))) { - /* - * This prohibits sleeping during I/O submission to VDO from - * its own thread. 
- */ - return launch_data_vio_from_vdo_thread(vdo, - bio, - arrival_jiffies); - } - - if (bio_op(bio) == REQ_OP_DISCARD) { - limiter_wait_for_one_free(&vdo->discard_limiter); - has_discard_permit = true; - } - limiter_wait_for_one_free(&vdo->request_limiter); - - result = vdo_launch_data_vio_from_bio(vdo, - bio, - arrival_jiffies, - has_discard_permit); - // Succeed or fail, vdo_launch_data_vio_from_bio owns the permit(s) - // now. - if (result != VDO_SUCCESS) { - return result; - } - - return DM_MAPIO_SUBMITTED; -} - -/**********************************************************************/ -void complete_many_requests(struct vdo *vdo, uint32_t count) -{ - // If we had to buffer some requests to avoid deadlock, release them - // now. - while (count > 0) { - bool has_discard_permit; - int result; - uint64_t arrival_jiffies = 0; - struct bio *bio = poll_vdo_deadlock_queue(&vdo->deadlock_queue, - &arrival_jiffies); - if (likely(bio == NULL)) { - break; - } - - has_discard_permit = - ((bio_op(bio) == REQ_OP_DISCARD) && - limiter_poll(&vdo->discard_limiter)); - result = vdo_launch_data_vio_from_bio(vdo, - bio, - arrival_jiffies, - has_discard_permit); - if (result != VDO_SUCCESS) { - vdo_complete_bio(bio, result); - } - // Succeed or fail, vdo_launch_data_vio_from_bio owns the - // permit(s) now. - count--; - } - // Notify the limiter, so it can wake any blocked processes. - if (count > 0) { - limiter_release_many(&vdo->request_limiter, count); - } -} - -/**********************************************************************/ -int make_kernel_layer(unsigned int instance, - struct device_config *config, - char **reason, - struct kernel_layer **layer_ptr) -{ - int result; - struct kernel_layer *layer; - char thread_name_prefix[MAX_QUEUE_NAME_LEN]; - - // VDO-3769 - Set a generic reason so we don't ever return garbage. - *reason = "Unspecified error"; - - /* - * Part 1 - Allocate the kernel layer, its essential parts, and set - * up the sysfs node. These must come first so that the sysfs node - * works correctly through the freeing of the kernel layer. After this - * part you must use free_kernel_layer. - */ - result = UDS_ALLOCATE(1, - struct kernel_layer, - "VDO configuration", - &layer); - if (result != UDS_SUCCESS) { - *reason = "Cannot allocate VDO"; - release_vdo_instance(instance); - return result; - } - - result = initialize_vdo(&layer->vdo, - config, - instance, - reason); - if (result != VDO_SUCCESS) { - return result; - } - - /* - * After this point, calling kobject_put on vdo->vdo_directory will - * decrement its reference count, and when the count goes to 0 the - * struct kernel_layer will be freed. - * - * Any error in this method from here on requires calling - * free_kernel_layer() before returning. - */ - - /* - * Part 2 - Do all the simple initialization. These initializations - * have no order dependencies and can be done in any order, but - * free_kernel_layer() cannot be called until all the simple layer - * properties are set. - * - * The kernel_layer structure starts as all zeros. Pointer - * initializations consist of replacing a NULL pointer with a non-NULL - * pointer, which can be easily undone by freeing all of the non-NULL - * pointers (using the proper free routine). 
- */ - snprintf(thread_name_prefix, - sizeof(thread_name_prefix), - "%s%u", - THIS_MODULE->name, - instance); - - result = make_batch_processor(&layer->vdo, - return_data_vio_batch_to_pool, - &layer->vdo, - &layer->vdo.data_vio_releaser); - if (result != UDS_SUCCESS) { - *reason = "Cannot allocate vio-freeing batch processor"; - free_kernel_layer(layer); - return result; - } - - // Dedupe Index - BUG_ON(thread_name_prefix[0] == '\0'); - result = make_vdo_dedupe_index(&layer->vdo.dedupe_index, - &layer->vdo, - thread_name_prefix); - if (result != UDS_SUCCESS) { - *reason = "Cannot initialize dedupe index"; - free_kernel_layer(layer); - return result; - } - - /* - * Part 3 - Do initializations that depend upon other previous - * initializations, but have no order dependencies at freeing time. - * Order dependencies for initialization are identified using BUG_ON. - */ - - - // Data vio pool - BUG_ON(layer->vdo.device_config->logical_block_size <= 0); - BUG_ON(layer->vdo.request_limiter.limit <= 0); - BUG_ON(layer->vdo.device_config->owned_device == NULL); - result = make_data_vio_buffer_pool(layer->vdo.request_limiter.limit, - &layer->vdo.data_vio_pool); - if (result != VDO_SUCCESS) { - *reason = "Cannot allocate vio data"; - free_kernel_layer(layer); - return result; - } - - /* - * Part 4 - Do initializations that depend upon other previous - * initialization, that may have order dependencies at freeing time. - * These are mostly starting up the workqueue threads. - */ - - // Base-code thread, etc - result = make_vdo_threads(&layer->vdo, thread_name_prefix, reason); - if (result != VDO_SUCCESS) { - free_kernel_layer(layer); - return result; - } - - // Bio queue - result = make_vdo_io_submitter(thread_name_prefix, - config->thread_counts.bio_threads, - config->thread_counts.bio_rotation_interval, - layer->vdo.request_limiter.limit, - &layer->vdo, - &layer->vdo.io_submitter); - if (result != VDO_SUCCESS) { - // If initialization of the bio-queues failed, they are cleaned - // up already, so just free the rest of the kernel layer. 
- free_kernel_layer(layer); - *reason = "bio submission initialization failed"; - return result; - } - - // Bio ack queue - if (use_bio_ack_queue(&layer->vdo)) { - result = make_work_queue(thread_name_prefix, - "ackQ", - &layer->vdo, - layer, - &bio_ack_q_type, - config->thread_counts.bio_ack_threads, - NULL, - &layer->vdo.bio_ack_queue); - if (result != VDO_SUCCESS) { - *reason = "bio ack queue initialization failed"; - free_kernel_layer(layer); - return result; - } - } - - // CPU Queues - result = make_work_queue(thread_name_prefix, - "cpuQ", - &layer->vdo, - layer, - &cpu_q_type, - config->thread_counts.cpu_threads, - (void **) layer->vdo.compression_context, - &layer->vdo.cpu_queue); - if (result != VDO_SUCCESS) { - *reason = "CPU queue initialization failed"; - free_kernel_layer(layer); - return result; - } - - *layer_ptr = layer; - return VDO_SUCCESS; -} - -/**********************************************************************/ -int prepare_to_modify_kernel_layer(struct kernel_layer *layer, - struct device_config *config, - char **error_ptr) -{ - int result; - struct device_config *extant_config = layer->vdo.device_config; - - if (config->owning_target->begin != - extant_config->owning_target->begin) { - *error_ptr = "Starting sector cannot change"; - return VDO_PARAMETER_MISMATCH; - } - - if (config->logical_block_size != extant_config->logical_block_size) { - *error_ptr = "Logical block size cannot change"; - return VDO_PARAMETER_MISMATCH; - } - - if (config->cache_size != extant_config->cache_size) { - *error_ptr = "Block map cache size cannot change"; - return VDO_PARAMETER_MISMATCH; - } - - if (config->block_map_maximum_age != - extant_config->block_map_maximum_age) { - *error_ptr = "Block map maximum age cannot change"; - return VDO_PARAMETER_MISMATCH; - } - - if (memcmp(&config->thread_counts, &extant_config->thread_counts, - sizeof(struct thread_count_config)) != 0) { - *error_ptr = "Thread configuration cannot change"; - return VDO_PARAMETER_MISMATCH; - } - - // Below here are the actions to take when a non-immutable property - // changes. 
- - if (config->owning_target->len != extant_config->owning_target->len) { - size_t logical_bytes = to_bytes(config->owning_target->len); - - if ((logical_bytes % VDO_BLOCK_SIZE) != 0) { - *error_ptr = "Logical size must be a multiple of 4096"; - return VDO_PARAMETER_MISMATCH; - } - - result = prepare_to_resize_logical( - layer, logical_bytes / VDO_BLOCK_SIZE); - if (result != VDO_SUCCESS) { - *error_ptr = "Device prepare_vdo_to_grow_logical failed"; - return result; - } - } - - if (config->physical_blocks != extant_config->physical_blocks) { - result = prepare_to_resize_physical(layer, - config->physical_blocks); - if (result != VDO_SUCCESS) { - if (result == VDO_TOO_MANY_SLABS) { - *error_ptr = "Device prepare_vdo_to_grow_physical failed (specified physical size too big based on formatted slab size)"; - } else { - *error_ptr = "Device prepare_vdo_to_grow_physical failed"; - } - return result; - } - } - - if (strcmp(config->parent_device_name, - extant_config->parent_device_name) != 0) { - const char *device_name - = get_vdo_device_name(config->owning_target); - uds_log_info("Updating backing device of %s from %s to %s", - device_name, - extant_config->parent_device_name, - config->parent_device_name); - - result = make_new_vdo_index_name(layer->vdo.dedupe_index, - config->parent_device_name, - &config->index_name); - if (result != VDO_SUCCESS) { - *error_ptr = "Allocating new index name failed"; - return result; - } - } - - return VDO_SUCCESS; -} - -/**********************************************************************/ -int modify_kernel_layer(struct kernel_layer *layer, - struct device_config *config) -{ - int result; - struct device_config *extant_config = layer->vdo.device_config; - enum kernel_layer_state state = get_kernel_layer_state(layer); - - if (state == LAYER_RUNNING) { - return VDO_SUCCESS; - } else if (state != LAYER_SUSPENDED) { - uds_log_error("pre-resume invoked while in unexpected kernel layer state %d", - state); - return -EINVAL; - } - set_kernel_layer_state(layer, LAYER_RESUMING); - - // A failure here is unrecoverable. So there is no problem if it - // happens. - - if (config->owning_target->len != extant_config->owning_target->len) { - size_t logical_bytes = to_bytes(config->owning_target->len); - result = resize_logical(layer, logical_bytes / VDO_BLOCK_SIZE); - if (result != VDO_SUCCESS) { - return result; - } - } - - // Grow physical if the version is 0, so we can't tell if we - // got an old-style growPhysical command, or if size changed. - if ((config->physical_blocks != extant_config->physical_blocks) || - (config->version == 0)) { - result = resize_physical(layer, config->physical_blocks); - if (result != VDO_SUCCESS) { - return result; - } - } - - return VDO_SUCCESS; -} - -/**********************************************************************/ -void free_kernel_layer(struct kernel_layer *layer) -{ - /* - * This is not the cleanest implementation, but given the current - * timing uncertainties in the shutdown process for work queues, we - * need to store information to enable a late-in-process deallocation - * of funnel-queue data structures in work queues. 
- */ - enum kernel_layer_state state = get_kernel_layer_state(layer); - const struct admin_state_code *code; - - switch (state) { - case LAYER_STOPPING: - uds_log_error("re-entered free_kernel_layer while stopping"); - break; - - case LAYER_RUNNING: - suspend_kernel_layer(layer); - fallthrough; - - case LAYER_STARTING: - case LAYER_RESUMING: - case LAYER_SUSPENDED: - stop_kernel_layer(layer); - fallthrough; - - case LAYER_STOPPED: - break; - - case LAYER_NEW: - code = get_vdo_admin_state_code(&layer->vdo.admin_state); - if ((code == VDO_ADMIN_STATE_NEW) - || (code == VDO_ADMIN_STATE_INITIALIZED) - || (code == VDO_ADMIN_STATE_PRE_LOADED)) { - break; - } - - uds_log_error("New kernel layer in unexpected state %s", - code->name); - break; - - default: - uds_log_error("Unknown Kernel Layer state: %d", state); - } - - destroy_vdo(&layer->vdo); -} - -/**********************************************************************/ -int start_kernel_layer(struct kernel_layer *layer, char **reason) -{ - int result; - const struct admin_state_code *code - = get_vdo_admin_state_code(&layer->vdo.admin_state); - - if (code != VDO_ADMIN_STATE_PRE_LOADED) { - *reason = "Cannot start kernel from non-starting state"; - stop_kernel_layer(layer); - return UDS_BAD_STATE; - } - - set_kernel_layer_state(layer, LAYER_STARTING); - result = load_vdo(&layer->vdo); - if ((result != VDO_SUCCESS) && (result != VDO_READ_ONLY)) { - *reason = "Cannot load metadata from device"; - stop_kernel_layer(layer); - return result; - } - - set_kernel_layer_state(layer, LAYER_RUNNING); - - if (layer->vdo.device_config->deduplication) { - // Don't try to load or rebuild the index first (and log - // scary error messages) if this is known to be a - // newly-formatted volume. - start_vdo_dedupe_index(layer->vdo.dedupe_index, - vdo_was_new(&layer->vdo)); - } - - layer->vdo.allocations_allowed = false; - return VDO_SUCCESS; -} - -/**********************************************************************/ -void stop_kernel_layer(struct kernel_layer *layer) -{ - switch (get_kernel_layer_state(layer)) { - case LAYER_RUNNING: - suspend_kernel_layer(layer); - fallthrough; - - case LAYER_SUSPENDED: - set_kernel_layer_state(layer, LAYER_STOPPING); - stop_vdo_dedupe_index(layer->vdo.dedupe_index); - fallthrough; - - case LAYER_STOPPING: - case LAYER_STOPPED: - default: - set_kernel_layer_state(layer, LAYER_STOPPED); - } -} - -/**********************************************************************/ -int suspend_kernel_layer(struct kernel_layer *layer) -{ - /* - * It's important to note any error here does not actually stop - * device-mapper from suspending the device. All this work is done - * post suspend. 
- */ - int result = suspend_vdo(&layer->vdo); - - if (result == VDO_INVALID_ADMIN_STATE) { - uds_log_error("Suspend invoked while in unexpected kernel layer state %d", - get_kernel_layer_state(layer)); - return -EINVAL; - } - - set_kernel_layer_state(layer, LAYER_SUSPENDED); - return result; -} - -/**********************************************************************/ -int resume_kernel_layer(struct kernel_layer *layer) -{ - int result; - - if (get_kernel_layer_state(layer) == LAYER_RUNNING) { - return VDO_SUCCESS; - } - - resume_vdo_dedupe_index(layer->vdo.dedupe_index, - layer->vdo.device_config); - result = resume_vdo(&layer->vdo); - if (result != VDO_SUCCESS) { - return result; - } - - set_kernel_layer_state(layer, LAYER_RUNNING); - return VDO_SUCCESS; -} - -/***********************************************************************/ -int prepare_to_resize_physical(struct kernel_layer *layer, - block_count_t physical_count) -{ - int result; - - uds_log_info("Preparing to resize physical to %llu", physical_count); - // Allocations are allowed and permissible through this non-VDO thread, - // since IO triggered by this allocation to VDO can finish just fine. - result = prepare_vdo_to_grow_physical(&layer->vdo, physical_count); - if (result != VDO_SUCCESS) { - // prepare_vdo_to_grow_physical logs errors. - if (result == VDO_PARAMETER_MISMATCH) { - /* - * If we don't trap this case, map_to_system_error() - * will remap it to -EIO, which is misleading and - * ahistorical. - */ - return -EINVAL; - } else { - return result; - } - } - - uds_log_info("Done preparing to resize physical"); - return VDO_SUCCESS; -} - -/***********************************************************************/ -int resize_physical(struct kernel_layer *layer, block_count_t physical_count) -{ - /* - * We must not mark the layer as allowing allocations when it is - * suspended lest an allocation attempt block on writing IO to the - * suspended VDO. - */ - int result = vdo_resize_physical(&layer->vdo, physical_count); - - if (result != VDO_SUCCESS) { - // vdo_resize_physical logs errors - return result; - } - return VDO_SUCCESS; -} - -/***********************************************************************/ -int prepare_to_resize_logical(struct kernel_layer *layer, - block_count_t logical_count) -{ - int result; - - uds_log_info("Preparing to resize logical to %llu", logical_count); - // Allocations are allowed and permissible through this non-VDO thread, - // since IO triggered by this allocation to VDO can finish just fine. - result = prepare_vdo_to_grow_logical(&layer->vdo, logical_count); - - if (result != VDO_SUCCESS) { - // prepare_vdo_to_grow_logical logs errors - return result; - } - - uds_log_info("Done preparing to resize logical"); - return VDO_SUCCESS; -} - -/***********************************************************************/ -int resize_logical(struct kernel_layer *layer, block_count_t logical_count) -{ - int result; - - uds_log_info("Resizing logical to %llu", logical_count); - /* - * We must not mark the VDO as allowing allocations when it is - * suspended lest an allocation attempt block on writing IO to the - * suspended VDO. 
- */ - result = vdo_resize_logical(&layer->vdo, logical_count); - - if (result != VDO_SUCCESS) { - // vdo_resize_logical logs errors - return result; - } - - uds_log_info("Logical blocks now %llu", logical_count); - return VDO_SUCCESS; -} - diff --git a/vdo/kernelLayer.h b/vdo/kernelLayer.h deleted file mode 100644 index 8045d58a..00000000 --- a/vdo/kernelLayer.h +++ /dev/null @@ -1,311 +0,0 @@ -/* - * Copyright Red Hat - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/sulfur/src/c++/vdo/kernel/kernelLayer.h#29 $ - */ - -#ifndef KERNELLAYER_H -#define KERNELLAYER_H - -#include -#include -#include - -#include "constants.h" -#include "flush.h" -#include "intMap.h" -#include "types.h" -#include "vdo.h" -#include "vdoInternal.h" -#include "waitQueue.h" - -#include "batchProcessor.h" -#include "bufferPool.h" -#include "deadlockQueue.h" -#include "deviceConfig.h" -#include "kernelTypes.h" -#include "kernelVDO.h" -#include "limiter.h" -#include "statistics.h" -#include "workQueue.h" - -enum kernel_layer_state { - LAYER_NEW, - LAYER_STARTING, - LAYER_RUNNING, - LAYER_SUSPENDED, - LAYER_STOPPING, - LAYER_STOPPED, - LAYER_RESUMING, -}; - -/** - * The VDO representation of the target device - **/ -struct kernel_layer { - struct vdo vdo; - /** Accessed from multiple threads */ - enum kernel_layer_state state; -}; - -enum bio_q_action { - BIO_Q_ACTION_COMPRESSED_DATA, - BIO_Q_ACTION_DATA, - BIO_Q_ACTION_FLUSH, - BIO_Q_ACTION_HIGH, - BIO_Q_ACTION_METADATA, - BIO_Q_ACTION_VERIFY -}; - -enum cpu_q_action { - CPU_Q_ACTION_COMPLETE_VIO, - CPU_Q_ACTION_COMPRESS_BLOCK, - CPU_Q_ACTION_EVENT_REPORTER, - CPU_Q_ACTION_HASH_BLOCK, -}; - -enum bio_ack_q_action { - BIO_ACK_Q_ACTION_ACK, -}; - -/** - * Creates a kernel specific physical layer to be used by VDO - * - * @param instance Device instantiation counter - * @param config The device configuration - * @param reason The reason for any failure during this call - * @param layer_ptr A pointer to hold the created layer - * - * @return VDO_SUCCESS or an error - **/ -int __must_check -make_kernel_layer(unsigned int instance, - struct device_config *config, - char **reason, - struct kernel_layer **layer_ptr); - -/** - * Prepare to modify a kernel layer. - * - * @param layer The layer to modify - * @param config The new device configuration - * @param error_ptr A pointer to store the reason for any failure - * - * @return VDO_SUCCESS or an error - **/ -int __must_check -prepare_to_modify_kernel_layer(struct kernel_layer *layer, - struct device_config *config, - char **error_ptr); - -/** - * Modify a kernel physical layer. 
- * - * @param layer The layer to modify - * @param config The new device configuration - * - * @return VDO_SUCCESS or an error - **/ -int __must_check -modify_kernel_layer(struct kernel_layer *layer, struct device_config *config); - -/** - * Free a kernel physical layer. - * - * @param layer The layer, which must have been created by - * make_kernel_layer - **/ -void free_kernel_layer(struct kernel_layer *layer); - -/** - * Start the kernel layer. This method finishes bringing a VDO online now that - * a table is being resumed for the first time. - * - * @param layer The kernel layer - * @param reason The reason for any failure during this call - * - * @return VDO_SUCCESS or an error - **/ -int start_kernel_layer(struct kernel_layer *layer, char **reason); - -/** - * Stop the kernel layer. - * - * @param layer The kernel layer - **/ -void stop_kernel_layer(struct kernel_layer *layer); - -/** - * Suspend the kernel layer. - * - * @param layer The kernel layer - * - * @return VDO_SUCCESS or an error - **/ -int suspend_kernel_layer(struct kernel_layer *layer); - -/** - * Resume the kernel layer. - * - * @param layer The kernel layer - * - * @return VDO_SUCCESS or an error - **/ -int resume_kernel_layer(struct kernel_layer *layer); - -/** - * Get the kernel layer state. - * - * @param layer The kernel layer - * - * @return the instantaneously correct kernel layer state - **/ -static inline enum kernel_layer_state -get_kernel_layer_state(const struct kernel_layer *layer) -{ - enum kernel_layer_state state = READ_ONCE(layer->state); - smp_rmb(); - return state; -} - -/** - * Function call to begin processing a bio passed in from the block layer - * - * @param vdo The VDO instance - * @param bio The bio from the block layer - * - * @return value to return from the VDO map function. Either an error code - * or DM_MAPIO_REMAPPED or DM_MAPPED_SUBMITTED (see vdo_map_bio for - * details). - **/ -int vdo_launch_bio(struct vdo *vdo, struct bio *bio); - -/** - * Convert a struct vdo pointer to the kernel_layer contining it. - * - * @param vdo The vdo to convert - * - * @return The enclosing struct kernel_layer - **/ -static inline struct kernel_layer *vdo_as_kernel_layer(struct vdo *vdo) -{ - return container_of(vdo, struct kernel_layer, vdo); -} - -/** - * Convert a block number (or count) to a (512-byte-)sector number. - * - * The argument type is sector_t to force conversion to the type we - * want, although the actual values passed are of various integral - * types. It's just too easy to forget and do the multiplication - * without casting, resulting in 32-bit arithmetic that accidentally - * produces wrong results in devices over 2TB (2**32 sectors). - * - * @param block_number the block number/count - * - * @return the sector number/count - **/ -static inline sector_t block_to_sector(physical_block_number_t block_number) -{ - return (block_number * VDO_SECTORS_PER_BLOCK); -} - -/** - * Convert a sector number (or count) to a block number. Does not - * check to make sure the sector number is an integral number of - * blocks. - * - * @param sector_number the sector number/count - * - * @return the block number/count - **/ -static inline sector_t sector_to_block(sector_t sector_number) -{ - return (sector_number / VDO_SECTORS_PER_BLOCK); -} - -/** - * Convert a sector number to an offset within a block. 
- * - * @param sector_number the sector number - * - * @return the offset within the block - **/ -static inline block_size_t sector_to_block_offset(sector_t sector_number) -{ - unsigned int sectors_per_block_mask = VDO_SECTORS_PER_BLOCK - 1; - return to_bytes(sector_number & sectors_per_block_mask); -} - -/** - * Adjust parameters to prepare to use a larger physical space. - * The size must be larger than the current size. - * - * @param layer the kernel layer - * @param physical_count the new physical size in blocks - * - * @return VDO_SUCCESS or an error - */ -int prepare_to_resize_physical(struct kernel_layer *layer, - block_count_t physical_count); - -/** - * Adjusts parameters to reflect resizing the underlying device. - * The size must be larger than the current size. - * - * @param layer the kernel layer - * @param physical_count the new physical count in blocks - * - * @return VDO_SUCCESS or an error - */ -int resize_physical(struct kernel_layer *layer, block_count_t physical_count); - -/** - * Adjust parameters to prepare to present a larger logical space. - * The size must be larger than the current size. - * - * @param layer the kernel layer - * @param logical_count the new logical size in blocks - * - * @return VDO_SUCCESS or an error - */ -int prepare_to_resize_logical(struct kernel_layer *layer, - block_count_t logical_count); - -/** - * Adjust parameters to present a larger logical space. - * The size must be larger than the current size. - * - * @param layer the kernel layer - * @param logical_count the new logical size in blocks - * - * @return VDO_SUCCESS or an error - */ -int resize_logical(struct kernel_layer *layer, block_count_t logical_count); - -/** - * Update bookkeeping for the completion of some number of requests, so that - * more incoming requests can be accepted. - * - * @param vdo The vdo - * @param count The number of completed requests - **/ -void complete_many_requests(struct vdo *vdo, uint32_t count); - - -#endif /* KERNELLAYER_H */ diff --git a/vdo/kernelTypes.h b/vdo/kernelTypes.h deleted file mode 100644 index bac6c47a..00000000 --- a/vdo/kernelTypes.h +++ /dev/null @@ -1,36 +0,0 @@ -/* - * Copyright Red Hat - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. 
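/*
 * Worked values for the conversion helpers above, assuming the usual
 * 512-byte sectors and 4096-byte VDO blocks (so VDO_SECTORS_PER_BLOCK is 8):
 *
 *	block_to_sector(262144)   == 2097152  (256Ki blocks, i.e. 1 GiB)
 *	sector_to_block(2097152)  == 262144
 *	sector_to_block_offset(9) == 512      (sector 9 is 512 bytes into its block)
 */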
- * - * $Id: //eng/vdo-releases/sulfur/src/c++/vdo/kernel/kernelTypes.h#18 $ - */ - -#ifndef KERNEL_TYPES_H -#define KERNEL_TYPES_H - -#include "types.h" - -struct atomic_bio_stats; -struct dedupe_context; -struct dedupe_index; -struct kernel_layer; -struct vdo_work_item; -struct vdo_work_queue; - -typedef void (*vdo_work_function)(struct vdo_work_item *work_item); - -#endif /* KERNEL_TYPES_H */ diff --git a/vdo/kernelVDO.c b/vdo/kernelVDO.c deleted file mode 100644 index 703f9426..00000000 --- a/vdo/kernelVDO.c +++ /dev/null @@ -1,272 +0,0 @@ -/* - * Copyright Red Hat - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/sulfur/src/c++/vdo/kernel/kernelVDO.c#40 $ - */ - -/* - * Sadly, this include must precede the include of kernelVDOInternals.h because - * that file ends up including the uds version of errors.h which is wrong for - * this file. - */ -#include "errors.h" -#include "kernelVDOInternals.h" - -#include - -#include "memoryAlloc.h" -#include "permassert.h" - -#include "readOnlyNotifier.h" -#include "statistics.h" -#include "threadConfig.h" -#include "vdo.h" -#include "vdoLoad.h" -#include "vdoResize.h" -#include "vdoResizeLogical.h" -#include "vdoResume.h" -#include "vdoSuspend.h" - -#include "kernelLayer.h" -#include "kvio.h" -#include "logger.h" - -enum { PARANOID_THREAD_CONSISTENCY_CHECKS = 0 }; - -/**********************************************************************/ -static void start_vdo_request_queue(void *ptr) -{ - struct vdo_thread *thread = ptr; - struct vdo *vdo = thread->vdo; - uds_register_allocating_thread(&thread->allocating_thread, - &vdo->allocations_allowed); -} - -/**********************************************************************/ -static void finish_vdo_request_queue(void *ptr) -{ - uds_unregister_allocating_thread(); -} - -/**********************************************************************/ -static const struct vdo_work_queue_type request_queue_type = { - .start = start_vdo_request_queue, - .finish = finish_vdo_request_queue, - .action_table = { - - { .name = "req_completion", - .code = REQ_Q_ACTION_COMPLETION, - .priority = 1 }, - { .name = "req_flush", - .code = REQ_Q_ACTION_FLUSH, - .priority = 2 }, - { .name = "req_map_bio", - .code = REQ_Q_ACTION_MAP_BIO, - .priority = 0 }, - { .name = "req_sync", - .code = REQ_Q_ACTION_SYNC, - .priority = 2 }, - { .name = "req_vio_callback", - .code = REQ_Q_ACTION_VIO_CALLBACK, - .priority = 1 }, - }, -}; - -/**********************************************************************/ -int make_vdo_threads(struct vdo *vdo, - const char *thread_name_prefix, - char **reason) -{ - unsigned int base_threads = vdo->thread_config->base_thread_count; - int result = UDS_ALLOCATE(base_threads, - struct vdo_thread, - "request processing work queue", - &vdo->threads); - if (result != VDO_SUCCESS) { - *reason = "Cannot 
allocation thread structures"; - return result; - } - - for (vdo->initialized_thread_count = 0; - vdo->initialized_thread_count < base_threads; - vdo->initialized_thread_count++) { - int result; - struct vdo_thread *thread = - &vdo->threads[vdo->initialized_thread_count]; - char queue_name[MAX_QUEUE_NAME_LEN]; - - thread->vdo = vdo; - thread->thread_id = vdo->initialized_thread_count; - - // Copy only LEN - 1 bytes and ensure NULL termination. - vdo_get_thread_name(vdo->thread_config, - vdo->initialized_thread_count, - queue_name, - sizeof(queue_name)); - result = make_work_queue(thread_name_prefix, - queue_name, - vdo, - thread, - &request_queue_type, - 1, - NULL, - &thread->request_queue); - if (result != VDO_SUCCESS) { - *reason = "Cannot initialize request queue"; - while (vdo->initialized_thread_count > 0) { - unsigned int thread_to_destroy = - vdo->initialized_thread_count - 1; - thread = &vdo->threads[thread_to_destroy]; - finish_work_queue(thread->request_queue); - free_work_queue(UDS_FORGET(thread->request_queue)); - vdo->initialized_thread_count--; - } - - UDS_FREE(vdo->threads); - vdo->threads = NULL; - return result; - } - } - - return VDO_SUCCESS; -} - -/**********************************************************************/ -int resume_vdo(struct vdo *vdo) -{ - return perform_vdo_resume(vdo); -} - - -/**********************************************************************/ -void dump_vdo_work_queue(struct vdo *vdo) -{ - int i; - - for (i = 0; i < vdo->initialized_thread_count; i++) { - dump_work_queue(vdo->threads[i].request_queue); - } -} - -/**********************************************************************/ -int vdo_resize_physical(struct vdo *vdo, block_count_t physical_count) -{ - int result = perform_vdo_grow_physical(vdo, physical_count); - - if (result != VDO_SUCCESS) { - uds_log_error("resize operation failed, result = %d", result); - return result; - } - - return VDO_SUCCESS; -} - -/**********************************************************************/ -int vdo_resize_logical(struct vdo *vdo, block_count_t logical_count) -{ - int result = perform_vdo_grow_logical(vdo, logical_count); - - if (result != VDO_SUCCESS) { - uds_log_error("grow logical operation failed, result = %d", - result); - } - - return result; -} - -/**********************************************************************/ -void enqueue_vdo_thread_work(struct vdo_thread *thread, - struct vdo_work_item *item) -{ - enqueue_work_queue(thread->request_queue, item); -} - -/**********************************************************************/ -void enqueue_vdo_work(struct vdo *vdo, - struct vdo_work_item *item, - thread_id_t thread_id) -{ - enqueue_vdo_thread_work(&vdo->threads[thread_id], item); -} - -/**********************************************************************/ -void enqueue_vio(struct vio *vio, - vdo_work_function work, - void *stats_function, - unsigned int action) -{ - thread_id_t thread_id = vio_as_completion(vio)->callback_thread_id; - BUG_ON(thread_id >= vio->vdo->initialized_thread_count); - launch_vio(vio, - work, - stats_function, - action, - vio->vdo->threads[thread_id].request_queue); -} - -/**********************************************************************/ -static void vdo_enqueue_work(struct vdo_work_item *work_item) -{ - run_vdo_completion_callback(container_of(work_item, - struct vdo_completion, - work_item)); -} - -/**********************************************************************/ -void enqueue_vdo_completion(struct vdo_completion *completion) -{ - struct vdo 
*vdo = completion->vdo; - thread_id_t thread_id = completion->callback_thread_id; - - if (ASSERT(thread_id < vdo->initialized_thread_count, - "thread_id %u (completion type %d) is less than thread count %u", - thread_id, - completion->type, - vdo->initialized_thread_count) != UDS_SUCCESS) { - BUG(); - } - - setup_work_item(&completion->work_item, vdo_enqueue_work, - completion->callback, - REQ_Q_ACTION_COMPLETION); - enqueue_vdo_thread_work(&vdo->threads[thread_id], - &completion->work_item); -} - -/**********************************************************************/ -thread_id_t vdo_get_callback_thread_id(void) -{ - struct vdo_thread *thread = get_work_queue_private_data(); - thread_id_t thread_id; - - if (thread == NULL) { - return VDO_INVALID_THREAD_ID; - } - - thread_id = thread->thread_id; - - if (PARANOID_THREAD_CONSISTENCY_CHECKS) { - struct vdo *vdo = thread->vdo; - struct kernel_layer *kernel_layer = vdo_as_kernel_layer(vdo); - BUG_ON(&kernel_layer->vdo != vdo); - BUG_ON(thread_id >= vdo->initialized_thread_count); - BUG_ON(thread != &vdo->threads[thread_id]); - } - - return thread_id; -} diff --git a/vdo/kernelVDO.h b/vdo/kernelVDO.h deleted file mode 100644 index dc89abe6..00000000 --- a/vdo/kernelVDO.h +++ /dev/null @@ -1,127 +0,0 @@ -/* - * Copyright Red Hat - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/sulfur/src/c++/vdo/kernel/kernelVDO.h#20 $ - */ - -#ifndef KERNEL_VDO_H -#define KERNEL_VDO_H - -#include "completion.h" -#include "types.h" -#include "vdo.h" -#include "vdoInternal.h" - -#include "kernelTypes.h" -#include "workQueue.h" - -enum { - REQ_Q_ACTION_COMPLETION, - REQ_Q_ACTION_FLUSH, - REQ_Q_ACTION_MAP_BIO, - REQ_Q_ACTION_SYNC, - REQ_Q_ACTION_VIO_CALLBACK -}; - -/** - * Make base threads. - * - * @param [in] vdo The vdo to be initialized - * @param [in] thread_name_prefix The per-device prefix to use in thread - * names - * @param [out] reason The reason for failure - * - * @return VDO_SUCCESS or an error code - **/ -int make_vdo_threads(struct vdo *vdo, - const char *thread_name_prefix, - char **reason); - -/** - * Resume the base VDO instance associated with the kernel layer. - * - * @param vdo The vdo to be resumed - * - * @return VDO_SUCCESS or an error - **/ -int resume_vdo(struct vdo *vdo); - - -/** - * Dump to the kernel log any work-queue info associated with the base code. - * - * @param vdo The vdo object to be examined - **/ -void dump_vdo_work_queue(struct vdo *vdo); - -/** - * Gets the latest statistics gathered by the base code. - * - * @param vdo the vdo object - * @param stats the statistics struct to fill in - **/ -void get_kvdo_statistics(struct vdo *vdo, - struct vdo_statistics *stats); - -/** - * Notify the base code of resized physical storage. 
- * - * @param vdo The vdo to be updated - * @param physical_count The new size - * - * @return VDO_SUCCESS or error - **/ -int vdo_resize_physical(struct vdo *vdo, block_count_t physical_count); - -/** - * Request the base code grow the logical space. - * - * @param vdo The vdo to be updated - * @param logical_count The new size - * - * @return VDO_SUCCESS or error - **/ -int vdo_resize_logical(struct vdo *vdo, block_count_t logical_count); - - -/** - * Enqueue a work item to be processed in the base code context. - * - * @param vdo The vdo object in which to run the work item - * @param item The work item to be run - * @param thread_id The thread on which to run the work item - **/ -void enqueue_vdo_work(struct vdo *vdo, - struct vdo_work_item *item, - thread_id_t thread_id); - -/** - * Set up and enqueue a vio's work item to be processed in the base code - * context. - * - * @param vio The vio with the work item to be run - * @param work The function pointer to execute - * @param stats_function A function pointer to record for stats, or NULL - * @param action Action code, mapping to a relative priority - **/ -void enqueue_vio(struct vio *vio, - vdo_work_function work, - void *stats_function, - unsigned int action); - -#endif // KERNEL_VDO_H diff --git a/vdo/kernelVDOInternals.h b/vdo/kernelVDOInternals.h deleted file mode 100644 index e8ae99db..00000000 --- a/vdo/kernelVDOInternals.h +++ /dev/null @@ -1,36 +0,0 @@ -/* - * Copyright Red Hat - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/sulfur/src/c++/vdo/kernel/kernelVDOInternals.h#2 $ - */ - -#ifndef KERNEL_VDO_INTERNALS_H -#define KERNEL_VDO_INTERNALS_H - -#include "kernelVDO.h" - -/** - * Enqueue a work item to be performed in the base code in a particular thread. - * - * @param thread The VDO thread on which to run the work item - * @param item The work item to be run - **/ -void enqueue_vdo_thread_work(struct vdo_thread *thread, - struct vdo_work_item *item); - -#endif // KERNEL_VDO_INTERNALS_H diff --git a/vdo/kvio.c b/vdo/kvio.c deleted file mode 100644 index e9db4b23..00000000 --- a/vdo/kvio.c +++ /dev/null @@ -1,167 +0,0 @@ -/* - * Copyright Red Hat - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. 
- * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/sulfur/src/c++/vdo/kernel/kvio.c#24 $ - */ - -#include "kvio.h" - - -#include "logger.h" -#include "memoryAlloc.h" -#include "permassert.h" - -#include "numUtils.h" -#include "vdo.h" -#include "vdoInternal.h" -#include "waitQueue.h" - -#include "bio.h" -#include "dataKVIO.h" -#include "ioSubmitter.h" - -/** - * A function to tell vdo that we have completed the requested async - * operation for a vio. - * - * @param item The work item of the vio to complete - **/ -static void vdo_handle_vio_callback(struct vdo_work_item *item) -{ - run_vdo_completion_callback(container_of(item, struct vdo_completion, - work_item)); -} - -/**********************************************************************/ -void enqueue_vio_callback(struct vio *vio) -{ - enqueue_vio(vio, - vdo_handle_vio_callback, - vio_as_completion(vio)->callback, - REQ_Q_ACTION_VIO_CALLBACK); -} - -/**********************************************************************/ -void continue_vio(struct vio *vio, int error) -{ - if (unlikely(error != VDO_SUCCESS)) { - set_vdo_completion_result(vio_as_completion(vio), error); - } - - enqueue_vio_callback(vio); -} - -/**********************************************************************/ -void write_compressed_block_vio(struct vio *vio) -{ - // This method assumes that compressed writes never set the flush or - // FUA bits. - struct bio *bio = vio->bio; - int result = ASSERT(is_compressed_write_vio(vio), - "Compressed write vio has correct type"); - if (result != VDO_SUCCESS) { - continue_vio(vio, result); - return; - } - - // Write the compressed block, using the compressed vio's own bio. - result = vdo_reset_bio_with_buffer(bio, - vio->data, - vio, - vdo_complete_async_bio, - REQ_OP_WRITE, - vio->physical); - if (result != VDO_SUCCESS) { - continue_vio(vio, result); - return; - } - - vdo_submit_bio(bio, BIO_Q_ACTION_COMPRESSED_DATA); -} - -/** - * Get the bio queue action for a metadata vio based on that vio's priority. - * - * @param vio The vio - * - * @return The action with which to submit the vio's bio. - **/ -static inline enum bio_q_action get_metadata_action(struct vio *vio) -{ - return ((vio->priority == VIO_PRIORITY_HIGH) ? BIO_Q_ACTION_HIGH : - BIO_Q_ACTION_METADATA); -} - -/**********************************************************************/ -void submit_metadata_vio(struct vio *vio) -{ - int result; - char *data = vio->data; - struct bio *bio = vio->bio; - unsigned int bi_opf; - if (is_read_vio(vio)) { - ASSERT_LOG_ONLY(!vio_requires_flush_before(vio), - "read vio does not require flush before"); - bi_opf = REQ_OP_READ; - } else if (vio_requires_flush_before(vio)) { - bi_opf = REQ_OP_WRITE | REQ_PREFLUSH; - } else { - bi_opf = REQ_OP_WRITE; - } - - if (vio_requires_flush_after(vio)) { - bi_opf |= REQ_FUA; - } - - /* - * Everything coming through this function is metadata, so flag it as - * REQ_META in case the lower layers benefit from that information. - * - * We believe all recovery journal and block map IO is important for - * throughput relative to other IO, so we tag them with REQ_PRIO to - * convey this to lower layers, if they care. - * - * Additionally, recovery journal IO is directly critical to user - * bio latency, so we tag them with REQ_SYNC. 
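The flag policy described in the comment above can be condensed into a single helper; this is an illustrative restatement only, not part of the patch, and the actual flag assignments follow immediately below. The helper name is assumed, and the read and REQ_PREFLUSH cases are omitted.

static unsigned int example_metadata_write_flags(struct vio *vio)
{
	/* Illustrative sketch mirroring the write-path logic below. */
	unsigned int bi_opf = REQ_OP_WRITE | REQ_META;

	if ((vio->type == VIO_TYPE_BLOCK_MAP_INTERIOR) ||
	    (vio->type == VIO_TYPE_BLOCK_MAP) ||
	    (vio->type == VIO_TYPE_RECOVERY_JOURNAL)) {
		bi_opf |= REQ_PRIO;	/* important for throughput */
	}

	if (vio->type == VIO_TYPE_RECOVERY_JOURNAL) {
		bi_opf |= REQ_SYNC;	/* critical to user bio latency */
	}

	if (vio_requires_flush_after(vio)) {
		bi_opf |= REQ_FUA;
	}

	return bi_opf;
}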
- **/ - bi_opf |= REQ_META; - if ((vio->type == VIO_TYPE_BLOCK_MAP_INTERIOR) || - (vio->type == VIO_TYPE_BLOCK_MAP) || - (vio->type == VIO_TYPE_RECOVERY_JOURNAL)) { - bi_opf |= REQ_PRIO; - } - - if (vio->type == VIO_TYPE_RECOVERY_JOURNAL) { - bi_opf |= REQ_SYNC; - } - - if (is_empty_flush_vio(vio)) { - data = NULL; - } - - result = vdo_reset_bio_with_buffer(bio, data, vio, - vdo_complete_async_bio, bi_opf, - vio->physical); - if (result != VDO_SUCCESS) { - continue_vio(vio, result); - return; - } - - // Perform the metadata IO, using the metadata vio's own bio. - vdo_submit_bio(bio, get_metadata_action(vio)); -} diff --git a/vdo/kvio.h b/vdo/kvio.h deleted file mode 100644 index c187fe8f..00000000 --- a/vdo/kvio.h +++ /dev/null @@ -1,95 +0,0 @@ -/* - * Copyright Red Hat - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/sulfur/src/c++/vdo/kernel/kvio.h#7 $ - */ - -#ifndef KVIO_H -#define KVIO_H - -#include "vio.h" - -#include "kernelLayer.h" -#include "workQueue.h" - -/** - * Enqueue a vio on a work queue. - * - * @param queue The queue - * @param vio The vio - **/ -static inline void enqueue_vio_work(struct vdo_work_queue *queue, - struct vio *vio) -{ - enqueue_work_queue(queue, work_item_from_vio(vio)); -} - -/** - * Set up the work item for a vio. - * - * @param vio The vio to set up - * @param work The function pointer to execute - * @param stats_function A function pointer to record for stats, or NULL - * @param action Action code, mapping to a relative priority - **/ -static inline void setup_vio_work(struct vio *vio, - vdo_work_function work, - void *stats_function, - unsigned int action) -{ - setup_work_item(work_item_from_vio(vio), - work, - stats_function, - action); -} - -/** - * Set up and enqueue a vio. - * - * @param vio The vio to set up - * @param work The function pointer to execute - * @param stats_function A function pointer to record for stats, or NULL - * @param action Action code, mapping to a relative priority - * @param queue The queue on which to enqueue the kvio - **/ -static inline void launch_vio(struct vio *vio, - vdo_work_function work, - void *stats_function, - unsigned int action, - struct vdo_work_queue *queue) -{ - setup_vio_work(vio, work, stats_function, action); - enqueue_vio_work(queue, vio); -} - -/** - * Move a vio back to the base threads. - * - * @param vio The vio to enqueue - **/ -void enqueue_vio_callback(struct vio *vio); - -/** - * Handles vio-related I/O post-processing. 
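A sketch of how the inline helpers above were typically used: point a vio's work item at a callback and enqueue it on a specific queue in one step. The callback body, its name, and the choice of queue are assumed; only launch_vio() and the REQ_Q_ACTION_VIO_CALLBACK action code come from the removed code.

static void example_do_vio_work(struct vdo_work_item *item)
{
	/* ... the actual work for this vio would go here ... */
}

static void example_launch(struct vio *vio, struct vdo_work_queue *queue)
{
	/* Sets up the vio's work item and enqueues it on the given queue. */
	launch_vio(vio, example_do_vio_work, NULL, REQ_Q_ACTION_VIO_CALLBACK,
		   queue);
}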
- * - * @param vio The vio to finalize - * @param error Possible error - **/ -void continue_vio(struct vio *vio, int error); - -#endif /* KVIO_H */ diff --git a/vdo/limiter.c b/vdo/limiter.c deleted file mode 100644 index ff3a1765..00000000 --- a/vdo/limiter.c +++ /dev/null @@ -1,146 +0,0 @@ -/* - * Copyright Red Hat - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/sulfur/src/c++/vdo/base/limiter.c#9 $ - */ - -#include "limiter.h" - -#include - -/**********************************************************************/ -void initialize_limiter(struct limiter *limiter, uint32_t limit) -{ - limiter->active = 0; - limiter->limit = limit; - limiter->maximum = 0; - init_waitqueue_head(&limiter->waiter_queue); - spin_lock_init(&limiter->lock); -} - -/**********************************************************************/ -bool limiter_is_idle(struct limiter *limiter) -{ - bool idle; - spin_lock(&limiter->lock); - idle = (limiter->active == 0); - spin_unlock(&limiter->lock); - return idle; -} - -/**********************************************************************/ -void limiter_release_many(struct limiter *limiter, uint32_t count) -{ - struct vdo_completion *completion = NULL; - - spin_lock(&limiter->lock); - WRITE_ONCE(limiter->active, limiter->active - count); - if (limiter->active == 0) { - completion = limiter->completion; - } - spin_unlock(&limiter->lock); - - if (waitqueue_active(&limiter->waiter_queue)) { - wake_up_nr(&limiter->waiter_queue, count); - return; - } - - if (completion == NULL) { - return; - } - - // Only take the lock a second time if we are releasing the completion. - spin_lock(&limiter->lock); - limiter->completion = NULL; - spin_unlock(&limiter->lock); - - complete_vdo_completion(completion); -} - -/**********************************************************************/ -void drain_vdo_limiter(struct limiter *limiter, - struct vdo_completion *completion) -{ - bool finished = false; - - spin_lock(&limiter->lock); - if (limiter->active == 0) { - finished = true; - } else if (limiter->completion == NULL) { - limiter->completion = completion; - } else { - set_vdo_completion_result(completion, VDO_COMPONENT_BUSY); - finished = true; - } - spin_unlock(&limiter->lock); - - if (finished) { - complete_vdo_completion(completion); - } -} - -/** - * Take one permit from the limiter, if one is available, and update - * the maximum active count if appropriate. - * - * The limiter's lock must already be locked. 
- * - * @param limiter The limiter to update - * - * @return true iff the permit was acquired - **/ -static bool take_permit_locked(struct limiter *limiter) -{ - if (limiter->active >= limiter->limit) { - return false; - } - WRITE_ONCE(limiter->active, limiter->active + 1); - if (limiter->active > limiter->maximum) { - WRITE_ONCE(limiter->maximum, limiter->active); - } - return true; -} - -/**********************************************************************/ -void limiter_wait_for_one_free(struct limiter *limiter) -{ - spin_lock(&limiter->lock); - while (!take_permit_locked(limiter)) { - DEFINE_WAIT(wait); - - prepare_to_wait_exclusive(&limiter->waiter_queue, - &wait, - TASK_UNINTERRUPTIBLE); - spin_unlock(&limiter->lock); - io_schedule(); - spin_lock(&limiter->lock); - finish_wait(&limiter->waiter_queue, &wait); - }; - spin_unlock(&limiter->lock); -} - -/**********************************************************************/ -bool limiter_poll(struct limiter *limiter) -{ - bool acquired; - spin_lock(&limiter->lock); - acquired = take_permit_locked(limiter); - - spin_unlock(&limiter->lock); - return acquired; -} diff --git a/vdo/limiter.h b/vdo/limiter.h deleted file mode 100644 index 5bb9d828..00000000 --- a/vdo/limiter.h +++ /dev/null @@ -1,114 +0,0 @@ -/* - * Copyright Red Hat - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/sulfur/src/c++/vdo/base/limiter.h#9 $ - */ - -#ifndef LIMITER_H -#define LIMITER_H - -#include - -#include "completion.h" - -/* - * A limiter is a fancy counter used to limit resource usage. We have a - * limit to number of resources that we are willing to use, and a limiter - * holds us to that limit. 
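A sketch of the intended protocol, not part of this patch: take a permit before using a limited resource and return it afterwards; quiescing is done separately by handing drain_vdo_limiter() a completion that fires once the active count reaches zero. Only the limiter_* calls come from this file; the callers are assumed.

static void example_do_limited_work(struct limiter *request_limiter)
{
	/* Blocks (uninterruptibly) until a permit is available. */
	limiter_wait_for_one_free(request_limiter);

	/* ... use the resource guarded by the limiter ... */

	/* Return the permit so a waiter or a pending drain can proceed. */
	limiter_release(request_limiter);
}

static bool example_try_limited_work(struct limiter *request_limiter)
{
	/* Non-blocking variant: true only if a permit was actually taken. */
	if (!limiter_poll(request_limiter)) {
		return false;
	}

	/* ... do the work ... */
	limiter_release(request_limiter);
	return true;
}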
- */ - -struct limiter { - // A spinlock controlling access to the contents of this struct - spinlock_t lock; - // The queue of threads waiting for a resource to become available - wait_queue_head_t waiter_queue; - // The number of resources in use - uint32_t active; - // The maximum number number of resources that have ever been in use - uint32_t maximum; - // The limit to the number of resources that are allowed to be used - uint32_t limit; - // A completion waiting for the limiter to become idle - struct vdo_completion *completion; -}; - -/** - * Initialize a limiter structure - * - * @param limiter The limiter - * @param limit The limit to the number of active resources - **/ -void initialize_limiter(struct limiter *limiter, uint32_t limit); - -/** - * Determine whether there are any active resources - * - * @param limiter The limiter - * - * @return true if there are no active resources - **/ -bool limiter_is_idle(struct limiter *limiter); - -/** - * Release resources, making them available for other uses - * - * @param limiter The limiter - * @param count The number of resources to release - **/ -void limiter_release_many(struct limiter *limiter, uint32_t count); - -/** - * Release one resource, making it available for another use - * - * @param limiter The limiter - **/ -static inline void limiter_release(struct limiter *limiter) -{ - limiter_release_many(limiter, 1); -} - -/** - * Wait asynchronously for there to be no active resources. - * - * @param limiter The limiter - * @param completion The completion to notify when the limiter is idle - **/ -void drain_vdo_limiter(struct limiter *limiter, - struct vdo_completion *completion); - -/** - * Prepare to start using one resource, waiting if there are too many resources - * already in use. After returning from this routine, the caller may use the - * resource, and must call limiter_release after freeing the resource. - * - * @param limiter The limiter - **/ -void limiter_wait_for_one_free(struct limiter *limiter); - -/** - * Attempt to reserve one resource, without waiting. After returning from this - * routine, if allocation was successful, the caller may use the resource, and - * must call limiter_release after freeing the resource. - * - * @param limiter The limiter - * - * @return true iff the resource was allocated - **/ -bool limiter_poll(struct limiter *limiter); - -#endif /* LIMITER_H */ diff --git a/vdo/linux/murmurhash3.h b/vdo/linux/murmurhash3.h new file mode 100644 index 00000000..9e0c0ba0 --- /dev/null +++ b/vdo/linux/murmurhash3.h @@ -0,0 +1,14 @@ +/* SPDX-License-Identifier: LGPL-2.1+ */ +/* + * MurmurHash3 was written by Austin Appleby, and is placed in the public + * domain. The author hereby disclaims copyright to this source code. + */ + +#ifndef _MURMURHASH3_H_ +#define _MURMURHASH3_H_ + +#include + +void murmurhash3_128(const void *key, int len, uint32_t seed, void *out); + +#endif /* _MURMURHASH3_H_ */ diff --git a/vdo/lockCounter.c b/vdo/lock-counter.c similarity index 52% rename from vdo/lockCounter.c rename to vdo/lock-counter.c index 23d33aab..9eb33ba6 100644 --- a/vdo/lockCounter.c +++ b/vdo/lock-counter.c @@ -1,34 +1,20 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * Copyright Red Hat - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. 
- * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/sulfur/src/c++/vdo/base/lockCounter.c#17 $ */ -#include "lockCounter.h" +#include "lock-counter.h" #include -#include "memoryAlloc.h" +#include "memory-alloc.h" #include "permassert.h" #include "vdo.h" /** + * DOC: + * * A lock_counter is intended to keep all of the locks for the blocks in the * recovery journal. The per-zone counters are all kept in a single array which * is arranged by zone (i.e. zone 0's lock 0 is at index 0, zone 0's lock 1 is @@ -42,9 +28,10 @@ * * Lock sets are laid out with the set for recovery journal first, followed by * the logical zones, and then the physical zones. - **/ + */ + enum lock_counter_state { - LOCK_COUNTER_STATE_NOT_NOTIFYING = 0, + LOCK_COUNTER_STATE_NOT_NOTIFYING, LOCK_COUNTER_STATE_NOTIFYING, LOCK_COUNTER_STATE_SUSPENDED, }; @@ -74,8 +61,21 @@ struct lock_counter { uint16_t *physical_counters; }; -/**********************************************************************/ -int make_vdo_lock_counter(struct vdo *vdo, +/** + * vdo_make_lock_counter() - Create a lock counter. + * + * @vdo: The VDO. + * @parent: The parent to notify when the lock count goes to zero. + * @callback: The function to call when the lock count goes to zero. + * @thread_id: The id of thread on which to run the callback. + * @logical_zones: The total number of logical zones. + * @physical_zones: The total number of physical zones. + * @locks: The number of locks. + * @lock_counter_ptr: A pointer to hold the new counter. + * + * Return: VDO_SUCCESS or an error. 
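For context, a sketch of how an owner might create its counter; only vdo_make_lock_counter() comes from this patch. The recovery-journal structure, its lock_counter field, the reap callback, and the journal_thread field of the thread config are all assumed for illustration.

static int example_make_journal_lock_counter(struct vdo *vdo,
					     struct recovery_journal *journal,
					     vdo_action reap_callback,
					     block_count_t journal_blocks)
{
	/* One lock per journal block; callback runs on the journal thread. */
	return vdo_make_lock_counter(vdo,
				     journal,
				     reap_callback,
				     vdo->thread_config->journal_thread,
				     vdo->thread_config->logical_zone_count,
				     vdo->thread_config->physical_zone_count,
				     journal_blocks,
				     &journal->lock_counter);
}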
+ */ +int vdo_make_lock_counter(struct vdo *vdo, void *parent, vdo_action callback, thread_id_t thread_id, @@ -87,6 +87,7 @@ int make_vdo_lock_counter(struct vdo *vdo, struct lock_counter *lock_counter; int result = UDS_ALLOCATE(1, struct lock_counter, __func__, &lock_counter); + if (result != VDO_SUCCESS) { return result; } @@ -94,48 +95,48 @@ int make_vdo_lock_counter(struct vdo *vdo, result = UDS_ALLOCATE(locks, uint16_t, __func__, &lock_counter->journal_counters); if (result != VDO_SUCCESS) { - free_vdo_lock_counter(lock_counter); + vdo_free_lock_counter(lock_counter); return result; } result = UDS_ALLOCATE(locks, atomic_t, __func__, &lock_counter->journal_decrement_counts); if (result != VDO_SUCCESS) { - free_vdo_lock_counter(lock_counter); + vdo_free_lock_counter(lock_counter); return result; } result = UDS_ALLOCATE(locks * logical_zones, uint16_t, __func__, &lock_counter->logical_counters); if (result != VDO_SUCCESS) { - free_vdo_lock_counter(lock_counter); + vdo_free_lock_counter(lock_counter); return result; } result = UDS_ALLOCATE(locks, atomic_t, __func__, &lock_counter->logical_zone_counts); if (result != VDO_SUCCESS) { - free_vdo_lock_counter(lock_counter); + vdo_free_lock_counter(lock_counter); return result; } result = UDS_ALLOCATE(locks * physical_zones, uint16_t, __func__, &lock_counter->physical_counters); if (result != VDO_SUCCESS) { - free_vdo_lock_counter(lock_counter); + vdo_free_lock_counter(lock_counter); return result; } result = UDS_ALLOCATE(locks, atomic_t, __func__, &lock_counter->physical_zone_counts); if (result != VDO_SUCCESS) { - free_vdo_lock_counter(lock_counter); + vdo_free_lock_counter(lock_counter); return result; } - initialize_vdo_completion(&lock_counter->completion, vdo, + vdo_initialize_completion(&lock_counter->completion, vdo, VDO_LOCK_COUNTER_COMPLETION); - set_vdo_completion_callback_with_parent(&lock_counter->completion, + vdo_set_completion_callback_with_parent(&lock_counter->completion, callback, thread_id, parent); @@ -146,8 +147,11 @@ int make_vdo_lock_counter(struct vdo *vdo, return VDO_SUCCESS; } -/**********************************************************************/ -void free_vdo_lock_counter(struct lock_counter *counter) +/** + * vdo_free_lock_counter() - Free a lock counter. + * @counter: The lock counter to free. + */ +void vdo_free_lock_counter(struct lock_counter *counter) { if (counter == NULL) { return; @@ -163,44 +167,44 @@ void free_vdo_lock_counter(struct lock_counter *counter) } /** - * Get a pointer to the zone count for a given lock on a given zone. + * get_zone_count_ptr() - Get a pointer to the zone count for a given lock + * on a given zone. + * @counter: The lock counter. + * @lock_number: The lock to get. + * @zone_type: The zone type whose count is desired. * - * @param counter The lock counter - * @param lock_number The lock to get - * @param zone_type The zone type whose count is desired - * - * @return A pointer to the zone count for the given lock and zone - **/ + * Return: A pointer to the zone count for the given lock and zone. + */ static inline atomic_t *get_zone_count_ptr(struct lock_counter *counter, block_count_t lock_number, enum vdo_zone_type zone_type) { - return ((zone_type == ZONE_TYPE_LOGICAL) + return ((zone_type == VDO_ZONE_TYPE_LOGICAL) ? &counter->logical_zone_counts[lock_number] : &counter->physical_zone_counts[lock_number]); } /** - * Get the zone counter for a given lock on a given zone. + * get_counter() - Get the zone counter for a given lock on a given zone. 
+ * @counter: The lock counter. + * @lock_number: The lock to get. + * @zone_type: The zone type whose count is desired. + * @zone_id: The zone index whose count is desired. * - * @param counter The lock counter - * @param lock_number The lock to get - * @param zone_type The zone type whose count is desired - * @param zone_id The zone index whose count is desired - * - * @return The counter for the given lock and zone - **/ + * Return: The counter for the given lock and zone. + */ static inline uint16_t *get_counter(struct lock_counter *counter, block_count_t lock_number, enum vdo_zone_type zone_type, zone_count_t zone_id) { block_count_t zone_counter = (counter->locks * zone_id) + lock_number; - if (zone_type == ZONE_TYPE_JOURNAL) { + + if (zone_type == VDO_ZONE_TYPE_JOURNAL) { return &counter->journal_counters[zone_counter]; } - if (zone_type == ZONE_TYPE_LOGICAL) { + if (zone_type == VDO_ZONE_TYPE_LOGICAL) { return &counter->logical_counters[zone_counter]; } @@ -208,18 +212,18 @@ static inline uint16_t *get_counter(struct lock_counter *counter, } /** - * Check whether the journal zone is locked for a given lock. - * - * @param counter The lock_counter - * @param lock_number The lock to check + * is_journal_zone_locked() - Check whether the journal zone is locked for + * a given lock. + * @counter: The lock_counter. + * @lock_number: The lock to check. * - * @return true if the journal zone is locked - **/ + * Return: true if the journal zone is locked. + */ static bool is_journal_zone_locked(struct lock_counter *counter, block_count_t lock_number) { uint16_t journal_value = - *(get_counter(counter, lock_number, ZONE_TYPE_JOURNAL, 0)); + *(get_counter(counter, lock_number, VDO_ZONE_TYPE_JOURNAL, 0)); uint32_t decrements = atomic_read(&(counter->journal_decrement_counts[lock_number])); smp_rmb(); @@ -229,16 +233,26 @@ static bool is_journal_zone_locked(struct lock_counter *counter, return (journal_value != decrements); } -/**********************************************************************/ -bool is_vdo_lock_locked(struct lock_counter *lock_counter, +/** + * vdo_is_lock_locked() - Check whether a lock is locked for a zone type. + * @lock_counter: The set of locks to check. + * @lock_number: The lock to check. + * @zone_type: The type of the zone. + * + * If the recovery journal has a lock on the lock number, both logical + * and physical zones are considered locked. + * + * Return: true if the specified lock has references (is locked). + */ +bool vdo_is_lock_locked(struct lock_counter *lock_counter, block_count_t lock_number, enum vdo_zone_type zone_type) { atomic_t *zone_count; bool locked; - ASSERT_LOG_ONLY((zone_type != ZONE_TYPE_JOURNAL), - "is_vdo_lock_locked() called for non-journal zone"); + ASSERT_LOG_ONLY((zone_type != VDO_ZONE_TYPE_JOURNAL), + "vdo_is_lock_locked() called for non-journal zone"); if (is_journal_zone_locked(lock_counter, lock_number)) { return true; } @@ -250,11 +264,10 @@ bool is_vdo_lock_locked(struct lock_counter *lock_counter, } /** - * Check that we are on the journal thread. - * - * @param counter The lock_counter - * @param caller The name of the caller (for logging) - **/ + * assert_on_journal_thread() - Check that we are on the journal thread. + * @counter: The lock_counter. + * @caller: The name of the caller (for logging). 
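A usage sketch for vdo_is_lock_locked(), not part of this patch: before recycling a journal block, its owner can confirm that neither zone type still holds the corresponding lock. VDO_ZONE_TYPE_PHYSICAL is assumed to be the physical counterpart of the logical zone type used above.

static bool example_lock_is_free(struct lock_counter *counter,
				 block_count_t lock_number)
{
	/*
	 * A journal-zone hold makes both checks return true, so this is
	 * false until every zone has released the lock.
	 */
	return !vdo_is_lock_locked(counter, lock_number,
				   VDO_ZONE_TYPE_LOGICAL) &&
	       !vdo_is_lock_locked(counter, lock_number,
				   VDO_ZONE_TYPE_PHYSICAL);
}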
+ */ static void assert_on_journal_thread(struct lock_counter *counter, const char *caller) { @@ -263,8 +276,16 @@ static void assert_on_journal_thread(struct lock_counter *counter, "%s() called from journal zone", caller); } -/**********************************************************************/ -void initialize_vdo_lock_count(struct lock_counter *counter, +/** + * vdo_initialize_lock_count() - Initialize the value of the journal zone's + * counter for a given lock. + * @counter: The counter to initialize. + * @lock_number: Which lock to initialize. + * @value: The value to set. + * + * Context: This must be called from the journal zone. + */ +void vdo_initialize_lock_count(struct lock_counter *counter, block_count_t lock_number, uint16_t value) { @@ -273,7 +294,7 @@ void initialize_vdo_lock_count(struct lock_counter *counter, assert_on_journal_thread(counter, __func__); journal_value = - get_counter(counter, lock_number, ZONE_TYPE_JOURNAL, 0); + get_counter(counter, lock_number, VDO_ZONE_TYPE_JOURNAL, 0); decrement_count = &(counter->journal_decrement_counts[lock_number]); ASSERT_LOG_ONLY((*journal_value == atomic_read(decrement_count)), "count to be initialized not in use"); @@ -282,14 +303,24 @@ void initialize_vdo_lock_count(struct lock_counter *counter, atomic_set(decrement_count, 0); } -/**********************************************************************/ -void acquire_vdo_lock_count_reference(struct lock_counter *counter, +/** + * vdo_acquire_lock_count_reference() - Acquire a reference to a given lock + * in the specified zone. + * @counter: The lock_counter. + * @lock_number: Which lock to increment. + * @zone_type: The type of the zone acquiring the reference. + * @zone_id: The ID of the zone acquiring the reference. + * + * Context: This method must not be used from the journal zone. + */ +void vdo_acquire_lock_count_reference(struct lock_counter *counter, block_count_t lock_number, enum vdo_zone_type zone_type, zone_count_t zone_id) { uint16_t *current_value; - ASSERT_LOG_ONLY((zone_type != ZONE_TYPE_JOURNAL), + + ASSERT_LOG_ONLY((zone_type != VDO_ZONE_TYPE_JOURNAL), "invalid lock count increment from journal zone"); current_value = get_counter(counter, lock_number, zone_type, zone_id); @@ -297,9 +328,11 @@ void acquire_vdo_lock_count_reference(struct lock_counter *counter, "increment of lock counter must not overflow"); if (*current_value == 0) { - // This zone is acquiring this lock for the first time. - // Extra barriers because this was original developed using - // an atomic add operation that implicitly had them. + /* + * This zone is acquiring this lock for the first time. + * Extra barriers because this was original developed using + * an atomic add operation that implicitly had them. + */ smp_mb__before_atomic(); atomic_inc(get_zone_count_ptr(counter, lock_number, zone_type)); @@ -309,15 +342,14 @@ void acquire_vdo_lock_count_reference(struct lock_counter *counter, } /** - * Decrement a non-atomic counter. - * - * @param counter The lock_counter - * @param lock_number Which lock to decrement - * @param zone_type The type of the zone releasing the reference - * @param zone_id The ID of the zone releasing the reference + * release_reference() - Decrement a non-atomic counter. + * @counter: The lock_counter. + * @lock_number: Which lock to decrement. + * @zone_type: The type of the zone releasing the reference. + * @zone_id: The ID of the zone releasing the reference. * - * @return The new value of the counter - **/ + * Return: The new value of the counter. 
+ */ static uint16_t release_reference(struct lock_counter *counter, block_count_t lock_number, enum vdo_zone_type zone_type, @@ -333,18 +365,21 @@ static uint16_t release_reference(struct lock_counter *counter, } /** - * Attempt to notify the owner of this lock_counter that some lock has been - * released for some zone type. Will do nothing if another notification is - * already in progress. + * attempt_notification() - Attempt to notify the owner of this lock_counter + * that some lock has been released for some zone + * type. + * @counter: The lock_counter. * - * @param counter The lock_counter - **/ + * Will do nothing if another notification is already in progress. + */ static void attempt_notification(struct lock_counter *counter) { int prior_state; - // Extra barriers because this was original developed using - // a CAS operation that implicitly had them. + /* + * Extra barriers because this was original developed using + * a CAS operation that implicitly had them. + */ smp_mb__before_atomic(); prior_state = atomic_cmpxchg(&counter->state, LOCK_COUNTER_STATE_NOT_NOTIFYING, @@ -355,19 +390,28 @@ static void attempt_notification(struct lock_counter *counter) return; } - reset_vdo_completion(&counter->completion); - invoke_vdo_completion_callback(&counter->completion); + vdo_reset_completion(&counter->completion); + vdo_invoke_completion_callback(&counter->completion); } -/**********************************************************************/ -void release_vdo_lock_count_reference(struct lock_counter *counter, +/** + * vdo_release_lock_count_reference() - Release a reference to a given lock + * in the specified zone. + * @counter: The lock_counter. + * @lock_number: Which lock to increment. + * @zone_type: The type of the zone releasing the reference. + * @zone_id: The ID of the zone releasing the reference. + * + * Context: This method must not be used from the journal zone. + */ +void vdo_release_lock_count_reference(struct lock_counter *counter, block_count_t lock_number, enum vdo_zone_type zone_type, zone_count_t zone_id) { atomic_t *zone_count; - ASSERT_LOG_ONLY((zone_type != ZONE_TYPE_JOURNAL), + ASSERT_LOG_ONLY((zone_type != VDO_ZONE_TYPE_JOURNAL), "invalid lock count decrement from journal zone"); if (release_reference(counter, lock_number, zone_type, zone_id) != 0) { return; @@ -375,52 +419,88 @@ void release_vdo_lock_count_reference(struct lock_counter *counter, zone_count = get_zone_count_ptr(counter, lock_number, zone_type); if (atomic_add_return(-1, zone_count) == 0) { - // This zone was the last lock holder of its type, so try to - // notify the owner. + /* + * This zone was the last lock holder of its type, so try to + * notify the owner. + */ attempt_notification(counter); } } -/**********************************************************************/ -void release_vdo_journal_zone_reference(struct lock_counter *counter, +/** + * vdo_release_journal_zone_reference() - Release a single journal zone + * reference from the journal zone. + * @counter: The counter from which to release a reference. + * @lock_number: The lock from which to release a reference. + * + * Context: This method must be called from the journal zone. 
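A sketch of the acquire/release pairing from a non-journal zone, not part of this patch: where lock_number comes from and what work happens under the lock are assumed. The per-zone counts are plain uint16_t counters, so calls for a given zone are expected to stay on that zone's thread.

static void example_hold_lock_from_logical_zone(struct lock_counter *counter,
						block_count_t lock_number,
						zone_count_t zone_id)
{
	vdo_acquire_lock_count_reference(counter, lock_number,
					 VDO_ZONE_TYPE_LOGICAL, zone_id);

	/* ... work covered by this recovery journal lock ... */

	vdo_release_lock_count_reference(counter, lock_number,
					 VDO_ZONE_TYPE_LOGICAL, zone_id);
}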
+ */ +void vdo_release_journal_zone_reference(struct lock_counter *counter, block_count_t lock_number) { assert_on_journal_thread(counter, __func__); - release_reference(counter, lock_number, ZONE_TYPE_JOURNAL, 0); + release_reference(counter, lock_number, VDO_ZONE_TYPE_JOURNAL, 0); if (!is_journal_zone_locked(counter, lock_number)) { - // The journal zone is not locked, so try to notify the owner. + /* The journal zone is not locked, so try to notify the owner. */ attempt_notification(counter); } } -/**********************************************************************/ +/** + * vdo_release_journal_zone_reference_from_other_zone() - Release a single + * journal zone + * reference from any + * zone. + * @counter: The counter from which to release a reference. + * @lock_number: The lock from which to release a reference. + * + * Context: This method shouldn't be called from the journal zone as + * it would be inefficient; use vdo_release_journal_zone_reference() + * instead. + */ void -release_vdo_journal_zone_reference_from_other_zone(struct lock_counter *counter, +vdo_release_journal_zone_reference_from_other_zone(struct lock_counter *counter, block_count_t lock_number) { - // Extra barriers because this was original developed using - // an atomic add operation that implicitly had them. + /* + * Extra barriers because this was original developed using + * an atomic add operation that implicitly had them. + */ smp_mb__before_atomic(); atomic_inc(&(counter->journal_decrement_counts[lock_number])); smp_mb__after_atomic(); } -/**********************************************************************/ -void acknowledge_vdo_lock_unlock(struct lock_counter *counter) +/** + * vdo_acknowledge_lock_unlock() - Inform a lock counter that an unlock + * notification was received by the caller. + * + * @counter: The counter to inform. + */ +void vdo_acknowledge_lock_unlock(struct lock_counter *counter) { smp_wmb(); atomic_set(&counter->state, LOCK_COUNTER_STATE_NOT_NOTIFYING); } -/**********************************************************************/ -bool suspend_vdo_lock_counter(struct lock_counter *counter) +/** + * vdo_suspend_lock_counter() - Prevent the lock counter from issuing + * notifications. + * @counter: The counter. + * + * Return: true if the lock counter was not notifying and hence + * the suspend was efficacious. + */ +bool vdo_suspend_lock_counter(struct lock_counter *counter) { int prior_state; assert_on_journal_thread(counter, __func__); - // Extra barriers because this was original developed using - // a CAS operation that implicitly had them. + /* + * Extra barriers because this was original developed using + * a CAS operation that implicitly had them. + */ smp_mb__before_atomic(); prior_state = atomic_cmpxchg(&counter->state, LOCK_COUNTER_STATE_NOT_NOTIFYING, @@ -431,15 +511,23 @@ bool suspend_vdo_lock_counter(struct lock_counter *counter) || (prior_state == LOCK_COUNTER_STATE_NOT_NOTIFYING)); } -/**********************************************************************/ -bool resume_vdo_lock_counter(struct lock_counter *counter) +/** + * vdo_resume_lock_counter() - Re-allow notifications from a suspended lock + * counter. + * @counter: The counter. + * + * Return: true if the lock counter was suspended. + */ +bool vdo_resume_lock_counter(struct lock_counter *counter) { int prior_state; assert_on_journal_thread(counter, __func__); - // Extra barriers because this was original developed using - // a CAS operation that implicitly had them. 
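The unlock notification acknowledged above is delivered by invoking the completion callback registered with vdo_make_lock_counter(); a sketch of such an owner callback follows. Only vdo_acknowledge_lock_unlock() comes from this patch; the parent type, its lock_counter field, and the reaping step are assumed.

static void example_notification_callback(struct vdo_completion *completion)
{
	/* The parent registered at creation time is the counter's owner. */
	struct recovery_journal *journal = completion->parent;

	/* ... find which locks have dropped to zero and reap them ... */

	/* Re-arm the counter so it may issue the next notification. */
	vdo_acknowledge_lock_unlock(journal->lock_counter);
}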
+ /* + * Extra barriers because this was original developed using + * a CAS operation that implicitly had them. + */ smp_mb__before_atomic(); prior_state = atomic_cmpxchg(&counter->state, LOCK_COUNTER_STATE_SUSPENDED, diff --git a/vdo/lock-counter.h b/vdo/lock-counter.h new file mode 100644 index 00000000..8f201d5f --- /dev/null +++ b/vdo/lock-counter.h @@ -0,0 +1,72 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright Red Hat + */ + +#ifndef LOCK_COUNTER_H +#define LOCK_COUNTER_H + +#include "completion.h" +#include "types.h" + +/** + * DOC: Lock Counters. + * + * A lock_counter provides a set of shared reference count locks which is safe + * across multiple zones with a minimum of cross-thread synchronization + * operations. For each lock in the set, it maintains a set of per-zone lock + * counts, and a single, atomic count of the number of zones holding locks. + * Whenever a zone's individual counter for a lock goes from 0 to 1, the + * zone count for that lock is incremented. Whenever a zone's individual + * counter for a lock goes from 1 to 0, the zone count for that lock is + * decremented. If the zone count goes to 0, and the lock counter's + * completion is not in use, the completion is launched to inform the counter's + * owner that some lock has been released. It is the owner's responsibility to + * check for which locks have been released, and to inform the lock counter + * that it has received the notification by calling + * vdo_acknowledge_lock_unlock(). + */ + +int __must_check vdo_make_lock_counter(struct vdo *vdo, + void *parent, + vdo_action callback, + thread_id_t thread_id, + zone_count_t logical_zones, + zone_count_t physical_zones, + block_count_t locks, + struct lock_counter **lock_counter_ptr); + +void vdo_free_lock_counter(struct lock_counter *counter); + +bool __must_check vdo_is_lock_locked(struct lock_counter *lock_counter, + block_count_t lock_number, + enum vdo_zone_type zone_type); + +void vdo_initialize_lock_count(struct lock_counter *counter, + block_count_t lock_number, + uint16_t value); + +void vdo_acquire_lock_count_reference(struct lock_counter *counter, + block_count_t lock_number, + enum vdo_zone_type zone_type, + zone_count_t zone_id); + +void vdo_release_lock_count_reference(struct lock_counter *counter, + block_count_t lock_number, + enum vdo_zone_type zone_type, + zone_count_t zone_id); + +void vdo_release_journal_zone_reference(struct lock_counter *counter, + block_count_t lock_number); + +void +vdo_release_journal_zone_reference_from_other_zone(struct lock_counter *counter, + block_count_t lock_number); + +void vdo_acknowledge_lock_unlock(struct lock_counter *counter); + +bool __must_check vdo_suspend_lock_counter(struct lock_counter *counter); + +bool __must_check vdo_resume_lock_counter(struct lock_counter *counter); + +#endif /* LOCK_COUNTER_H */ diff --git a/vdo/lockCounter.h b/vdo/lockCounter.h deleted file mode 100644 index bae66689..00000000 --- a/vdo/lockCounter.h +++ /dev/null @@ -1,180 +0,0 @@ -/* - * Copyright Red Hat - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. 
- * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/sulfur/src/c++/vdo/base/lockCounter.h#7 $ - */ - -#ifndef LOCK_COUNTER_H -#define LOCK_COUNTER_H - -#include "completion.h" -#include "types.h" - -/** - * A lock_counter provides a set of shared reference count locks which is safe - * across multiple zones with a minimum of cross-thread synchronization - * operations. For each lock in the set, it maintains a set of per-zone lock - * counts, and a single, atomic count of the number of zones holding locks. - * Whenever a zone's individual counter for a lock goes from 0 to 1, the - * zone count for that lock is incremented. Whenever a zone's individual - * counter for a lock goes from 1 to 0, the zone count for that lock is - * decremented. If the zone count goes to 0, and the lock counter's - * completion is not in use, the completion is launched to inform the counter's - * owner that some lock has been released. It is the owner's responsibility to - * check for which locks have been released, and to inform the lock counter - * that it has received the notification by calling - * acknowledge_vdo_lock_unlock(). - **/ - -/** - * Create a lock counter. - * - * @param [in] vdo The VDO - * @param [in] parent The parent to notify when the lock count goes - * to zero - * @param [in] callback The function to call when the lock count goes - * to zero - * @param [in] thread_id The id of thread on which to run the callback - * @param [in] logical_zones The total number of logical zones - * @param [in] physical_zones The total number of physical zones - * @param [in] locks The number of locks - * @param [out] lock_counter_ptr A pointer to hold the new counter - * - * @return VDO_SUCCESS or an error - **/ -int __must_check make_vdo_lock_counter(struct vdo *vdo, - void *parent, - vdo_action callback, - thread_id_t thread_id, - zone_count_t logical_zones, - zone_count_t physical_zones, - block_count_t locks, - struct lock_counter **lock_counter_ptr); - -/** - * Free a lock counter. - * - * @param counter The lock counter to free - **/ -void free_vdo_lock_counter(struct lock_counter *counter); - -/** - * Check whether a lock is locked for a zone type. If the recovery journal has - * a lock on the lock number, both logical and physical zones are considered - * locked. - * - * @param lock_counter The set of locks to check - * @param lock_number The lock to check - * @param zone_type The type of the zone - * - * @return true if the specified lock has references (is locked) - **/ -bool __must_check is_vdo_lock_locked(struct lock_counter *lock_counter, - block_count_t lock_number, - enum vdo_zone_type zone_type); - -/** - * Initialize the value of the journal zone's counter for a given lock. This - * must be called from the journal zone. - * - * @param counter The counter to initialize - * @param lock_number Which lock to initialize - * @param value The value to set - **/ -void initialize_vdo_lock_count(struct lock_counter *counter, - block_count_t lock_number, - uint16_t value); - -/** - * Acquire a reference to a given lock in the specified zone. This method must - * not be used from the journal zone. 
- * - * @param counter The lock_counter - * @param lock_number Which lock to increment - * @param zone_type The type of the zone acquiring the reference - * @param zone_id The ID of the zone acquiring the reference - **/ -void acquire_vdo_lock_count_reference(struct lock_counter *counter, - block_count_t lock_number, - enum vdo_zone_type zone_type, - zone_count_t zone_id); - -/** - * Release a reference to a given lock in the specified zone. This method - * must not be used from the journal zone. - * - * @param counter The lock_counter - * @param lock_number Which lock to increment - * @param zone_type The type of the zone releasing the reference - * @param zone_id The ID of the zone releasing the reference - **/ -void release_vdo_lock_count_reference(struct lock_counter *counter, - block_count_t lock_number, - enum vdo_zone_type zone_type, - zone_count_t zone_id); - -/** - * Release a single journal zone reference from the journal zone. This method - * must be called from the journal zone. - * - * @param counter The counter from which to release a reference - * @param lock_number The lock from which to release a reference - **/ -void release_vdo_journal_zone_reference(struct lock_counter *counter, - block_count_t lock_number); - -/** - * Release a single journal zone reference from any zone. This method shouldn't - * be called from the journal zone as it would be inefficient; use - * release_vdo_journal_zone_reference() instead. - * - * @param counter The counter from which to release a reference - * @param lock_number The lock from which to release a reference - **/ -void -release_vdo_journal_zone_reference_from_other_zone(struct lock_counter *counter, - block_count_t lock_number); - -/** - * Inform a lock counter that an unlock notification was received by the - * caller. - * - * @param counter The counter to inform - **/ -void acknowledge_vdo_lock_unlock(struct lock_counter *counter); - -/** - * Prevent the lock counter from issuing notifications. - * - * @param counter The counter - * - * @return true if the lock counter was not notifying and hence - * the suspend was efficacious - **/ -bool __must_check suspend_vdo_lock_counter(struct lock_counter *counter); - -/** - * Re-allow notifications from a suspended lock counter. - * - * @param counter The counter - * - * @return true if the lock counter was suspended - **/ -bool __must_check resume_vdo_lock_counter(struct lock_counter *counter); - -#endif // LOCK_COUNTER_H diff --git a/uds/loggerLinuxKernel.c b/vdo/logger.c similarity index 56% rename from uds/loggerLinuxKernel.c rename to vdo/logger.c index a2d88090..2154fb10 100644 --- a/uds/loggerLinuxKernel.c +++ b/vdo/logger.c @@ -1,33 +1,84 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * Copyright Red Hat - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. 
- * - * $Id: //eng/uds-releases/krusty/kernelLinux/uds/loggerLinuxKernel.c#20 $ */ +#include "logger.h" + #include #include #include #include -#include "logger.h" -#include "threadDevice.h" +#include "thread-device.h" +#include "uds-threads.h" + +struct priority_name { + const char *name; + const int priority; +}; + +static const struct priority_name PRIORITIES[] = { + { "ALERT", UDS_LOG_ALERT }, + { "CRITICAL", UDS_LOG_CRIT }, + { "CRIT", UDS_LOG_CRIT }, + { "DEBUG", UDS_LOG_DEBUG }, + { "EMERGENCY", UDS_LOG_EMERG }, + { "EMERG", UDS_LOG_EMERG }, + { "ERROR", UDS_LOG_ERR }, + { "ERR", UDS_LOG_ERR }, + { "INFO", UDS_LOG_INFO }, + { "NOTICE", UDS_LOG_NOTICE }, + { "PANIC", UDS_LOG_EMERG }, + { "WARN", UDS_LOG_WARNING }, + { "WARNING", UDS_LOG_WARNING }, + { NULL, -1 }, +}; + +static const char *const PRIORITY_STRINGS[] = { + "EMERGENCY", + "ALERT", + "CRITICAL", + "ERROR", + "WARN", + "NOTICE", + "INFO", + "DEBUG", +}; + +static int log_level = UDS_LOG_INFO; + +int get_uds_log_level(void) +{ + return log_level; +} + +void set_uds_log_level(int new_log_level) +{ + log_level = new_log_level; +} + +int uds_log_string_to_priority(const char *string) +{ + int i; + + for (i = 0; PRIORITIES[i].name != NULL; i++) { + if (strcasecmp(string, PRIORITIES[i].name) == 0) { + return PRIORITIES[i].priority; + } + } + return UDS_LOG_INFO; +} + +const char *uds_log_priority_to_string(int priority) +{ + if ((priority < 0) || + (priority >= (int) ARRAY_SIZE(PRIORITY_STRINGS))) { + return "unknown"; + } + return PRIORITY_STRINGS[priority]; +} -/**********************************************************************/ static const char *priority_to_log_level(int priority) { switch (priority) { @@ -50,7 +101,6 @@ static const char *priority_to_log_level(int priority) } } -/**********************************************************************/ static const char *get_current_interrupt_type(void) { if (in_nmi()) { @@ -90,17 +140,22 @@ static void emit_log_message(const char *level, { int device_instance; - // In interrupt context, identify the interrupt type and module. - // Ignore the process/thread since it could be anything. + /* + * In interrupt context, identify the interrupt type and module. + * Ignore the process/thread since it could be anything. + */ if (in_interrupt()) { const char *type = get_current_interrupt_type(); + printk("%s%s[%s]: %s%pV%pV\n", level, module, type, prefix, vaf1, vaf2); return; } - // Not at interrupt level; we have a process we can look at, and - // might have a device ID. + /* + * Not at interrupt level; we have a process we can look at, and + * might have a device ID. + */ device_instance = uds_get_thread_device_id(); if (device_instance >= 0) { printk("%s%s%u:%s: %s%pV%pV\n", @@ -114,8 +169,10 @@ static void emit_log_message(const char *level, return; } - // If it's a kernel thread and the module name is a prefix of its - // name, assume it is ours and only identify the thread. + /* + * If it's a kernel thread and the module name is a prefix of its + * name, assume it is ours and only identify the thread. + */ if (((current->flags & PF_KTHREAD) != 0) && (strncmp(module, current->comm, strlen(module)) == 0)) { printk("%s%s: %s%pV%pV\n", @@ -123,12 +180,11 @@ static void emit_log_message(const char *level, return; } - // Identify the module and the process. + /* Identify the module and the process. 
*/ printk("%s%s: %s: %s%pV%pV\n", level, module, current->comm, prefix, vaf1, vaf2); } -/**********************************************************************/ void uds_log_message_pack(int priority, const char *module, const char *prefix, @@ -174,7 +230,55 @@ void uds_log_message_pack(int priority, va_end(args2_copy); } -/**********************************************************************/ +void uds_log_embedded_message(int priority, + const char *module, + const char *prefix, + const char *fmt1, + va_list args1, + const char *fmt2, + ...) +{ + va_list ap; + + va_start(ap, fmt2); + uds_log_message_pack(priority, module, prefix, fmt1, args1, fmt2, ap); + va_end(ap); +} + +int uds_vlog_strerror(int priority, + int errnum, + const char *module, + const char *format, + va_list args) +{ + char errbuf[UDS_MAX_ERROR_MESSAGE_SIZE]; + const char *message = uds_string_error(errnum, errbuf, sizeof(errbuf)); + + uds_log_embedded_message(priority, + module, + NULL, + format, + args, + ": %s (%d)", + message, + errnum); + return errnum; +} + +int __uds_log_strerror(int priority, + int errnum, + const char *module, + const char *format, + ...) +{ + va_list args; + + va_start(args, format); + uds_vlog_strerror(priority, errnum, module, format, args); + va_end(args); + return errnum; +} + void uds_log_backtrace(int priority) { if (priority > get_uds_log_level()) { @@ -183,7 +287,6 @@ void uds_log_backtrace(int priority) dump_stack(); } -/**********************************************************************/ void __uds_log_message(int priority, const char *module, const char *format, @@ -197,10 +300,8 @@ void __uds_log_message(int priority, va_end(args); } -/**********************************************************************/ void uds_pause_for_logger(void) { - // Hopefully, a few milliseconds of sleep will be large enough - // for the kernel log buffer to be flushed. - msleep(4); + /* Allow a few milliseconds for the kernel log buffer to be flushed. */ + fsleep(4000); } diff --git a/uds/logger.h b/vdo/logger.h similarity index 85% rename from uds/logger.h rename to vdo/logger.h index a84465c6..52fb6ecd 100644 --- a/uds/logger.h +++ b/vdo/logger.h @@ -1,22 +1,6 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ /* * Copyright Red Hat - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. 
- * - * $Id: //eng/uds-releases/krusty/src/uds/logger.h#28 $ */ #ifndef LOGGER_H @@ -24,7 +8,6 @@ #include #include -#include #define UDS_LOG_EMERG 0 /* system is unusable */ #define UDS_LOG_ALERT 1 /* action must be taken immediately */ @@ -35,9 +18,9 @@ #define UDS_LOG_INFO 6 /* informational */ #define UDS_LOG_DEBUG 7 /* debug-level messages */ -#if defined(__KERNEL__) && defined(MODULE) +#if defined(MODULE) #define UDS_LOGGING_MODULE_NAME THIS_MODULE->name -#else // either userspace or compiled into the kernel +#else /* compiled into the kernel */ #define UDS_LOGGING_MODULE_NAME "vdo" #endif @@ -201,23 +184,18 @@ int uds_vlog_strerror(int priority, #define uds_log_error_strerror(errnum, ...) \ uds_log_strerror(UDS_LOG_ERR, errnum, __VA_ARGS__); -/**********************************************************************/ #define uds_log_debug_strerror(errnum, ...) \ uds_log_strerror(UDS_LOG_DEBUG, errnum, __VA_ARGS__); -/**********************************************************************/ #define uds_log_info_strerror(errnum, ...) \ uds_log_strerror(UDS_LOG_INFO, errnum, __VA_ARGS__); -/**********************************************************************/ #define uds_log_notice_strerror(errnum, ...) \ uds_log_strerror(UDS_LOG_NOTICE, errnum, __VA_ARGS__); -/**********************************************************************/ #define uds_log_warning_strerror(errnum, ...) \ uds_log_strerror(UDS_LOG_WARNING, errnum, __VA_ARGS__); -/**********************************************************************/ #define uds_log_fatal_strerror(errnum, ...) \ uds_log_strerror(UDS_LOG_CRIT, errnum, __VA_ARGS__); diff --git a/vdo/logical-zone.c b/vdo/logical-zone.c new file mode 100644 index 00000000..02877f68 --- /dev/null +++ b/vdo/logical-zone.c @@ -0,0 +1,423 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright Red Hat + */ + +#include "logical-zone.h" + +#include "logger.h" +#include "memory-alloc.h" +#include "permassert.h" + +#include "action-manager.h" +#include "admin-state.h" +#include "allocation-selector.h" +#include "block-map.h" +#include "completion.h" +#include "constants.h" +#include "data-vio.h" +#include "flush.h" +#include "int-map.h" +#include "vdo.h" + +/** + * as_logical_zone() - Convert a generic vdo_completion to a logical_zone. + * @completion: The completion to convert. + * + * Return: The completion as a logical_zone. + */ +static struct logical_zone *as_logical_zone(struct vdo_completion *completion) +{ + vdo_assert_completion_type(completion->type, + VDO_GENERATION_FLUSHED_COMPLETION); + return container_of(completion, struct logical_zone, completion); +} + +/** + * get_thread_id_for_zone() - Implements vdo_zone_thread_getter. + */ +static thread_id_t get_thread_id_for_zone(void *context, + zone_count_t zone_number) +{ + struct logical_zones *zones = context; + + return zones->zones[zone_number].thread_id; +} + +/** + * initialize_zone() - Initialize a logical zone. + * @zones: The logical_zones to which this zone belongs. + * @zone_number: The logical_zone's index. 
+ */ +static int initialize_zone(struct logical_zones *zones, + zone_count_t zone_number) +{ + int result; + struct vdo *vdo = zones->vdo; + struct logical_zone *zone = &zones->zones[zone_number]; + zone_count_t physical_zone_count = + vdo->thread_config->physical_zone_count; + + result = make_int_map(VDO_LOCK_MAP_CAPACITY, 0, &zone->lbn_operations); + if (result != VDO_SUCCESS) { + return result; + } + + if (zone_number < vdo->thread_config->logical_zone_count - 1) { + zone->next = &zones->zones[zone_number + 1]; + } + + vdo_initialize_completion(&zone->completion, vdo, + VDO_GENERATION_FLUSHED_COMPLETION); + zone->zones = zones; + zone->zone_number = zone_number; + zone->thread_id = vdo_get_logical_zone_thread(vdo->thread_config, + zone_number); + zone->block_map_zone = &vdo->block_map->zones[zone_number]; + INIT_LIST_HEAD(&zone->write_vios); + vdo_set_admin_state_code(&zone->state, + VDO_ADMIN_STATE_NORMAL_OPERATION); + + result = vdo_make_allocation_selector(physical_zone_count, + zone->thread_id, + &zone->selector); + if (result != VDO_SUCCESS) { + return result; + } + + return vdo_make_default_thread(vdo, zone->thread_id); +} + +/** + * vdo_make_logical_zones() - Create a set of logical zones. + * @vdo: The vdo to which the zones will belong. + * @zones_ptr: A pointer to hold the new zones. + * + * Return: VDO_SUCCESS or an error code. + */ +int vdo_make_logical_zones(struct vdo *vdo, struct logical_zones **zones_ptr) +{ + struct logical_zones *zones; + int result; + zone_count_t zone; + zone_count_t zone_count = vdo->thread_config->logical_zone_count; + + if (zone_count == 0) { + return VDO_SUCCESS; + } + + result = UDS_ALLOCATE_EXTENDED(struct logical_zones, zone_count, + struct logical_zone, __func__, &zones); + if (result != VDO_SUCCESS) { + return result; + } + + zones->vdo = vdo; + zones->zone_count = zone_count; + for (zone = 0; zone < zone_count; zone++) { + result = initialize_zone(zones, zone); + if (result != VDO_SUCCESS) { + vdo_free_logical_zones(zones); + return result; + } + } + + result = vdo_make_action_manager(zones->zone_count, + get_thread_id_for_zone, + vdo->thread_config->admin_thread, + zones, + NULL, + vdo, + &zones->manager); + if (result != VDO_SUCCESS) { + vdo_free_logical_zones(zones); + return result; + } + + *zones_ptr = zones; + return VDO_SUCCESS; +} + +/** + * vdo_free_logical_zones() - Free a set of logical zones. + * @zones: The set of zones to free. + */ +void vdo_free_logical_zones(struct logical_zones *zones) +{ + zone_count_t index; + + if (zones == NULL) { + return; + } + + UDS_FREE(UDS_FORGET(zones->manager)); + + for (index = 0; index < zones->zone_count; index++) { + struct logical_zone *zone = &zones->zones[index]; + + UDS_FREE(UDS_FORGET(zone->selector)); + free_int_map(UDS_FORGET(zone->lbn_operations)); + } + + UDS_FREE(zones); +} + +static inline void assert_on_zone_thread(struct logical_zone *zone, + const char *what) +{ + ASSERT_LOG_ONLY((vdo_get_callback_thread_id() == zone->thread_id), + "%s() called on correct thread", what); +} + +/** + * check_for_drain_complete() - Check whether this zone has drained. + * @zone: The zone to check. + */ +static void check_for_drain_complete(struct logical_zone *zone) +{ + if (!vdo_is_state_draining(&zone->state) || zone->notifying + || !list_empty(&zone->write_vios)) { + return; + } + + vdo_finish_draining(&zone->state); +} + +/** + * initiate_drain() - Initiate a drain. + * + * Implements vdo_admin_initiator. 
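/*
 * A minimal sketch (not part of the patch) of the construct/teardown pattern
 * vdo_make_logical_zones() and vdo_free_logical_zones() above expect; the
 * caller and its vdo are hypothetical.
 */
static int example_setup_logical_zones(struct vdo *vdo)
{
	struct logical_zones *zones = NULL;
	int result = vdo_make_logical_zones(vdo, &zones);

	if (result != VDO_SUCCESS)
		return result;

	/*
	 * When the configured logical zone count is zero, the constructor
	 * returns VDO_SUCCESS without allocating, so zones may still be NULL;
	 * vdo_free_logical_zones(NULL) is a safe no-op.
	 */
	vdo_free_logical_zones(zones);
	return VDO_SUCCESS;
}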
+ */ +static void initiate_drain(struct admin_state *state) +{ + check_for_drain_complete(container_of(state, + struct logical_zone, + state)); +} + +/** + * drain_logical_zone() - Drain a logical zone. + * + * Implements vdo_zone_action. + */ +static void drain_logical_zone(void *context, + zone_count_t zone_number, + struct vdo_completion *parent) +{ + struct logical_zones *zones = context; + + vdo_start_draining(&zones->zones[zone_number].state, + vdo_get_current_manager_operation(zones->manager), + parent, + initiate_drain); +} + +void vdo_drain_logical_zones(struct logical_zones *zones, + const struct admin_state_code *operation, + struct vdo_completion *parent) +{ + vdo_schedule_operation(zones->manager, operation, NULL, + drain_logical_zone, NULL, parent); +} + +/** + * resume_logical_zone() - Resume a logical zone. + * + * Implements vdo_zone_action. + */ +static void resume_logical_zone(void *context, + zone_count_t zone_number, + struct vdo_completion *parent) +{ + struct logical_zone *zone + = &(((struct logical_zones *) context)->zones[zone_number]); + + vdo_finish_completion(parent, vdo_resume_if_quiescent(&zone->state)); +} + +/** + * vdo_resume_logical_zones() - Resume a set of logical zones. + * @zones: The logical zones to resume. + * @parent: The object to notify when the zones have resumed. + */ +void vdo_resume_logical_zones(struct logical_zones *zones, + struct vdo_completion *parent) +{ + vdo_schedule_operation(zones->manager, VDO_ADMIN_STATE_RESUMING, NULL, + resume_logical_zone, NULL, parent); +} + +/** + * update_oldest_active_generation() - Update the oldest active generation. + * @zone: The zone. + * + * Return: true if the oldest active generation has changed. + */ +static bool update_oldest_active_generation(struct logical_zone *zone) +{ + sequence_number_t oldest; + + if (list_empty(&zone->write_vios)) { + oldest = zone->flush_generation; + } else { + struct data_vio *data_vio = list_entry(zone->write_vios.next, + struct data_vio, + write_entry); + oldest = data_vio->flush_generation; + } + + if (oldest == zone->oldest_active_generation) { + return false; + } + + WRITE_ONCE(zone->oldest_active_generation, oldest); + return true; +} + +/** + * vdo_increment_logical_zone_flush_generation() - Increment the flush + * generation in a logical zone. + * @zone: The logical zone. + * @expected_generation: The expected value of the flush generation + * before the increment. + */ +void +vdo_increment_logical_zone_flush_generation(struct logical_zone *zone, + sequence_number_t expected_generation) +{ + assert_on_zone_thread(zone, __func__); + ASSERT_LOG_ONLY((zone->flush_generation == expected_generation), + "logical zone %u flush generation %llu should be %llu before increment", + zone->zone_number, + (unsigned long long) zone->flush_generation, + (unsigned long long) expected_generation); + + zone->flush_generation++; + zone->ios_in_flush_generation = 0; + update_oldest_active_generation(zone); +} + +/** + * vdo_acquire_flush_generation_lock() - Acquire the shared lock on a flush + * generation by a write data_vio. + * @data_vio: The data_vio. + * + * Return: VDO_SUCCESS or an error code. 
+ */ +int vdo_acquire_flush_generation_lock(struct data_vio *data_vio) +{ + struct logical_zone *zone = data_vio->logical.zone; + + assert_on_zone_thread(zone, __func__); + if (!vdo_is_state_normal(&zone->state)) { + return VDO_INVALID_ADMIN_STATE; + } + + data_vio->flush_generation = zone->flush_generation; + list_move_tail(&data_vio->write_entry, &zone->write_vios); + data_vio->has_flush_generation_lock = true; + zone->ios_in_flush_generation++; + return VDO_SUCCESS; +} + +static void +attempt_generation_complete_notification(struct vdo_completion *completion); + +/** + * notify_flusher() - Notify the flush that at least one generation no longer + * has active VIOs. + * @completion: The zone completion. + * + * This callback is registered in attempt_generation_complete_notification(). + */ +static void notify_flusher(struct vdo_completion *completion) +{ + struct logical_zone *zone = as_logical_zone(completion); + + vdo_complete_flushes(zone->zones->vdo->flusher); + vdo_launch_completion_callback(completion, + attempt_generation_complete_notification, + zone->thread_id); +} + +/** + * void attempt_generation_complete_notification() - Notify the flusher if + * some generation no longer + * has active VIOs. + * @completion: The zone completion. + */ +static void +attempt_generation_complete_notification(struct vdo_completion *completion) +{ + struct logical_zone *zone = as_logical_zone(completion); + + assert_on_zone_thread(zone, __func__); + if (zone->oldest_active_generation <= zone->notification_generation) { + zone->notifying = false; + check_for_drain_complete(zone); + return; + } + + zone->notifying = true; + zone->notification_generation = zone->oldest_active_generation; + vdo_launch_completion_callback(&zone->completion, notify_flusher, + vdo_get_flusher_thread_id(zone->zones->vdo->flusher)); +} + +/** + * vdo_release_flush_generation_lock() - Release the shared lock on a flush + * generation held by a write data_vio. + * @data_vio: The data_vio whose lock is to be released. + * + * If there are pending flushes, and this data_vio completes the oldest + * generation active in this zone, an attempt will be made to finish any + * flushes which may now be complete. + */ +void vdo_release_flush_generation_lock(struct data_vio *data_vio) +{ + struct logical_zone *zone = data_vio->logical.zone; + + assert_on_zone_thread(zone, __func__); + if (list_empty(&data_vio->write_entry)) { + /* + * This VIO never got a lock, either because it is a read, or + * because we are in read-only mode. + */ + ASSERT_LOG_ONLY(!data_vio->has_flush_generation_lock, + "has_flush_generation_lock false for VIO not on active list"); + return; + } + + list_del_init(&data_vio->write_entry); + data_vio->has_flush_generation_lock = false; + ASSERT_LOG_ONLY(zone->oldest_active_generation + <= data_vio->flush_generation, + "data_vio releasing lock on generation %llu is not older than oldest active generation %llu", + (unsigned long long) data_vio->flush_generation, + (unsigned long long) zone->oldest_active_generation); + + if (!update_oldest_active_generation(zone) || zone->notifying) { + return; + } + + attempt_generation_complete_notification(&zone->completion); +} + +/** + * vdo_dump_logical_zone() - Dump information about a logical zone to the log + * for debugging. + * @zone: The zone to dump + * + * Context: the information is dumped in a thread-unsafe fashion. 
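/*
 * A minimal sketch (not part of the patch) of the lock lifecycle that
 * vdo_acquire_flush_generation_lock() and
 * vdo_release_flush_generation_lock() above expect for a write data_vio.
 * The I/O itself is elided; both calls must be made on the owning logical
 * zone's thread.
 */
static void example_write_path(struct data_vio *data_vio)
{
	int result = vdo_acquire_flush_generation_lock(data_vio);

	if (result != VDO_SUCCESS) {
		/* e.g. VDO_INVALID_ADMIN_STATE while the zone is draining */
		return;
	}

	/* ... submit the write and wait for it to complete ... */

	/*
	 * Releasing the lock may advance the zone's oldest active generation
	 * and so allow pending flushes to finish.
	 */
	vdo_release_flush_generation_lock(data_vio);
}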
+ * + */ +void vdo_dump_logical_zone(const struct logical_zone *zone) +{ + uds_log_info("logical_zone %u", zone->zone_number); + uds_log_info(" flush_generation=%llu oldest_active_generation=%llu notification_generation=%llu notifying=%s ios_in_flush_generation=%llu", + (unsigned long long) READ_ONCE(zone->flush_generation), + (unsigned long long) READ_ONCE(zone->oldest_active_generation), + (unsigned long long) READ_ONCE(zone->notification_generation), + uds_bool_to_string(READ_ONCE(zone->notifying)), + (unsigned long long) READ_ONCE(zone->ios_in_flush_generation)); +} diff --git a/vdo/logical-zone.h b/vdo/logical-zone.h new file mode 100644 index 00000000..ea453bb6 --- /dev/null +++ b/vdo/logical-zone.h @@ -0,0 +1,84 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright Red Hat + */ + +#ifndef LOGICAL_ZONE_H +#define LOGICAL_ZONE_H + +#include + +#include "admin-state.h" +#include "int-map.h" +#include "types.h" + +struct logical_zone { + /* The completion for flush notifications */ + struct vdo_completion completion; + /* The owner of this zone */ + struct logical_zones *zones; + /* Which logical zone this is */ + zone_count_t zone_number; + /* The thread id for this zone */ + thread_id_t thread_id; + /* In progress operations keyed by LBN */ + struct int_map *lbn_operations; + /* The logical to physical map */ + struct block_map_zone *block_map_zone; + /* The current flush generation */ + sequence_number_t flush_generation; + /* + * The oldest active generation in this zone. This is mutated only on + * the logical zone thread but is queried from the flusher thread. + */ + sequence_number_t oldest_active_generation; + /* The number of IOs in the current flush generation */ + block_count_t ios_in_flush_generation; + /* The youngest generation of the current notification */ + sequence_number_t notification_generation; + /* Whether a notification is in progress */ + bool notifying; + /* The queue of active data write VIOs */ + struct list_head write_vios; + /* The administrative state of the zone */ + struct admin_state state; + /* The selector for determining which physical zone to allocate from */ + struct allocation_selector *selector; + /* The next zone */ + struct logical_zone *next; +}; + +struct logical_zones { + /* The vdo whose zones these are */ + struct vdo *vdo; + /* The manager for administrative actions */ + struct action_manager *manager; + /* The number of zones */ + zone_count_t zone_count; + /* The logical zones themselves */ + struct logical_zone zones[]; +}; + +int __must_check +vdo_make_logical_zones(struct vdo *vdo, struct logical_zones **zones_ptr); + +void vdo_free_logical_zones(struct logical_zones *zones); + +void vdo_drain_logical_zones(struct logical_zones *zones, + const struct admin_state_code *operation, + struct vdo_completion *completion); + +void vdo_resume_logical_zones(struct logical_zones *zones, + struct vdo_completion *parent); + +void +vdo_increment_logical_zone_flush_generation(struct logical_zone *zone, + sequence_number_t expected_generation); + +int __must_check vdo_acquire_flush_generation_lock(struct data_vio *data_vio); + +void vdo_release_flush_generation_lock(struct data_vio *data_vio); + +void vdo_dump_logical_zone(const struct logical_zone *zone); + +#endif /* LOGICAL_ZONE_H */ diff --git a/vdo/logicalZone.c b/vdo/logicalZone.c deleted file mode 100644 index 7b462926..00000000 --- a/vdo/logicalZone.c +++ /dev/null @@ -1,470 +0,0 @@ -/* - * Copyright Red Hat - * - * This program is free software; you can redistribute it and/or - 
* modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/sulfur/src/c++/vdo/base/logicalZone.c#30 $ - */ - -#include "logicalZone.h" - -#include "logger.h" -#include "memoryAlloc.h" -#include "permassert.h" - -#include "actionManager.h" -#include "adminState.h" -#include "allocationSelector.h" -#include "blockMap.h" -#include "completion.h" -#include "constants.h" -#include "dataVIO.h" -#include "flush.h" -#include "intMap.h" -#include "vdoInternal.h" - -struct logical_zone { - /** The completion for flush notifications */ - struct vdo_completion completion; - /** The owner of this zone */ - struct logical_zones *zones; - /** Which logical zone this is */ - zone_count_t zone_number; - /** The thread id for this zone */ - thread_id_t thread_id; - /** In progress operations keyed by LBN */ - struct int_map *lbn_operations; - /** The logical to physical map */ - struct block_map_zone *block_map_zone; - /** The current flush generation */ - sequence_number_t flush_generation; - /** - * The oldest active generation in this zone. This is mutated only on - * the logical zone thread but is queried from the flusher thread. - **/ - sequence_number_t oldest_active_generation; - /** The number of IOs in the current flush generation */ - block_count_t ios_in_flush_generation; - /** The youngest generation of the current notification */ - sequence_number_t notification_generation; - /** Whether a notification is in progress */ - bool notifying; - /** The queue of active data write VIOs */ - struct list_head write_vios; - /** The administrative state of the zone */ - struct admin_state state; - /** The selector for determining which physical zone to allocate from */ - struct allocation_selector *selector; -}; - -struct logical_zones { - /** The vdo whose zones these are */ - struct vdo *vdo; - /** The manager for administrative actions */ - struct action_manager *manager; - /** The number of zones */ - zone_count_t zone_count; - /** The logical zones themselves */ - struct logical_zone zones[]; -}; - -/** - * Convert a generic vdo_completion to a logical_zone. - * - * @param completion The completion to convert - * - * @return The completion as a logical_zone - **/ -static struct logical_zone *as_logical_zone(struct vdo_completion *completion) -{ - assert_vdo_completion_type(completion->type, - VDO_GENERATION_FLUSHED_COMPLETION); - return container_of(completion, struct logical_zone, completion); -} - -/**********************************************************************/ -struct logical_zone *get_vdo_logical_zone(struct logical_zones *zones, - zone_count_t zone_number) -{ - return (zone_number < zones->zone_count) ? 
&zones->zones[zone_number] - : NULL; -} - -/** - * Implements vdo_zone_thread_getter - **/ -static thread_id_t get_thread_id_for_zone(void *context, - zone_count_t zone_number) -{ - return get_vdo_logical_zone_thread_id(get_vdo_logical_zone(context, - zone_number)); -} - -/** - * Initialize a logical zone. - * - * @param zones The logical_zones to which this zone belongs - * @param zone_number The logical_zone's index - **/ -static int initialize_zone(struct logical_zones *zones, - zone_count_t zone_number) -{ - struct vdo *vdo = zones->vdo; - struct logical_zone *zone = &zones->zones[zone_number]; - int result = make_int_map(VDO_LOCK_MAP_CAPACITY, 0, - &zone->lbn_operations); - if (result != VDO_SUCCESS) { - return result; - } - - initialize_vdo_completion(&zone->completion, vdo, - VDO_GENERATION_FLUSHED_COMPLETION); - zone->zones = zones; - zone->zone_number = zone_number; - zone->thread_id = vdo_get_logical_zone_thread(get_vdo_thread_config(vdo), - zone_number); - zone->block_map_zone = vdo_get_block_map_zone(vdo->block_map, zone_number); - INIT_LIST_HEAD(&zone->write_vios); - set_vdo_admin_state_code(&zone->state, - VDO_ADMIN_STATE_NORMAL_OPERATION); - - return make_vdo_allocation_selector(get_vdo_thread_config(vdo)->physical_zone_count, - zone->thread_id, &zone->selector); -} - -/**********************************************************************/ -int make_vdo_logical_zones(struct vdo *vdo, struct logical_zones **zones_ptr) -{ - struct logical_zones *zones; - int result; - zone_count_t zone; - - const struct thread_config *thread_config = get_vdo_thread_config(vdo); - if (thread_config->logical_zone_count == 0) { - return VDO_SUCCESS; - } - - result = UDS_ALLOCATE_EXTENDED(struct logical_zones, - thread_config->logical_zone_count, - struct logical_zone, __func__, &zones); - if (result != VDO_SUCCESS) { - return result; - } - - zones->vdo = vdo; - zones->zone_count = thread_config->logical_zone_count; - for (zone = 0; zone < thread_config->logical_zone_count; zone++) { - result = initialize_zone(zones, zone); - if (result != VDO_SUCCESS) { - free_vdo_logical_zones(zones); - return result; - } - } - - result = make_vdo_action_manager(zones->zone_count, - get_thread_id_for_zone, - thread_config->admin_thread, - zones, - NULL, - vdo, - &zones->manager); - if (result != VDO_SUCCESS) { - free_vdo_logical_zones(zones); - return result; - } - - *zones_ptr = zones; - return VDO_SUCCESS; -} - -/**********************************************************************/ -void free_vdo_logical_zones(struct logical_zones *zones) -{ - zone_count_t index; - - if (zones == NULL) { - return; - } - - UDS_FREE(UDS_FORGET(zones->manager)); - - for (index = 0; index < zones->zone_count; index++) { - struct logical_zone *zone = &zones->zones[index]; - UDS_FREE(UDS_FORGET(zone->selector)); - free_int_map(UDS_FORGET(zone->lbn_operations)); - } - - UDS_FREE(zones); -} - -/**********************************************************************/ -static inline void assert_on_zone_thread(struct logical_zone *zone, - const char *what) -{ - ASSERT_LOG_ONLY((vdo_get_callback_thread_id() == zone->thread_id), - "%s() called on correct thread", what); -} - -/** - * Check whether this zone has drained. - * - * @param zone The zone to check - **/ -static void vdo_check_for_drain_complete(struct logical_zone *zone) -{ - if (!is_vdo_state_draining(&zone->state) || zone->notifying - || !list_empty(&zone->write_vios)) { - return; - } - - finish_vdo_draining(&zone->state); -} - -/** - * Initiate a drain. 
- * - * Implements vdo_admin_initiator. - **/ -static void initiate_drain(struct admin_state *state) -{ - vdo_check_for_drain_complete(container_of(state, - struct logical_zone, - state)); -} - -/** - * Drain a logical zone. - * - *
Implements vdo_zone_action. - **/ -static void drain_logical_zone(void *context, zone_count_t zone_number, - struct vdo_completion *parent) -{ - struct logical_zone *zone = get_vdo_logical_zone(context, zone_number); - start_vdo_draining(&zone->state, - get_current_vdo_manager_operation(zone->zones->manager), - parent, initiate_drain); -} - -/**********************************************************************/ -void drain_vdo_logical_zones(struct logical_zones *zones, - const struct admin_state_code *operation, - struct vdo_completion *parent) -{ - schedule_vdo_operation(zones->manager, operation, NULL, - drain_logical_zone, NULL, parent); -} - -/** - * Resume a logical zone. - * - *
Implements vdo_zone_action. - **/ -static void resume_logical_zone(void *context, zone_count_t zone_number, - struct vdo_completion *parent) -{ - struct logical_zone *zone = get_vdo_logical_zone(context, zone_number); - finish_vdo_completion(parent, resume_vdo_if_quiescent(&zone->state)); -} - -/**********************************************************************/ -void resume_vdo_logical_zones(struct logical_zones *zones, - struct vdo_completion *parent) -{ - schedule_vdo_operation(zones->manager, VDO_ADMIN_STATE_RESUMING, NULL, - resume_logical_zone, NULL, parent); -} - -/**********************************************************************/ -thread_id_t get_vdo_logical_zone_thread_id(const struct logical_zone *zone) -{ - return zone->thread_id; -} - -/**********************************************************************/ -struct block_map_zone * -get_vdo_logical_zone_block_map(const struct logical_zone *zone) -{ - return zone->block_map_zone; -} - -/**********************************************************************/ -struct int_map * -get_vdo_logical_zone_lbn_lock_map(const struct logical_zone *zone) -{ - return zone->lbn_operations; -} - -/**********************************************************************/ -struct logical_zone *get_next_vdo_logical_zone(const struct logical_zone *zone) -{ - return get_vdo_logical_zone(zone->zones, zone->zone_number + 1); -} - -/** - * Update the oldest active generation. - * - * @param zone The zone - * - * @return true if the oldest active generation has changed - **/ -static bool update_oldest_active_generation(struct logical_zone *zone) -{ - sequence_number_t oldest; - if (list_empty(&zone->write_vios)) { - oldest = zone->flush_generation; - } else { - struct data_vio *data_vio = list_entry(zone->write_vios.next, - struct data_vio, - write_entry); - oldest = data_vio->flush_generation; - } - - if (oldest == zone->oldest_active_generation) { - return false; - } - - WRITE_ONCE(zone->oldest_active_generation, oldest); - return true; -} - -/**********************************************************************/ -void -increment_vdo_logical_zone_flush_generation(struct logical_zone *zone, - sequence_number_t expected_generation) -{ - assert_on_zone_thread(zone, __func__); - ASSERT_LOG_ONLY((zone->flush_generation == expected_generation), - "logical zone %u flush generation %llu should be %llu before increment", - zone->zone_number, - (unsigned long long) zone->flush_generation, - (unsigned long long) expected_generation); - - zone->flush_generation++; - zone->ios_in_flush_generation = 0; - update_oldest_active_generation(zone); -} - -/**********************************************************************/ -sequence_number_t -get_vdo_logical_zone_oldest_locked_generation(const struct logical_zone *zone) -{ - return READ_ONCE(zone->oldest_active_generation); -} - -/**********************************************************************/ -int acquire_vdo_flush_generation_lock(struct data_vio *data_vio) -{ - struct logical_zone *zone = data_vio->logical.zone; - assert_on_zone_thread(zone, __func__); - if (!is_vdo_state_normal(&zone->state)) { - return VDO_INVALID_ADMIN_STATE; - } - - data_vio->flush_generation = zone->flush_generation; - list_move_tail(&data_vio->write_entry, &zone->write_vios); - data_vio->has_flush_generation_lock = true; - zone->ios_in_flush_generation++; - return VDO_SUCCESS; -} - -/**********************************************************************/ -static void -attempt_generation_complete_notification(struct 
vdo_completion *completion); - -/** - * Notify the flush that at least one generation no longer has active VIOs. - * This callback is registered in attempt_generation_complete_notification(). - * - * @param completion The zone completion - **/ -static void notify_flusher(struct vdo_completion *completion) -{ - struct logical_zone *zone = as_logical_zone(completion); - complete_vdo_flushes(zone->zones->vdo->flusher); - launch_vdo_completion_callback(completion, - attempt_generation_complete_notification, - zone->thread_id); -} - -/** - * Notify the flusher if some generation no longer has active VIOs. - * - * @param completion The zone completion - **/ -static void -attempt_generation_complete_notification(struct vdo_completion *completion) -{ - struct logical_zone *zone = as_logical_zone(completion); - assert_on_zone_thread(zone, __func__); - if (zone->oldest_active_generation <= zone->notification_generation) { - zone->notifying = false; - vdo_check_for_drain_complete(zone); - return; - } - - zone->notifying = true; - zone->notification_generation = zone->oldest_active_generation; - launch_vdo_completion_callback(&zone->completion, notify_flusher, - get_vdo_flusher_thread_id(zone->zones->vdo->flusher)); -} - -/**********************************************************************/ -void release_vdo_flush_generation_lock(struct data_vio *data_vio) -{ - struct logical_zone *zone = data_vio->logical.zone; - assert_on_zone_thread(zone, __func__); - if (list_empty(&data_vio->write_entry)) { - // This VIO never got a lock, either because it is a read, or - // because we are in read-only mode. - ASSERT_LOG_ONLY(!data_vio->has_flush_generation_lock, - "has_flush_generation_lock false for VIO not on active list"); - return; - } - - list_del_init(&data_vio->write_entry); - data_vio->has_flush_generation_lock = false; - ASSERT_LOG_ONLY(zone->oldest_active_generation - <= data_vio->flush_generation, - "data_vio releasing lock on generation %llu is not older than oldest active generation %llu", - (unsigned long long) data_vio->flush_generation, - (unsigned long long) zone->oldest_active_generation); - - if (!update_oldest_active_generation(zone) || zone->notifying) { - return; - } - - attempt_generation_complete_notification(&zone->completion); -} - -/**********************************************************************/ -struct allocation_selector * -get_vdo_logical_zone_allocation_selector(struct logical_zone *zone) -{ - return zone->selector; -} - -/**********************************************************************/ -void dump_vdo_logical_zone(const struct logical_zone *zone) -{ - uds_log_info("logical_zone %u", zone->zone_number); - uds_log_info(" flush_generation=%llu oldest_active_generation=%llu notification_generation=%llu notifying=%s ios_in_flush_generation=%llu", - (unsigned long long) READ_ONCE(zone->flush_generation), - (unsigned long long) READ_ONCE(zone->oldest_active_generation), - (unsigned long long) READ_ONCE(zone->notification_generation), - uds_bool_to_string(READ_ONCE(zone->notifying)), - (unsigned long long) READ_ONCE(zone->ios_in_flush_generation)); -} diff --git a/vdo/logicalZone.h b/vdo/logicalZone.h deleted file mode 100644 index 7c11880a..00000000 --- a/vdo/logicalZone.h +++ /dev/null @@ -1,179 +0,0 @@ -/* - * Copyright Red Hat - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any 
later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/sulfur/src/c++/vdo/base/logicalZone.h#10 $ - */ - -#ifndef LOGICAL_ZONE_H -#define LOGICAL_ZONE_H - -#include "adminState.h" -#include "intMap.h" -#include "types.h" - -/** - * Get a logical zone by number. - * - * @param zones A set of logical zones - * @param zone_number The number of the zone to get - * - * @return The requested zone - **/ -struct logical_zone * __must_check -get_vdo_logical_zone(struct logical_zones *zones, zone_count_t zone_number); - -/** - * Create a set of logical zones. - * - * @param [in] vdo The vdo to which the zones will belong - * @param [out] zones_ptr A pointer to hold the new zones - * - * @return VDO_SUCCESS or an error code - **/ -int __must_check -make_vdo_logical_zones(struct vdo *vdo, struct logical_zones **zones_ptr); - -/** - * Free a set of logical zones. - * - * @param zones The set of zones to free - **/ -void free_vdo_logical_zones(struct logical_zones *zones); - -/** - * Drain a set of logical zones. - * - * @param zones The logical zones to suspend - * @param operation The type of drain to perform - * @param completion The object to notify when the zones are suspended - **/ -void drain_vdo_logical_zones(struct logical_zones *zones, - const struct admin_state_code *operation, - struct vdo_completion *completion); - -/** - * Resume a set of logical zones. - * - * @param zones The logical zones to resume - * @param parent The object to notify when the zones have resumed - **/ -void resume_vdo_logical_zones(struct logical_zones *zones, - struct vdo_completion *parent); - -/** - * Get the ID of a logical zone's thread. - * - * @param zone The zone - * - * @return The zone's thread ID - **/ -thread_id_t __must_check -get_vdo_logical_zone_thread_id(const struct logical_zone *zone); - -/** - * Get the portion of the block map for this zone. - * - * @param zone The zone - * - * @return The block map zone - **/ -struct block_map_zone * __must_check -get_vdo_logical_zone_block_map(const struct logical_zone *zone); - -/** - * Get the logical lock map for this zone. - * - * @param zone The zone - * - * @return The logical lock map for the zone - **/ -struct int_map * __must_check -get_vdo_logical_zone_lbn_lock_map(const struct logical_zone *zone); - -/** - * Get the next-highest-numbered logical zone, or NULL if the - * zone is the highest-numbered zone in its vdo. - * - * @param zone The logical zone to query - * - * @return The logical zone whose zone number is one greater than the given - * zone, or NULL if there is no such zone - **/ -struct logical_zone * __must_check -get_next_vdo_logical_zone(const struct logical_zone *zone); - -/** - * Increment the flush generation in a logical zone. - * - * @param zone The logical zone - * @param expected_generation The expected value of the flush generation - * before the increment - **/ -void -increment_vdo_logical_zone_flush_generation(struct logical_zone *zone, - sequence_number_t expected_generation); - -/** - * Get the oldest flush generation which is locked by a logical zone. 
- * - * @param zone The logical zone - * - * @return The oldest generation locked by the zone - **/ -sequence_number_t __must_check -get_vdo_logical_zone_oldest_locked_generation(const struct logical_zone *zone); - -/** - * Acquire the shared lock on a flush generation by a write data_vio. - * - * @param data_vio The data_vio - * - * @return VDO_SUCCESS or an error code - **/ -int __must_check acquire_vdo_flush_generation_lock(struct data_vio *data_vio); - -/** - * Release the shared lock on a flush generation held by a write data_vio. If - * there are pending flushes, and this data_vio completes the oldest generation - * active in this zone, an attempt will be made to finish any flushes which may - * now be complete. - * - * @param data_vio The data_vio whose lock is to be released - **/ -void release_vdo_flush_generation_lock(struct data_vio *data_vio); - -/** - * Get the selector for deciding which physical zone should be allocated from - * next for activities in a logical zone. - * - * @param zone The logical zone of the operation which needs an allocation - * - * @return The allocation selector for this zone - **/ -struct allocation_selector * __must_check -get_vdo_logical_zone_allocation_selector(struct logical_zone *zone); - -/** - * Dump information about a logical zone to the log for debugging, in a - * thread-unsafe fashion. - * - * @param zone The zone to dump - **/ -void dump_vdo_logical_zone(const struct logical_zone *zone); - -#endif // LOGICAL_ZONE_H diff --git a/vdo/lz4_compress.c b/vdo/lz4_compress.c deleted file mode 100644 index 1b019067..00000000 --- a/vdo/lz4_compress.c +++ /dev/null @@ -1,931 +0,0 @@ -/* - * LZ4 - Fast LZ compression algorithm - * Copyright (C) 2011 - 2016, Yann Collet. - * BSD 2 - Clause License (http://www.opensource.org/licenses/bsd - license.php) - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are - * met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above - * copyright notice, this list of conditions and the following disclaimer - * in the documentation and/or other materials provided with the - * distribution. - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- * You can contact the author at : - * - LZ4 homepage : http://www.lz4.org - * - LZ4 source repository : https://github.com/lz4/lz4 - * - * Changed for kernel usage by: - * Sven Schmidt <4sschmid@informatik.uni-hamburg.de> - */ - -/*-************************************ - * Dependencies - **************************************/ -#include -#include "lz4defs.h" -#include -#include -#include - -static const int LZ4_minLength = (MFLIMIT + 1); -static const int LZ4_64Klimit = ((64 * KB) + (MFLIMIT - 1)); - -/*-****************************** - * Compression functions - ********************************/ -static FORCE_INLINE U32 LZ4_hash4( - U32 sequence, - tableType_t const tableType) -{ - if (tableType == byU16) - return ((sequence * 2654435761U) - >> ((MINMATCH * 8) - (LZ4_HASHLOG + 1))); - else - return ((sequence * 2654435761U) - >> ((MINMATCH * 8) - LZ4_HASHLOG)); -} - -static FORCE_INLINE U32 LZ4_hash5( - U64 sequence, - tableType_t const tableType) -{ - const U32 hashLog = (tableType == byU16) - ? LZ4_HASHLOG + 1 - : LZ4_HASHLOG; - -#if LZ4_LITTLE_ENDIAN - static const U64 prime5bytes = 889523592379ULL; - - return (U32)(((sequence << 24) * prime5bytes) >> (64 - hashLog)); -#else - static const U64 prime8bytes = 11400714785074694791ULL; - - return (U32)(((sequence >> 24) * prime8bytes) >> (64 - hashLog)); -#endif -} - -static FORCE_INLINE U32 LZ4_hashPosition( - const void *p, - tableType_t const tableType) -{ -#if LZ4_ARCH64 - if (tableType == byU32) - return LZ4_hash5(LZ4_read_ARCH(p), tableType); -#endif - - return LZ4_hash4(LZ4_read32(p), tableType); -} - -static void LZ4_putPositionOnHash( - const BYTE *p, - U32 h, - void *tableBase, - tableType_t const tableType, - const BYTE *srcBase) -{ - switch (tableType) { - case byPtr: - { - const BYTE **hashTable = (const BYTE **)tableBase; - - hashTable[h] = p; - return; - } - case byU32: - { - U32 *hashTable = (U32 *) tableBase; - - hashTable[h] = (U32)(p - srcBase); - return; - } - case byU16: - { - U16 *hashTable = (U16 *) tableBase; - - hashTable[h] = (U16)(p - srcBase); - return; - } - } -} - -static FORCE_INLINE void LZ4_putPosition( - const BYTE *p, - void *tableBase, - tableType_t tableType, - const BYTE *srcBase) -{ - U32 const h = LZ4_hashPosition(p, tableType); - - LZ4_putPositionOnHash(p, h, tableBase, tableType, srcBase); -} - -static const BYTE *LZ4_getPositionOnHash( - U32 h, - void *tableBase, - tableType_t tableType, - const BYTE *srcBase) -{ - if (tableType == byPtr) { - const BYTE **hashTable = (const BYTE **) tableBase; - - return hashTable[h]; - } - - if (tableType == byU32) { - const U32 * const hashTable = (U32 *) tableBase; - - return hashTable[h] + srcBase; - } - - { - /* default, to ensure a return */ - const U16 * const hashTable = (U16 *) tableBase; - - return hashTable[h] + srcBase; - } -} - -static FORCE_INLINE const BYTE *LZ4_getPosition( - const BYTE *p, - void *tableBase, - tableType_t tableType, - const BYTE *srcBase) -{ - U32 const h = LZ4_hashPosition(p, tableType); - - return LZ4_getPositionOnHash(h, tableBase, tableType, srcBase); -} - - -/* - * LZ4_compress_generic() : - * inlined, to ensure branches are decided at compilation time - */ -static FORCE_INLINE int LZ4_compress_generic( - LZ4_stream_t_internal * const dictPtr, - const char * const source, - char * const dest, - const int inputSize, - const int maxOutputSize, - const limitedOutput_directive outputLimited, - const tableType_t tableType, - const dict_directive dict, - const dictIssue_directive dictIssue, - const U32 acceleration) -{ - const 
BYTE *ip = (const BYTE *) source; - const BYTE *base; - const BYTE *lowLimit; - const BYTE * const lowRefLimit = ip - dictPtr->dictSize; - const BYTE * const dictionary = dictPtr->dictionary; - const BYTE * const dictEnd = dictionary + dictPtr->dictSize; - const size_t dictDelta = dictEnd - (const BYTE *)source; - const BYTE *anchor = (const BYTE *) source; - const BYTE * const iend = ip + inputSize; - const BYTE * const mflimit = iend - MFLIMIT; - const BYTE * const matchlimit = iend - LASTLITERALS; - - BYTE *op = (BYTE *) dest; - BYTE * const olimit = op + maxOutputSize; - - U32 forwardH; - size_t refDelta = 0; - - /* Init conditions */ - if ((U32)inputSize > (U32)LZ4_MAX_INPUT_SIZE) { - /* Unsupported inputSize, too large (or negative) */ - return 0; - } - - switch (dict) { - case noDict: - default: - base = (const BYTE *)source; - lowLimit = (const BYTE *)source; - break; - case withPrefix64k: - base = (const BYTE *)source - dictPtr->currentOffset; - lowLimit = (const BYTE *)source - dictPtr->dictSize; - break; - case usingExtDict: - base = (const BYTE *)source - dictPtr->currentOffset; - lowLimit = (const BYTE *)source; - break; - } - - if ((tableType == byU16) - && (inputSize >= LZ4_64Klimit)) { - /* Size too large (not within 64K limit) */ - return 0; - } - - if (inputSize < LZ4_minLength) { - /* Input too small, no compression (all literals) */ - goto _last_literals; - } - - /* First Byte */ - LZ4_putPosition(ip, dictPtr->hashTable, tableType, base); - ip++; - forwardH = LZ4_hashPosition(ip, tableType); - - /* Main Loop */ - for ( ; ; ) { - const BYTE *match; - BYTE *token; - - /* Find a match */ - { - const BYTE *forwardIp = ip; - unsigned int step = 1; - unsigned int searchMatchNb = acceleration << LZ4_SKIPTRIGGER; - - do { - U32 const h = forwardH; - - ip = forwardIp; - forwardIp += step; - step = (searchMatchNb++ >> LZ4_SKIPTRIGGER); - - if (unlikely(forwardIp > mflimit)) - goto _last_literals; - - match = LZ4_getPositionOnHash(h, - dictPtr->hashTable, - tableType, base); - - if (dict == usingExtDict) { - if (match < (const BYTE *)source) { - refDelta = dictDelta; - lowLimit = dictionary; - } else { - refDelta = 0; - lowLimit = (const BYTE *)source; - } } - - forwardH = LZ4_hashPosition(forwardIp, - tableType); - - LZ4_putPositionOnHash(ip, h, dictPtr->hashTable, - tableType, base); - } while (((dictIssue == dictSmall) - ? (match < lowRefLimit) - : 0) - || ((tableType == byU16) - ? 
0 - : (match + MAX_DISTANCE < ip)) - || (LZ4_read32(match + refDelta) - != LZ4_read32(ip))); - } - - /* Catch up */ - while (((ip > anchor) & (match + refDelta > lowLimit)) - && (unlikely(ip[-1] == match[refDelta - 1]))) { - ip--; - match--; - } - - /* Encode Literals */ - { - unsigned const int litLength = (unsigned int)(ip - anchor); - - token = op++; - - if ((outputLimited) && - /* Check output buffer overflow */ - (unlikely(op + litLength + - (2 + 1 + LASTLITERALS) + - (litLength / 255) > olimit))) - return 0; - - if (litLength >= RUN_MASK) { - int len = (int)litLength - RUN_MASK; - - *token = (RUN_MASK << ML_BITS); - - for (; len >= 255; len -= 255) - *op++ = 255; - *op++ = (BYTE)len; - } else - *token = (BYTE)(litLength << ML_BITS); - - /* Copy Literals */ - LZ4_wildCopy(op, anchor, op + litLength); - op += litLength; - } - -_next_match: - /* Encode Offset */ - LZ4_writeLE16(op, (U16)(ip - match)); - op += 2; - - /* Encode MatchLength */ - { - unsigned int matchCode; - - if ((dict == usingExtDict) - && (lowLimit == dictionary)) { - const BYTE *limit; - - match += refDelta; - limit = ip + (dictEnd - match); - - if (limit > matchlimit) - limit = matchlimit; - - matchCode = LZ4_count(ip + MINMATCH, - match + MINMATCH, limit); - - ip += MINMATCH + matchCode; - - if (ip == limit) { - unsigned const int more = LZ4_count(ip, - (const BYTE *)source, - matchlimit); - - matchCode += more; - ip += more; - } - } else { - matchCode = LZ4_count(ip + MINMATCH, - match + MINMATCH, matchlimit); - ip += MINMATCH + matchCode; - } - - if (outputLimited && - /* Check output buffer overflow */ - (unlikely(op + - (1 + LASTLITERALS) + - (matchCode >> 8) > olimit))) - return 0; - - if (matchCode >= ML_MASK) { - *token += ML_MASK; - matchCode -= ML_MASK; - LZ4_write32(op, 0xFFFFFFFF); - - while (matchCode >= 4 * 255) { - op += 4; - LZ4_write32(op, 0xFFFFFFFF); - matchCode -= 4 * 255; - } - - op += matchCode / 255; - *op++ = (BYTE)(matchCode % 255); - } else - *token += (BYTE)(matchCode); - } - - anchor = ip; - - /* Test end of chunk */ - if (ip > mflimit) - break; - - /* Fill table */ - LZ4_putPosition(ip - 2, dictPtr->hashTable, tableType, base); - - /* Test next position */ - match = LZ4_getPosition(ip, dictPtr->hashTable, - tableType, base); - - if (dict == usingExtDict) { - if (match < (const BYTE *)source) { - refDelta = dictDelta; - lowLimit = dictionary; - } else { - refDelta = 0; - lowLimit = (const BYTE *)source; - } - } - - LZ4_putPosition(ip, dictPtr->hashTable, tableType, base); - - if (((dictIssue == dictSmall) ? 
(match >= lowRefLimit) : 1) - && (match + MAX_DISTANCE >= ip) - && (LZ4_read32(match + refDelta) == LZ4_read32(ip))) { - token = op++; - *token = 0; - goto _next_match; - } - - /* Prepare next loop */ - forwardH = LZ4_hashPosition(++ip, tableType); - } - -_last_literals: - /* Encode Last Literals */ - { - size_t const lastRun = (size_t)(iend - anchor); - - if ((outputLimited) && - /* Check output buffer overflow */ - ((op - (BYTE *)dest) + lastRun + 1 + - ((lastRun + 255 - RUN_MASK) / 255) > (U32)maxOutputSize)) - return 0; - - if (lastRun >= RUN_MASK) { - size_t accumulator = lastRun - RUN_MASK; - *op++ = RUN_MASK << ML_BITS; - for (; accumulator >= 255; accumulator -= 255) - *op++ = 255; - *op++ = (BYTE) accumulator; - } else { - *op++ = (BYTE)(lastRun << ML_BITS); - } - - memcpy(op, anchor, lastRun); - - op += lastRun; - } - - /* End */ - return (int) (((char *)op) - dest); -} - -static int LZ4_compress_fast_extState( - void *state, - const char *source, - char *dest, - int inputSize, - int maxOutputSize, - int acceleration) -{ - LZ4_stream_t_internal *ctx = &((LZ4_stream_t *)state)->internal_donotuse; -#if LZ4_ARCH64 - const tableType_t tableType = byU32; -#else - const tableType_t tableType = byPtr; -#endif - - LZ4_resetStream((LZ4_stream_t *)state); - - if (acceleration < 1) - acceleration = LZ4_ACCELERATION_DEFAULT; - - if (maxOutputSize >= LZ4_COMPRESSBOUND(inputSize)) { - if (inputSize < LZ4_64Klimit) - return LZ4_compress_generic(ctx, source, - dest, inputSize, 0, - noLimit, byU16, noDict, - noDictIssue, acceleration); - else - return LZ4_compress_generic(ctx, source, - dest, inputSize, 0, - noLimit, tableType, noDict, - noDictIssue, acceleration); - } else { - if (inputSize < LZ4_64Klimit) - return LZ4_compress_generic(ctx, source, - dest, inputSize, - maxOutputSize, limitedOutput, byU16, noDict, - noDictIssue, acceleration); - else - return LZ4_compress_generic(ctx, source, - dest, inputSize, - maxOutputSize, limitedOutput, tableType, noDict, - noDictIssue, acceleration); - } -} - -int LZ4_compress_fast(const char *source, char *dest, int inputSize, - int maxOutputSize, int acceleration, void *wrkmem) -{ - return LZ4_compress_fast_extState(wrkmem, source, dest, inputSize, - maxOutputSize, acceleration); -} - -int LZ4_compress_default(const char *source, char *dest, int inputSize, - int maxOutputSize, void *wrkmem) -{ - return LZ4_compress_fast(source, dest, inputSize, - maxOutputSize, LZ4_ACCELERATION_DEFAULT, wrkmem); -} - -/*-****************************** - * *_destSize() variant - ********************************/ -static int LZ4_compress_destSize_generic( - LZ4_stream_t_internal * const ctx, - const char * const src, - char * const dst, - int * const srcSizePtr, - const int targetDstSize, - const tableType_t tableType) -{ - const BYTE *ip = (const BYTE *) src; - const BYTE *base = (const BYTE *) src; - const BYTE *lowLimit = (const BYTE *) src; - const BYTE *anchor = ip; - const BYTE * const iend = ip + *srcSizePtr; - const BYTE * const mflimit = iend - MFLIMIT; - const BYTE * const matchlimit = iend - LASTLITERALS; - - BYTE *op = (BYTE *) dst; - BYTE * const oend = op + targetDstSize; - BYTE * const oMaxLit = op + targetDstSize - 2 /* offset */ - - 8 /* because 8 + MINMATCH == MFLIMIT */ - 1 /* token */; - BYTE * const oMaxMatch = op + targetDstSize - - (LASTLITERALS + 1 /* token */); - BYTE * const oMaxSeq = oMaxLit - 1 /* token */; - - U32 forwardH; - - /* Init conditions */ - /* Impossible to store anything */ - if (targetDstSize < 1) - return 0; - /* Unsupported input 
size, too large (or negative) */ - if ((U32)*srcSizePtr > (U32)LZ4_MAX_INPUT_SIZE) - return 0; - /* Size too large (not within 64K limit) */ - if ((tableType == byU16) && (*srcSizePtr >= LZ4_64Klimit)) - return 0; - /* Input too small, no compression (all literals) */ - if (*srcSizePtr < LZ4_minLength) - goto _last_literals; - - /* First Byte */ - *srcSizePtr = 0; - LZ4_putPosition(ip, ctx->hashTable, tableType, base); - ip++; forwardH = LZ4_hashPosition(ip, tableType); - - /* Main Loop */ - for ( ; ; ) { - const BYTE *match; - BYTE *token; - - /* Find a match */ - { - const BYTE *forwardIp = ip; - unsigned int step = 1; - unsigned int searchMatchNb = 1 << LZ4_SKIPTRIGGER; - - do { - U32 h = forwardH; - - ip = forwardIp; - forwardIp += step; - step = (searchMatchNb++ >> LZ4_SKIPTRIGGER); - - if (unlikely(forwardIp > mflimit)) - goto _last_literals; - - match = LZ4_getPositionOnHash(h, ctx->hashTable, - tableType, base); - forwardH = LZ4_hashPosition(forwardIp, - tableType); - LZ4_putPositionOnHash(ip, h, - ctx->hashTable, tableType, - base); - - } while (((tableType == byU16) - ? 0 - : (match + MAX_DISTANCE < ip)) - || (LZ4_read32(match) != LZ4_read32(ip))); - } - - /* Catch up */ - while ((ip > anchor) - && (match > lowLimit) - && (unlikely(ip[-1] == match[-1]))) { - ip--; - match--; - } - - /* Encode Literal length */ - { - unsigned int litLength = (unsigned int)(ip - anchor); - - token = op++; - if (op + ((litLength + 240) / 255) - + litLength > oMaxLit) { - /* Not enough space for a last match */ - op--; - goto _last_literals; - } - if (litLength >= RUN_MASK) { - unsigned int len = litLength - RUN_MASK; - *token = (RUN_MASK<= 255; len -= 255) - *op++ = 255; - *op++ = (BYTE)len; - } else - *token = (BYTE)(litLength << ML_BITS); - - /* Copy Literals */ - LZ4_wildCopy(op, anchor, op + litLength); - op += litLength; - } - -_next_match: - /* Encode Offset */ - LZ4_writeLE16(op, (U16)(ip - match)); op += 2; - - /* Encode MatchLength */ - { - size_t matchLength = LZ4_count(ip + MINMATCH, - match + MINMATCH, matchlimit); - - if (op + ((matchLength + 240)/255) > oMaxMatch) { - /* Match description too long : reduce it */ - matchLength = (15 - 1) + (oMaxMatch - op) * 255; - } - ip += MINMATCH + matchLength; - - if (matchLength >= ML_MASK) { - *token += ML_MASK; - matchLength -= ML_MASK; - while (matchLength >= 255) { - matchLength -= 255; - *op++ = 255; - } - *op++ = (BYTE)matchLength; - } else - *token += (BYTE)(matchLength); - } - - anchor = ip; - - /* Test end of block */ - if (ip > mflimit) - break; - if (op > oMaxSeq) - break; - - /* Fill table */ - LZ4_putPosition(ip - 2, ctx->hashTable, tableType, base); - - /* Test next position */ - match = LZ4_getPosition(ip, ctx->hashTable, tableType, base); - LZ4_putPosition(ip, ctx->hashTable, tableType, base); - - if ((match + MAX_DISTANCE >= ip) - && (LZ4_read32(match) == LZ4_read32(ip))) { - token = op++; *token = 0; - goto _next_match; - } - - /* Prepare next loop */ - forwardH = LZ4_hashPosition(++ip, tableType); - } - -_last_literals: - /* Encode Last Literals */ - { - size_t lastRunSize = (size_t)(iend - anchor); - - if (op + 1 /* token */ - + ((lastRunSize + 240) / 255) /* litLength */ - + lastRunSize /* literals */ > oend) { - /* adapt lastRunSize to fill 'dst' */ - lastRunSize = (oend - op) - 1; - lastRunSize -= (lastRunSize + 240) / 255; - } - ip = anchor + lastRunSize; - - if (lastRunSize >= RUN_MASK) { - size_t accumulator = lastRunSize - RUN_MASK; - - *op++ = RUN_MASK << ML_BITS; - for (; accumulator >= 255; accumulator -= 255) - *op++ 
= 255; - *op++ = (BYTE) accumulator; - } else { - *op++ = (BYTE)(lastRunSize<= LZ4_COMPRESSBOUND(*srcSizePtr)) { - /* compression success is guaranteed */ - return LZ4_compress_fast_extState( - state, src, dst, *srcSizePtr, - targetDstSize, 1); - } else { - if (*srcSizePtr < LZ4_64Klimit) - return LZ4_compress_destSize_generic( - &state->internal_donotuse, - src, dst, srcSizePtr, - targetDstSize, byU16); - else - return LZ4_compress_destSize_generic( - &state->internal_donotuse, - src, dst, srcSizePtr, - targetDstSize, tableType); - } -} - - -int LZ4_compress_destSize( - const char *src, - char *dst, - int *srcSizePtr, - int targetDstSize, - void *wrkmem) -{ - return LZ4_compress_destSize_extState(wrkmem, src, dst, srcSizePtr, - targetDstSize); -} - -/*-****************************** - * Streaming functions - ********************************/ -void LZ4_resetStream(LZ4_stream_t *LZ4_stream) -{ - memset(LZ4_stream, 0, sizeof(LZ4_stream_t)); -} - -int LZ4_loadDict(LZ4_stream_t *LZ4_dict, - const char *dictionary, int dictSize) -{ - LZ4_stream_t_internal *dict = &LZ4_dict->internal_donotuse; - const BYTE *p = (const BYTE *)dictionary; - const BYTE * const dictEnd = p + dictSize; - const BYTE *base; - - if ((dict->initCheck) - || (dict->currentOffset > 1 * GB)) { - /* Uninitialized structure, or reuse overflow */ - LZ4_resetStream(LZ4_dict); - } - - if (dictSize < (int)HASH_UNIT) { - dict->dictionary = NULL; - dict->dictSize = 0; - return 0; - } - - if ((dictEnd - p) > 64 * KB) - p = dictEnd - 64 * KB; - dict->currentOffset += 64 * KB; - base = p - dict->currentOffset; - dict->dictionary = p; - dict->dictSize = (U32)(dictEnd - p); - dict->currentOffset += dict->dictSize; - - while (p <= dictEnd - HASH_UNIT) { - LZ4_putPosition(p, dict->hashTable, byU32, base); - p += 3; - } - - return dict->dictSize; -} - -static void LZ4_renormDictT(LZ4_stream_t_internal *LZ4_dict, - const BYTE *src) -{ - if ((LZ4_dict->currentOffset > 0x80000000) || - ((uptrval)LZ4_dict->currentOffset > (uptrval)src)) { - /* address space overflow */ - /* rescale hash table */ - U32 const delta = LZ4_dict->currentOffset - 64 * KB; - const BYTE *dictEnd = LZ4_dict->dictionary + LZ4_dict->dictSize; - int i; - - for (i = 0; i < LZ4_HASH_SIZE_U32; i++) { - if (LZ4_dict->hashTable[i] < delta) - LZ4_dict->hashTable[i] = 0; - else - LZ4_dict->hashTable[i] -= delta; - } - LZ4_dict->currentOffset = 64 * KB; - if (LZ4_dict->dictSize > 64 * KB) - LZ4_dict->dictSize = 64 * KB; - LZ4_dict->dictionary = dictEnd - LZ4_dict->dictSize; - } -} - -int LZ4_saveDict(LZ4_stream_t *LZ4_dict, char *safeBuffer, int dictSize) -{ - LZ4_stream_t_internal * const dict = &LZ4_dict->internal_donotuse; - const BYTE * const previousDictEnd = dict->dictionary + dict->dictSize; - - if ((U32)dictSize > 64 * KB) { - /* useless to define a dictionary > 64 * KB */ - dictSize = 64 * KB; - } - if ((U32)dictSize > dict->dictSize) - dictSize = dict->dictSize; - - memmove(safeBuffer, previousDictEnd - dictSize, dictSize); - - dict->dictionary = (const BYTE *)safeBuffer; - dict->dictSize = (U32)dictSize; - - return dictSize; -} - -int LZ4_compress_fast_continue(LZ4_stream_t *LZ4_stream, const char *source, - char *dest, int inputSize, int maxOutputSize, int acceleration) -{ - LZ4_stream_t_internal *streamPtr = &LZ4_stream->internal_donotuse; - const BYTE * const dictEnd = streamPtr->dictionary - + streamPtr->dictSize; - - const BYTE *smallest = (const BYTE *) source; - - if (streamPtr->initCheck) { - /* Uninitialized structure detected */ - return 0; - } - - if 
((streamPtr->dictSize > 0) && (smallest > dictEnd)) - smallest = dictEnd; - - LZ4_renormDictT(streamPtr, smallest); - - if (acceleration < 1) - acceleration = LZ4_ACCELERATION_DEFAULT; - - /* Check overlapping input/dictionary space */ - { - const BYTE *sourceEnd = (const BYTE *) source + inputSize; - - if ((sourceEnd > streamPtr->dictionary) - && (sourceEnd < dictEnd)) { - streamPtr->dictSize = (U32)(dictEnd - sourceEnd); - if (streamPtr->dictSize > 64 * KB) - streamPtr->dictSize = 64 * KB; - if (streamPtr->dictSize < 4) - streamPtr->dictSize = 0; - streamPtr->dictionary = dictEnd - streamPtr->dictSize; - } - } - - /* prefix mode : source data follows dictionary */ - if (dictEnd == (const BYTE *)source) { - int result; - - if ((streamPtr->dictSize < 64 * KB) && - (streamPtr->dictSize < streamPtr->currentOffset)) { - result = LZ4_compress_generic( - streamPtr, source, dest, inputSize, - maxOutputSize, limitedOutput, byU32, - withPrefix64k, dictSmall, acceleration); - } else { - result = LZ4_compress_generic( - streamPtr, source, dest, inputSize, - maxOutputSize, limitedOutput, byU32, - withPrefix64k, noDictIssue, acceleration); - } - streamPtr->dictSize += (U32)inputSize; - streamPtr->currentOffset += (U32)inputSize; - return result; - } - - /* external dictionary mode */ - { - int result; - - if ((streamPtr->dictSize < 64 * KB) && - (streamPtr->dictSize < streamPtr->currentOffset)) { - result = LZ4_compress_generic( - streamPtr, source, dest, inputSize, - maxOutputSize, limitedOutput, byU32, - usingExtDict, dictSmall, acceleration); - } else { - result = LZ4_compress_generic( - streamPtr, source, dest, inputSize, - maxOutputSize, limitedOutput, byU32, - usingExtDict, noDictIssue, acceleration); - } - streamPtr->dictionary = (const BYTE *)source; - streamPtr->dictSize = (U32)inputSize; - streamPtr->currentOffset += (U32)inputSize; - return result; - } -} diff --git a/vdo/lz4defs.h b/vdo/lz4defs.h deleted file mode 100644 index 1a7fa9d9..00000000 --- a/vdo/lz4defs.h +++ /dev/null @@ -1,234 +0,0 @@ -#ifndef __LZ4DEFS_H__ -#define __LZ4DEFS_H__ - -/* - * lz4defs.h -- common and architecture specific defines for the kernel usage - - * LZ4 - Fast LZ compression algorithm - * Copyright (C) 2011-2016, Yann Collet. - * BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php) - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are - * met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above - * copyright notice, this list of conditions and the following disclaimer - * in the documentation and/or other materials provided with the - * distribution. - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - * A PARTICULAR PURPOSE ARE DISCLAIMED. 
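/*
 * The hunks above and below drop VDO's private copy of the LZ4 compressor,
 * presumably in favour of the kernel's own <linux/lz4.h> implementation,
 * which keeps the same calling convention. A minimal sketch, assuming that
 * API; the 4096-byte block size matches VDO's block size, but the caller and
 * buffers here are hypothetical.
 */
#include <linux/lz4.h>

static int example_compress_block(const char *src, char *dst, void *wrkmem)
{
	/* wrkmem must provide at least LZ4_MEM_COMPRESS bytes of scratch. */
	int size = LZ4_compress_default(src, dst, 4096,
					LZ4_COMPRESSBOUND(4096), wrkmem);

	/* A return value of 0 means the output did not fit in dst. */
	return size;
}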
IN NO EVENT SHALL THE COPYRIGHT - * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * You can contact the author at : - * - LZ4 homepage : http://www.lz4.org - * - LZ4 source repository : https://github.com/lz4/lz4 - * - * Changed for kernel usage by: - * Sven Schmidt <4sschmid@informatik.uni-hamburg.de> - */ - -#include -#include /* memset, memcpy */ - -#define FORCE_INLINE __always_inline - -/*-************************************ - * Basic Types - **************************************/ -#include - -typedef uint8_t BYTE; -typedef uint16_t U16; -typedef uint32_t U32; -typedef int32_t S32; -typedef uint64_t U64; -typedef uintptr_t uptrval; - -/*-************************************ - * Architecture specifics - **************************************/ -#if defined(CONFIG_64BIT) -#define LZ4_ARCH64 1 -#else -#define LZ4_ARCH64 0 -#endif - -#if defined(__LITTLE_ENDIAN) -#define LZ4_LITTLE_ENDIAN 1 -#else -#define LZ4_LITTLE_ENDIAN 0 -#endif - -/*-************************************ - * Constants - **************************************/ -#define MINMATCH 4 - -#define WILDCOPYLENGTH 8 -#define LASTLITERALS 5 -#define MFLIMIT (WILDCOPYLENGTH + MINMATCH) -/* - * ensure it's possible to write 2 x wildcopyLength - * without overflowing output buffer - */ -#define MATCH_SAFEGUARD_DISTANCE ((2 * WILDCOPYLENGTH) - MINMATCH) - -/* Increase this value ==> compression run slower on incompressible data */ -#define LZ4_SKIPTRIGGER 6 - -#define HASH_UNIT sizeof(size_t) - -#define KB (1 << 10) -#define MB (1 << 20) -#define GB (1U << 30) - -#define MAXD_LOG 16 -#define MAX_DISTANCE ((1 << MAXD_LOG) - 1) -#define STEPSIZE sizeof(size_t) - -#define ML_BITS 4 -#define ML_MASK ((1U << ML_BITS) - 1) -#define RUN_BITS (8 - ML_BITS) -#define RUN_MASK ((1U << RUN_BITS) - 1) - -/*-************************************ - * Reading and writing into memory - **************************************/ -static FORCE_INLINE U16 LZ4_read16(const void *ptr) -{ - return get_unaligned((const U16 *)ptr); -} - -static FORCE_INLINE U32 LZ4_read32(const void *ptr) -{ - return get_unaligned((const U32 *)ptr); -} - -static FORCE_INLINE size_t LZ4_read_ARCH(const void *ptr) -{ - return get_unaligned((const size_t *)ptr); -} - -static FORCE_INLINE void LZ4_write16(void *memPtr, U16 value) -{ - put_unaligned(value, (U16 *)memPtr); -} - -static FORCE_INLINE void LZ4_write32(void *memPtr, U32 value) -{ - put_unaligned(value, (U32 *)memPtr); -} - -static FORCE_INLINE U16 LZ4_readLE16(const void *memPtr) -{ - return get_unaligned_le16(memPtr); -} - -static FORCE_INLINE void LZ4_writeLE16(void *memPtr, U16 value) -{ - return put_unaligned_le16(value, memPtr); -} - -static FORCE_INLINE void LZ4_copy8(void *dst, const void *src) -{ -#if LZ4_ARCH64 - U64 a = get_unaligned((const U64 *)src); - - put_unaligned(a, (U64 *)dst); -#else - U32 a = get_unaligned((const U32 *)src); - U32 b = get_unaligned((const U32 *)src + 1); - - put_unaligned(a, (U32 *)dst); - put_unaligned(b, (U32 *)dst + 1); -#endif -} - -/* - * customized variant of memcpy, - * which can overwrite up to 7 bytes beyond dstEnd - */ 
-static FORCE_INLINE void LZ4_wildCopy(void *dstPtr, - const void *srcPtr, void *dstEnd) -{ - BYTE *d = (BYTE *)dstPtr; - const BYTE *s = (const BYTE *)srcPtr; - BYTE *const e = (BYTE *)dstEnd; - - do { - LZ4_copy8(d, s); - d += 8; - s += 8; - } while (d < e); -} - -static FORCE_INLINE unsigned int LZ4_NbCommonBytes(register size_t val) -{ -#if LZ4_LITTLE_ENDIAN - return __ffs(val) >> 3; -#else - return (BITS_PER_LONG - 1 - __fls(val)) >> 3; -#endif -} - -static FORCE_INLINE unsigned int LZ4_count( - const BYTE *pIn, - const BYTE *pMatch, - const BYTE *pInLimit) -{ - const BYTE *const pStart = pIn; - - while (likely(pIn < pInLimit - (STEPSIZE - 1))) { - size_t const diff = LZ4_read_ARCH(pMatch) ^ LZ4_read_ARCH(pIn); - - if (!diff) { - pIn += STEPSIZE; - pMatch += STEPSIZE; - continue; - } - - pIn += LZ4_NbCommonBytes(diff); - - return (unsigned int)(pIn - pStart); - } - -#if LZ4_ARCH64 - if ((pIn < (pInLimit - 3)) - && (LZ4_read32(pMatch) == LZ4_read32(pIn))) { - pIn += 4; - pMatch += 4; - } -#endif - - if ((pIn < (pInLimit - 1)) - && (LZ4_read16(pMatch) == LZ4_read16(pIn))) { - pIn += 2; - pMatch += 2; - } - - if ((pIn < pInLimit) && (*pMatch == *pIn)) - pIn++; - - return (unsigned int)(pIn - pStart); -} - -typedef enum { noLimit = 0, limitedOutput = 1 } limitedOutput_directive; -typedef enum { byPtr, byU32, byU16 } tableType_t; - -typedef enum { noDict = 0, withPrefix64k, usingExtDict } dict_directive; -typedef enum { noDictIssue = 0, dictSmall } dictIssue_directive; - -typedef enum { endOnOutputSize = 0, endOnInputSize = 1 } endCondition_directive; -typedef enum { decode_full_block = 0, partial_decode = 1 } earlyEnd_directive; - -#define LZ4_STATIC_ASSERT(c) BUILD_BUG_ON(!(c)) - -#endif diff --git a/uds/memoryLinuxKernel.c b/vdo/memory-alloc.c similarity index 73% rename from uds/memoryLinuxKernel.c rename to vdo/memory-alloc.c index b8e86b24..90988833 100644 --- a/uds/memoryLinuxKernel.c +++ b/vdo/memory-alloc.c @@ -1,39 +1,21 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * Copyright Red Hat - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/uds-releases/krusty/kernelLinux/uds/memoryLinuxKernel.c#31 $ */ #include #include #include #include -#include #include #include "compiler.h" #include "logger.h" -#include "memoryAlloc.h" +#include "memory-alloc.h" #include "permassert.h" /* - ****************************************************************************** * Production: UDS and VDO keep track of which threads are allowed to allocate * memory freely, and which threads must be careful to not do a memory * allocation that does an I/O request. 
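The LZ4_count() helper above measures match length a machine word at a time: it XORs the two candidate words, and LZ4_NbCommonBytes() converts the position of the lowest set bit (on little-endian) into the number of leading bytes that still match. A small standalone userspace illustration of that conversion, using a compiler builtin in place of the kernel's __ffs(); it is separate from the deleted helpers themselves.

	#include <stdint.h>
	#include <stdio.h>
	#include <string.h>

	/* Little-endian only: index of the first mismatching byte in two 64-bit words. */
	static unsigned int first_mismatch_byte(uint64_t a, uint64_t b)
	{
		uint64_t diff = a ^ b;

		if (diff == 0)
			return 8;			/* all eight bytes match */
		return __builtin_ctzll(diff) >> 3;	/* bit index / 8, as in __ffs(diff) >> 3 */
	}

	int main(void)
	{
		uint64_t a, b;

		memcpy(&a, "ABCDEFGH", 8);
		memcpy(&b, "ABCDxFGH", 8);
		printf("%u\n", first_mismatch_byte(a, b));	/* prints 4 */
		return 0;
	}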
The allocating_threads ThreadsRegistry @@ -42,32 +24,30 @@ static struct thread_registry allocating_threads; -/**********************************************************************/ static bool allocations_allowed(void) { const bool *pointer = uds_lookup_thread(&allocating_threads); + return pointer != NULL ? *pointer : false; } -/**********************************************************************/ void uds_register_allocating_thread(struct registered_thread *new_thread, const bool *flag_ptr) { if (flag_ptr == NULL) { static const bool allocation_always_allowed = true; + flag_ptr = &allocation_always_allowed; } uds_register_thread(&allocating_threads, new_thread, flag_ptr); } -/**********************************************************************/ void uds_unregister_allocating_thread(void) { uds_unregister_thread(&allocating_threads); } /* - ****************************************************************************** * Production: We track how much memory has been allocated and freed. When we * unload the UDS module, we log an error if we have not freed all the memory * that we allocated. Nearly all memory allocation and freeing is done using @@ -80,8 +60,10 @@ void uds_unregister_allocating_thread(void) * used. */ -// We allocate very few large objects, and allocation/deallocation isn't done -// in a performance-critical stage for us, so a linked list should be fine. +/* + * We allocate very few large objects, and allocation/deallocation isn't done + * in a performance-critical stage for us, so a linked list should be fine. + */ struct vmalloc_block_info { void *ptr; size_t size; @@ -98,7 +80,6 @@ static struct { struct vmalloc_block_info *vmalloc_list; } memory_stats __cacheline_aligned; -/**********************************************************************/ static void update_peak_usage(void) { size_t total_bytes = @@ -108,10 +89,10 @@ static void update_peak_usage(void) } } -/**********************************************************************/ static void add_kmalloc_block(size_t size) { unsigned long flags; + spin_lock_irqsave(&memory_stats.lock, flags); memory_stats.kmalloc_blocks++; memory_stats.kmalloc_bytes += size; @@ -119,20 +100,20 @@ static void add_kmalloc_block(size_t size) spin_unlock_irqrestore(&memory_stats.lock, flags); } -/**********************************************************************/ static void remove_kmalloc_block(size_t size) { unsigned long flags; + spin_lock_irqsave(&memory_stats.lock, flags); memory_stats.kmalloc_blocks--; memory_stats.kmalloc_bytes -= size; spin_unlock_irqrestore(&memory_stats.lock, flags); } -/**********************************************************************/ static void add_vmalloc_block(struct vmalloc_block_info *block) { unsigned long flags; + spin_lock_irqsave(&memory_stats.lock, flags); block->next = memory_stats.vmalloc_list; memory_stats.vmalloc_list = block; @@ -142,11 +123,11 @@ static void add_vmalloc_block(struct vmalloc_block_info *block) spin_unlock_irqrestore(&memory_stats.lock, flags); } -/**********************************************************************/ static void remove_vmalloc_block(void *ptr) { struct vmalloc_block_info *block, **block_ptr; unsigned long flags; + spin_lock_irqsave(&memory_stats.lock, flags); for (block_ptr = &memory_stats.vmalloc_list; (block = *block_ptr) != NULL; @@ -170,7 +151,8 @@ static void remove_vmalloc_block(void *ptr) /** - * Determine whether allocating a memory block should use kmalloc or vmalloc. 
+ * Determine whether allocating a memory block should use kmalloc or + * __vmalloc. * * vmalloc can allocate any integral number of pages. * @@ -207,7 +189,6 @@ static INLINE bool use_kmalloc(size_t size) return size <= PAGE_SIZE; } -/**********************************************************************/ int uds_allocate_memory(size_t size, size_t align, const char *what, void *ptr) { /* @@ -249,17 +230,12 @@ int uds_allocate_memory(size_t size, size_t align, const char *what, void *ptr) p = kmalloc(size, gfp_flags | __GFP_NOWARN); if (p == NULL) { /* - * If we had just done kmalloc(size, gfp_flags) it is - * possible that the allocation would fail (see - * VDO-3688). The kernel log would then contain a long - * report about the failure. Although the failure - * occurs because there is no page available to - * allocate, by the time it logs the available space, - * there is a page available. So hopefully a short - * sleep will allow the page reclaimer to free a single - * page, which is all that we need. + * It is possible for kmalloc to fail to allocate + * memory because there is no page available (see + * VDO-3688). A short sleep may allow the page + * reclaimer to free a page. */ - msleep(1); + fsleep(1000); p = kmalloc(size, gfp_flags); } if (p != NULL) { @@ -267,19 +243,15 @@ int uds_allocate_memory(size_t size, size_t align, const char *what, void *ptr) } } else { struct vmalloc_block_info *block; + if (UDS_ALLOCATE(1, struct vmalloc_block_info, __func__, &block) == UDS_SUCCESS) { /* - * If we just do __vmalloc(size, gfp_flags, - * PAGE_KERNEL) it is possible that the allocation will - * fail (see VDO-3661). The kernel log will then - * contain a long report about the failure. Although - * the failure occurs because there are not enough - * pages available to allocate, by the time it logs the - * available space, there may enough pages available - * for smaller allocations. So hopefully a short sleep - * will allow the page reclaimer to free enough pages - * for us. + * It is possible for __vmalloc to fail to allocate + * memory because there are no pages available (see + * VDO-3661). A short sleep may allow the page + * reclaimer to free enough pages for a small + * allocation. * * For larger allocations, the kernel page_alloc code * is racing against the page reclaimer. If the page @@ -289,31 +261,24 @@ int uds_allocate_memory(size_t size, size_t align, const char *what, void *ptr) * possible that more retries will succeed. */ for (;;) { -// XXX Take out when all Fedora lab machines have upgraded to 5.8+. ALB-3032. -#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 8, 0) p = __vmalloc(size, gfp_flags | __GFP_NOWARN); -#else - p = __vmalloc(size, - gfp_flags | __GFP_NOWARN, - PAGE_KERNEL); -#endif - // Try again unless we succeeded or more than 1 - // second has elapsed. + /* + * Try again unless we succeeded or more than 1 + * second has elapsed. + */ if ((p != NULL) || (jiffies_to_msecs(jiffies - start_time) > 1000)) { break; } - msleep(1); + fsleep(1000); } if (p == NULL) { - // Try one more time, logging a failure for - // this call. -#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 8, 0) + /* + * Try one more time, logging a failure for + * this call. 
+ */ p = __vmalloc(size, gfp_flags); -#else - p = __vmalloc(size, gfp_flags, PAGE_KERNEL); -#endif } if (p == NULL) { UDS_FREE(block); @@ -331,6 +296,7 @@ int uds_allocate_memory(size_t size, size_t align, const char *what, void *ptr) if (p == NULL) { unsigned int duration = jiffies_to_msecs(jiffies - start_time); + uds_log_error("Could not allocate %zu bytes for %s in %u msecs", size, what, @@ -341,18 +307,17 @@ int uds_allocate_memory(size_t size, size_t align, const char *what, void *ptr) return UDS_SUCCESS; } -/**********************************************************************/ void *uds_allocate_memory_nowait(size_t size, - const char *what __attribute__((unused))) + const char *what __maybe_unused) { void *p = kmalloc(size, GFP_NOWAIT | __GFP_ZERO); + if (p != NULL) { add_kmalloc_block(ksize(p)); } return p; } -/**********************************************************************/ void uds_free_memory(void *ptr) { if (ptr != NULL) { @@ -366,7 +331,6 @@ void uds_free_memory(void *ptr) } } -/**********************************************************************/ int uds_reallocate_memory(void *ptr, size_t old_size, size_t size, @@ -374,7 +338,7 @@ int uds_reallocate_memory(void *ptr, void *new_ptr) { int result; - // Handle special case of zero sized result + /* Handle special case of zero sized result */ if (size == 0) { UDS_FREE(ptr); *(void **) new_ptr = NULL; @@ -396,7 +360,22 @@ int uds_reallocate_memory(void *ptr, return UDS_SUCCESS; } -/**********************************************************************/ +int uds_duplicate_string(const char *string, + const char *what, + char **new_string) +{ + byte *dup; + int result = UDS_ALLOCATE(strlen(string) + 1, byte, what, &dup); + + if (result != UDS_SUCCESS) { + return result; + } + + memcpy(dup, string, strlen(string) + 1); + *new_string = dup; + return UDS_SUCCESS; +} + void uds_memory_init(void) { @@ -404,7 +383,6 @@ void uds_memory_init(void) uds_initialize_thread_registry(&allocating_threads); } -/**********************************************************************/ void uds_memory_exit(void) { @@ -419,22 +397,22 @@ void uds_memory_exit(void) uds_log_debug("peak usage %zd bytes", memory_stats.peak_bytes); } -/**********************************************************************/ void get_uds_memory_stats(uint64_t *bytes_used, uint64_t *peak_bytes_used) { unsigned long flags; + spin_lock_irqsave(&memory_stats.lock, flags); *bytes_used = memory_stats.kmalloc_bytes + memory_stats.vmalloc_bytes; *peak_bytes_used = memory_stats.peak_bytes; spin_unlock_irqrestore(&memory_stats.lock, flags); } -/**********************************************************************/ void report_uds_memory_usage(void) { unsigned long flags; uint64_t kmalloc_blocks, kmalloc_bytes, vmalloc_blocks, vmalloc_bytes; uint64_t peak_usage, total_bytes; + spin_lock_irqsave(&memory_stats.lock, flags); kmalloc_blocks = memory_stats.kmalloc_blocks; kmalloc_bytes = memory_stats.kmalloc_bytes; diff --git a/uds/memoryAlloc.h b/vdo/memory-alloc.h similarity index 86% rename from uds/memoryAlloc.h rename to vdo/memory-alloc.h index 245615a4..02dc5dc2 100644 --- a/uds/memoryAlloc.h +++ b/vdo/memory-alloc.h @@ -1,22 +1,6 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ /* * Copyright Red Hat - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. 
- * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/uds-releases/krusty/src/uds/memoryAlloc.h#21 $ */ #ifndef MEMORY_ALLOC_H @@ -25,11 +9,10 @@ #include "compiler.h" #include "cpu.h" #include "permassert.h" -#include "typeDefs.h" +#include "type-defs.h" -#include // for PAGE_SIZE -#include -#include "threadRegistry.h" +#include /* for PAGE_SIZE */ +#include "thread-registry.h" /** * Allocate storage based on memory size and alignment, logging an error if @@ -54,6 +37,13 @@ int __must_check uds_allocate_memory(size_t size, **/ void uds_free_memory(void *ptr); +/** + * Free memory allocated with UDS_ALLOCATE(). + * + * @param PTR Pointer to the memory to free + **/ +#define UDS_FREE(PTR) uds_free_memory(PTR) + /** * Null out a reference and return a copy of the referenced object. * @@ -113,7 +103,7 @@ static INLINE int uds_do_allocation(size_t count, void *ptr) { size_t total_size = count * size + extra; - // Overflow check: + /* Overflow check: */ if ((size > 0) && (count > ((SIZE_MAX - extra) / size))) { /* * This is kind of a hack: We rely on the fact that SIZE_MAX @@ -205,16 +195,6 @@ int __must_check uds_reallocate_memory(void *ptr, #define UDS_ALLOCATE_IO_ALIGNED(COUNT, TYPE, WHAT, PTR) \ uds_do_allocation(COUNT, sizeof(TYPE), 0, PAGE_SIZE, WHAT, PTR) -/** - * Free memory allocated with UDS_ALLOCATE(). - * - * @param ptr Pointer to the memory to free - **/ -static INLINE void UDS_FREE(void *ptr) -{ - uds_free_memory(ptr); -} - /** * Allocate memory starting on a cache line boundary, logging an error if the * allocation fails. The memory will be zeroed. @@ -269,21 +249,6 @@ int __must_check uds_duplicate_string(const char *string, const char *what, char **new_string); -/** - * Duplicate a buffer, logging an error if the allocation fails. - * - * @param ptr The buffer to copy - * @param size The size of the buffer - * @param what What is being duplicated (for error logging) - * @param dup_ptr A pointer to hold the allocated array - * - * @return UDS_SUCCESS or -ENOMEM - **/ -int __must_check uds_memdup(const void *ptr, - size_t size, - const char *what, - void *dup_ptr); - /** * Wrapper which permits freeing a const pointer. * diff --git a/vdo/messageStats.c b/vdo/messageStats.c index a9b60376..be4cac38 100644 --- a/vdo/messageStats.c +++ b/vdo/messageStats.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * Copyright Red Hat * @@ -17,15 +18,14 @@ * 02110-1301, USA. 
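The allocation macros above all funnel through uds_do_allocation() and report UDS_SUCCESS or an error code through their return value, with the allocated pointer stored through the final argument. A short sketch of that calling convention, following the same shape as the UDS_ALLOCATE(1, struct vmalloc_block_info, __func__, &block) call earlier in memory-alloc.c; the structure and field names here are hypothetical.

	#include "memory-alloc.h"

	/* Hypothetical example structure; only the calling pattern is the point. */
	struct sample_table {
		size_t entry_count;
		uint64_t *entries;
	};

	static int make_sample_table(size_t entry_count,
				     struct sample_table **table_ptr)
	{
		struct sample_table *table;
		int result;

		result = UDS_ALLOCATE(1, struct sample_table, __func__, &table);
		if (result != UDS_SUCCESS) {
			return result;
		}

		result = UDS_ALLOCATE(entry_count, uint64_t, "sample table entries",
				      &table->entries);
		if (result != UDS_SUCCESS) {
			UDS_FREE(table);
			return result;
		}

		table->entry_count = entry_count;
		*table_ptr = table;
		return UDS_SUCCESS;
	}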
*/ -#include "dedupeIndex.h" +#include "dedupe-index.h" #include "logger.h" -#include "memoryAlloc.h" +#include "memory-alloc.h" #include "messageStats.h" #include "statistics.h" -#include "threadDevice.h" +#include "thread-device.h" #include "vdo.h" -/**********************************************************************/ int write_uint64_t(char *prefix, uint64_t value, char *suffix, @@ -44,7 +44,6 @@ int write_uint64_t(char *prefix, return VDO_SUCCESS; } -/**********************************************************************/ int write_uint32_t(char *prefix, uint32_t value, char *suffix, @@ -63,7 +62,6 @@ int write_uint32_t(char *prefix, return VDO_SUCCESS; } -/**********************************************************************/ int write_block_count_t(char *prefix, block_count_t value, char *suffix, @@ -82,7 +80,6 @@ int write_block_count_t(char *prefix, return VDO_SUCCESS; } -/**********************************************************************/ int write_string(char *prefix, char *value, char *suffix, @@ -101,7 +98,6 @@ int write_string(char *prefix, return VDO_SUCCESS; } -/**********************************************************************/ int write_bool(char *prefix, bool value, char *suffix, @@ -120,7 +116,6 @@ int write_bool(char *prefix, return VDO_SUCCESS; } -/**********************************************************************/ int write_uint8_t(char *prefix, uint8_t value, char *suffix, @@ -139,7 +134,6 @@ int write_uint8_t(char *prefix, return VDO_SUCCESS; } -/**********************************************************************/ int write_block_allocator_statistics(char *prefix, struct block_allocator_statistics *stats, char *suffix, @@ -150,7 +144,7 @@ int write_block_allocator_statistics(char *prefix, if (result != VDO_SUCCESS) { return result; } - /** The total number of slabs from which blocks may be allocated */ + /* The total number of slabs from which blocks may be allocated */ result = write_uint64_t("slabCount : ", stats->slab_count, ", ", @@ -159,7 +153,7 @@ int write_block_allocator_statistics(char *prefix, if (result != VDO_SUCCESS) { return result; } - /** The total number of slabs from which blocks have ever been allocated */ + /* The total number of slabs from which blocks have ever been allocated */ result = write_uint64_t("slabsOpened : ", stats->slabs_opened, ", ", @@ -168,7 +162,7 @@ int write_block_allocator_statistics(char *prefix, if (result != VDO_SUCCESS) { return result; } - /** The number of times since loading that a slab has been re-opened */ + /* The number of times since loading that a slab has been re-opened */ result = write_uint64_t("slabsReopened : ", stats->slabs_reopened, ", ", @@ -184,7 +178,6 @@ int write_block_allocator_statistics(char *prefix, return VDO_SUCCESS; } -/**********************************************************************/ int write_commit_statistics(char *prefix, struct commit_statistics *stats, char *suffix, @@ -195,7 +188,7 @@ int write_commit_statistics(char *prefix, if (result != VDO_SUCCESS) { return result; } - /** The total number of items on which processing has started */ + /* The total number of items on which processing has started */ result = write_uint64_t("started : ", stats->started, ", ", @@ -204,7 +197,7 @@ int write_commit_statistics(char *prefix, if (result != VDO_SUCCESS) { return result; } - /** The total number of items for which a write operation has been issued */ + /* The total number of items for which a write operation has been issued */ result = write_uint64_t("written : ", 
stats->written, ", ", @@ -213,7 +206,7 @@ int write_commit_statistics(char *prefix, if (result != VDO_SUCCESS) { return result; } - /** The total number of items for which a write operation has completed */ + /* The total number of items for which a write operation has completed */ result = write_uint64_t("committed : ", stats->committed, ", ", @@ -229,7 +222,6 @@ int write_commit_statistics(char *prefix, return VDO_SUCCESS; } -/**********************************************************************/ int write_recovery_journal_statistics(char *prefix, struct recovery_journal_statistics *stats, char *suffix, @@ -240,7 +232,7 @@ int write_recovery_journal_statistics(char *prefix, if (result != VDO_SUCCESS) { return result; } - /** Number of times the on-disk journal was full */ + /* Number of times the on-disk journal was full */ result = write_uint64_t("diskFull : ", stats->disk_full, ", ", @@ -249,7 +241,7 @@ int write_recovery_journal_statistics(char *prefix, if (result != VDO_SUCCESS) { return result; } - /** Number of times the recovery journal requested slab journal commits. */ + /* Number of times the recovery journal requested slab journal commits. */ result = write_uint64_t("slabJournalCommitsRequested : ", stats->slab_journal_commits_requested, ", ", @@ -258,7 +250,7 @@ int write_recovery_journal_statistics(char *prefix, if (result != VDO_SUCCESS) { return result; } - /** Write/Commit totals for individual journal entries */ + /* Write/Commit totals for individual journal entries */ result = write_commit_statistics("entries : ", &stats->entries, ", ", @@ -267,7 +259,7 @@ int write_recovery_journal_statistics(char *prefix, if (result != VDO_SUCCESS) { return result; } - /** Write/Commit totals for journal blocks */ + /* Write/Commit totals for journal blocks */ result = write_commit_statistics("blocks : ", &stats->blocks, ", ", @@ -283,7 +275,6 @@ int write_recovery_journal_statistics(char *prefix, return VDO_SUCCESS; } -/**********************************************************************/ int write_packer_statistics(char *prefix, struct packer_statistics *stats, char *suffix, @@ -294,7 +285,7 @@ int write_packer_statistics(char *prefix, if (result != VDO_SUCCESS) { return result; } - /** Number of compressed data items written since startup */ + /* Number of compressed data items written since startup */ result = write_uint64_t("compressedFragmentsWritten : ", stats->compressed_fragments_written, ", ", @@ -303,7 +294,7 @@ int write_packer_statistics(char *prefix, if (result != VDO_SUCCESS) { return result; } - /** Number of blocks containing compressed items written since startup */ + /* Number of blocks containing compressed items written since startup */ result = write_uint64_t("compressedBlocksWritten : ", stats->compressed_blocks_written, ", ", @@ -312,7 +303,7 @@ int write_packer_statistics(char *prefix, if (result != VDO_SUCCESS) { return result; } - /** Number of VIOs that are pending in the packer */ + /* Number of VIOs that are pending in the packer */ result = write_uint64_t("compressedFragmentsInPacker : ", stats->compressed_fragments_in_packer, ", ", @@ -328,7 +319,6 @@ int write_packer_statistics(char *prefix, return VDO_SUCCESS; } -/**********************************************************************/ int write_slab_journal_statistics(char *prefix, struct slab_journal_statistics *stats, char *suffix, @@ -339,7 +329,7 @@ int write_slab_journal_statistics(char *prefix, if (result != VDO_SUCCESS) { return result; } - /** Number of times the on-disk journal was full 
*/ + /* Number of times the on-disk journal was full */ result = write_uint64_t("diskFullCount : ", stats->disk_full_count, ", ", @@ -348,7 +338,7 @@ int write_slab_journal_statistics(char *prefix, if (result != VDO_SUCCESS) { return result; } - /** Number of times an entry was added over the flush threshold */ + /* Number of times an entry was added over the flush threshold */ result = write_uint64_t("flushCount : ", stats->flush_count, ", ", @@ -357,7 +347,7 @@ int write_slab_journal_statistics(char *prefix, if (result != VDO_SUCCESS) { return result; } - /** Number of times an entry was added over the block threshold */ + /* Number of times an entry was added over the block threshold */ result = write_uint64_t("blockedCount : ", stats->blocked_count, ", ", @@ -366,7 +356,7 @@ int write_slab_journal_statistics(char *prefix, if (result != VDO_SUCCESS) { return result; } - /** Number of times a tail block was written */ + /* Number of times a tail block was written */ result = write_uint64_t("blocksWritten : ", stats->blocks_written, ", ", @@ -375,7 +365,7 @@ int write_slab_journal_statistics(char *prefix, if (result != VDO_SUCCESS) { return result; } - /** Number of times we had to wait for the tail to write */ + /* Number of times we had to wait for the tail to write */ result = write_uint64_t("tailBusyCount : ", stats->tail_busy_count, ", ", @@ -391,7 +381,6 @@ int write_slab_journal_statistics(char *prefix, return VDO_SUCCESS; } -/**********************************************************************/ int write_slab_summary_statistics(char *prefix, struct slab_summary_statistics *stats, char *suffix, @@ -402,7 +391,7 @@ int write_slab_summary_statistics(char *prefix, if (result != VDO_SUCCESS) { return result; } - /** Number of blocks written */ + /* Number of blocks written */ result = write_uint64_t("blocksWritten : ", stats->blocks_written, ", ", @@ -418,7 +407,6 @@ int write_slab_summary_statistics(char *prefix, return VDO_SUCCESS; } -/**********************************************************************/ int write_ref_counts_statistics(char *prefix, struct ref_counts_statistics *stats, char *suffix, @@ -429,7 +417,7 @@ int write_ref_counts_statistics(char *prefix, if (result != VDO_SUCCESS) { return result; } - /** Number of reference blocks written */ + /* Number of reference blocks written */ result = write_uint64_t("blocksWritten : ", stats->blocks_written, ", ", @@ -445,7 +433,6 @@ int write_ref_counts_statistics(char *prefix, return VDO_SUCCESS; } -/**********************************************************************/ int write_block_map_statistics(char *prefix, struct block_map_statistics *stats, char *suffix, @@ -456,7 +443,7 @@ int write_block_map_statistics(char *prefix, if (result != VDO_SUCCESS) { return result; } - /** number of dirty (resident) pages */ + /* number of dirty (resident) pages */ result = write_uint32_t("dirtyPages : ", stats->dirty_pages, ", ", @@ -465,7 +452,7 @@ int write_block_map_statistics(char *prefix, if (result != VDO_SUCCESS) { return result; } - /** number of clean (resident) pages */ + /* number of clean (resident) pages */ result = write_uint32_t("cleanPages : ", stats->clean_pages, ", ", @@ -474,7 +461,7 @@ int write_block_map_statistics(char *prefix, if (result != VDO_SUCCESS) { return result; } - /** number of free pages */ + /* number of free pages */ result = write_uint32_t("freePages : ", stats->free_pages, ", ", @@ -483,7 +470,7 @@ int write_block_map_statistics(char *prefix, if (result != VDO_SUCCESS) { return result; } - /** 
number of pages in failed state */ + /* number of pages in failed state */ result = write_uint32_t("failedPages : ", stats->failed_pages, ", ", @@ -492,7 +479,7 @@ int write_block_map_statistics(char *prefix, if (result != VDO_SUCCESS) { return result; } - /** number of pages incoming */ + /* number of pages incoming */ result = write_uint32_t("incomingPages : ", stats->incoming_pages, ", ", @@ -501,7 +488,7 @@ int write_block_map_statistics(char *prefix, if (result != VDO_SUCCESS) { return result; } - /** number of pages outgoing */ + /* number of pages outgoing */ result = write_uint32_t("outgoingPages : ", stats->outgoing_pages, ", ", @@ -510,7 +497,7 @@ int write_block_map_statistics(char *prefix, if (result != VDO_SUCCESS) { return result; } - /** how many times free page not avail */ + /* how many times free page not avail */ result = write_uint32_t("cachePressure : ", stats->cache_pressure, ", ", @@ -519,7 +506,7 @@ int write_block_map_statistics(char *prefix, if (result != VDO_SUCCESS) { return result; } - /** number of get_vdo_page() calls for read */ + /* number of get_vdo_page() calls for read */ result = write_uint64_t("readCount : ", stats->read_count, ", ", @@ -528,7 +515,7 @@ int write_block_map_statistics(char *prefix, if (result != VDO_SUCCESS) { return result; } - /** number of get_vdo_page() calls for write */ + /* number of get_vdo_page() calls for write */ result = write_uint64_t("writeCount : ", stats->write_count, ", ", @@ -537,7 +524,7 @@ int write_block_map_statistics(char *prefix, if (result != VDO_SUCCESS) { return result; } - /** number of times pages failed to read */ + /* number of times pages failed to read */ result = write_uint64_t("failedReads : ", stats->failed_reads, ", ", @@ -546,7 +533,7 @@ int write_block_map_statistics(char *prefix, if (result != VDO_SUCCESS) { return result; } - /** number of times pages failed to write */ + /* number of times pages failed to write */ result = write_uint64_t("failedWrites : ", stats->failed_writes, ", ", @@ -555,7 +542,7 @@ int write_block_map_statistics(char *prefix, if (result != VDO_SUCCESS) { return result; } - /** number of gets that are reclaimed */ + /* number of gets that are reclaimed */ result = write_uint64_t("reclaimed : ", stats->reclaimed, ", ", @@ -564,7 +551,7 @@ int write_block_map_statistics(char *prefix, if (result != VDO_SUCCESS) { return result; } - /** number of gets for outgoing pages */ + /* number of gets for outgoing pages */ result = write_uint64_t("readOutgoing : ", stats->read_outgoing, ", ", @@ -573,7 +560,7 @@ int write_block_map_statistics(char *prefix, if (result != VDO_SUCCESS) { return result; } - /** number of gets that were already there */ + /* number of gets that were already there */ result = write_uint64_t("foundInCache : ", stats->found_in_cache, ", ", @@ -582,7 +569,7 @@ int write_block_map_statistics(char *prefix, if (result != VDO_SUCCESS) { return result; } - /** number of gets requiring discard */ + /* number of gets requiring discard */ result = write_uint64_t("discardRequired : ", stats->discard_required, ", ", @@ -591,7 +578,7 @@ int write_block_map_statistics(char *prefix, if (result != VDO_SUCCESS) { return result; } - /** number of gets enqueued for their page */ + /* number of gets enqueued for their page */ result = write_uint64_t("waitForPage : ", stats->wait_for_page, ", ", @@ -600,7 +587,7 @@ int write_block_map_statistics(char *prefix, if (result != VDO_SUCCESS) { return result; } - /** number of gets that have to fetch */ + /* number of gets that have to 
fetch */ result = write_uint64_t("fetchRequired : ", stats->fetch_required, ", ", @@ -609,7 +596,7 @@ int write_block_map_statistics(char *prefix, if (result != VDO_SUCCESS) { return result; } - /** number of page fetches */ + /* number of page fetches */ result = write_uint64_t("pagesLoaded : ", stats->pages_loaded, ", ", @@ -618,7 +605,7 @@ int write_block_map_statistics(char *prefix, if (result != VDO_SUCCESS) { return result; } - /** number of page saves */ + /* number of page saves */ result = write_uint64_t("pagesSaved : ", stats->pages_saved, ", ", @@ -627,7 +614,7 @@ int write_block_map_statistics(char *prefix, if (result != VDO_SUCCESS) { return result; } - /** the number of flushes issued */ + /* the number of flushes issued */ result = write_uint64_t("flushCount : ", stats->flush_count, ", ", @@ -643,7 +630,6 @@ int write_block_map_statistics(char *prefix, return VDO_SUCCESS; } -/**********************************************************************/ int write_hash_lock_statistics(char *prefix, struct hash_lock_statistics *stats, char *suffix, @@ -654,7 +640,7 @@ int write_hash_lock_statistics(char *prefix, if (result != VDO_SUCCESS) { return result; } - /** Number of times the UDS advice proved correct */ + /* Number of times the UDS advice proved correct */ result = write_uint64_t("dedupeAdviceValid : ", stats->dedupe_advice_valid, ", ", @@ -663,7 +649,7 @@ int write_hash_lock_statistics(char *prefix, if (result != VDO_SUCCESS) { return result; } - /** Number of times the UDS advice proved incorrect */ + /* Number of times the UDS advice proved incorrect */ result = write_uint64_t("dedupeAdviceStale : ", stats->dedupe_advice_stale, ", ", @@ -672,7 +658,7 @@ int write_hash_lock_statistics(char *prefix, if (result != VDO_SUCCESS) { return result; } - /** Number of writes with the same data as another in-flight write */ + /* Number of writes with the same data as another in-flight write */ result = write_uint64_t("concurrentDataMatches : ", stats->concurrent_data_matches, ", ", @@ -681,7 +667,7 @@ int write_hash_lock_statistics(char *prefix, if (result != VDO_SUCCESS) { return result; } - /** Number of writes whose hash collided with an in-flight write */ + /* Number of writes whose hash collided with an in-flight write */ result = write_uint64_t("concurrentHashCollisions : ", stats->concurrent_hash_collisions, ", ", @@ -697,7 +683,6 @@ int write_hash_lock_statistics(char *prefix, return VDO_SUCCESS; } -/**********************************************************************/ int write_error_statistics(char *prefix, struct error_statistics *stats, char *suffix, @@ -708,7 +693,7 @@ int write_error_statistics(char *prefix, if (result != VDO_SUCCESS) { return result; } - /** number of times VDO got an invalid dedupe advice PBN from UDS */ + /* number of times VDO got an invalid dedupe advice PBN from UDS */ result = write_uint64_t("invalidAdvicePBNCount : ", stats->invalid_advice_pbn_count, ", ", @@ -717,7 +702,7 @@ int write_error_statistics(char *prefix, if (result != VDO_SUCCESS) { return result; } - /** number of times a VIO completed with a VDO_NO_SPACE error */ + /* number of times a VIO completed with a VDO_NO_SPACE error */ result = write_uint64_t("noSpaceErrorCount : ", stats->no_space_error_count, ", ", @@ -726,7 +711,7 @@ int write_error_statistics(char *prefix, if (result != VDO_SUCCESS) { return result; } - /** number of times a VIO completed with a VDO_READ_ONLY error */ + /* number of times a VIO completed with a VDO_READ_ONLY error */ result = 
write_uint64_t("readOnlyErrorCount : ", stats->read_only_error_count, ", ", @@ -742,7 +727,6 @@ int write_error_statistics(char *prefix, return VDO_SUCCESS; } -/**********************************************************************/ int write_bio_stats(char *prefix, struct bio_stats *stats, char *suffix, @@ -753,7 +737,7 @@ int write_bio_stats(char *prefix, if (result != VDO_SUCCESS) { return result; } - /** Number of REQ_OP_READ bios */ + /* Number of REQ_OP_READ bios */ result = write_uint64_t("read : ", stats->read, ", ", @@ -762,7 +746,7 @@ int write_bio_stats(char *prefix, if (result != VDO_SUCCESS) { return result; } - /** Number of REQ_OP_WRITE bios with data */ + /* Number of REQ_OP_WRITE bios with data */ result = write_uint64_t("write : ", stats->write, ", ", @@ -771,7 +755,7 @@ int write_bio_stats(char *prefix, if (result != VDO_SUCCESS) { return result; } - /** Number of bios tagged with REQ_PREFLUSH and containing no data */ + /* Number of bios tagged with REQ_PREFLUSH and containing no data */ result = write_uint64_t("emptyFlush : ", stats->empty_flush, ", ", @@ -780,7 +764,7 @@ int write_bio_stats(char *prefix, if (result != VDO_SUCCESS) { return result; } - /** Number of REQ_OP_DISCARD bios */ + /* Number of REQ_OP_DISCARD bios */ result = write_uint64_t("discard : ", stats->discard, ", ", @@ -789,7 +773,7 @@ int write_bio_stats(char *prefix, if (result != VDO_SUCCESS) { return result; } - /** Number of bios tagged with REQ_PREFLUSH */ + /* Number of bios tagged with REQ_PREFLUSH */ result = write_uint64_t("flush : ", stats->flush, ", ", @@ -798,7 +782,7 @@ int write_bio_stats(char *prefix, if (result != VDO_SUCCESS) { return result; } - /** Number of bios tagged with REQ_FUA */ + /* Number of bios tagged with REQ_FUA */ result = write_uint64_t("fua : ", stats->fua, ", ", @@ -814,7 +798,6 @@ int write_bio_stats(char *prefix, return VDO_SUCCESS; } -/**********************************************************************/ int write_memory_usage(char *prefix, struct memory_usage *stats, char *suffix, @@ -825,7 +808,7 @@ int write_memory_usage(char *prefix, if (result != VDO_SUCCESS) { return result; } - /** Tracked bytes currently allocated. */ + /* Tracked bytes currently allocated. */ result = write_uint64_t("bytesUsed : ", stats->bytes_used, ", ", @@ -834,7 +817,7 @@ int write_memory_usage(char *prefix, if (result != VDO_SUCCESS) { return result; } - /** Maximum tracked bytes allocated. */ + /* Maximum tracked bytes allocated. 
*/ result = write_uint64_t("peakBytesUsed : ", stats->peak_bytes_used, ", ", @@ -850,7 +833,6 @@ int write_memory_usage(char *prefix, return VDO_SUCCESS; } -/**********************************************************************/ int write_index_statistics(char *prefix, struct index_statistics *stats, char *suffix, @@ -861,7 +843,7 @@ int write_index_statistics(char *prefix, if (result != VDO_SUCCESS) { return result; } - /** Number of chunk names stored in the index */ + /* Number of chunk names stored in the index */ result = write_uint64_t("entriesIndexed : ", stats->entries_indexed, ", ", @@ -870,7 +852,7 @@ int write_index_statistics(char *prefix, if (result != VDO_SUCCESS) { return result; } - /** Number of post calls that found an existing entry */ + /* Number of post calls that found an existing entry */ result = write_uint64_t("postsFound : ", stats->posts_found, ", ", @@ -879,7 +861,7 @@ int write_index_statistics(char *prefix, if (result != VDO_SUCCESS) { return result; } - /** Number of post calls that added a new entry */ + /* Number of post calls that added a new entry */ result = write_uint64_t("postsNotFound : ", stats->posts_not_found, ", ", @@ -888,7 +870,7 @@ int write_index_statistics(char *prefix, if (result != VDO_SUCCESS) { return result; } - /** Number of query calls that found an existing entry */ + /* Number of query calls that found an existing entry */ result = write_uint64_t("queriesFound : ", stats->queries_found, ", ", @@ -897,7 +879,7 @@ int write_index_statistics(char *prefix, if (result != VDO_SUCCESS) { return result; } - /** Number of query calls that added a new entry */ + /* Number of query calls that added a new entry */ result = write_uint64_t("queriesNotFound : ", stats->queries_not_found, ", ", @@ -906,7 +888,7 @@ int write_index_statistics(char *prefix, if (result != VDO_SUCCESS) { return result; } - /** Number of update calls that found an existing entry */ + /* Number of update calls that found an existing entry */ result = write_uint64_t("updatesFound : ", stats->updates_found, ", ", @@ -915,7 +897,7 @@ int write_index_statistics(char *prefix, if (result != VDO_SUCCESS) { return result; } - /** Number of update calls that added a new entry */ + /* Number of update calls that added a new entry */ result = write_uint64_t("updatesNotFound : ", stats->updates_not_found, ", ", @@ -924,7 +906,7 @@ int write_index_statistics(char *prefix, if (result != VDO_SUCCESS) { return result; } - /** Current number of dedupe queries that are in flight */ + /* Current number of dedupe queries that are in flight */ result = write_uint32_t("currDedupeQueries : ", stats->curr_dedupe_queries, ", ", @@ -933,7 +915,7 @@ int write_index_statistics(char *prefix, if (result != VDO_SUCCESS) { return result; } - /** Maximum number of dedupe queries that have been in flight */ + /* Maximum number of dedupe queries that have been in flight */ result = write_uint32_t("maxDedupeQueries : ", stats->max_dedupe_queries, ", ", @@ -949,7 +931,6 @@ int write_index_statistics(char *prefix, return VDO_SUCCESS; } -/**********************************************************************/ int write_vdo_statistics(char *prefix, struct vdo_statistics *stats, char *suffix, @@ -976,7 +957,7 @@ int write_vdo_statistics(char *prefix, if (result != VDO_SUCCESS) { return result; } - /** Number of blocks used for data */ + /* Number of blocks used for data */ result = write_uint64_t("dataBlocksUsed : ", stats->data_blocks_used, ", ", @@ -985,7 +966,7 @@ int write_vdo_statistics(char *prefix, if 
(result != VDO_SUCCESS) { return result; } - /** Number of blocks used for VDO metadata */ + /* Number of blocks used for VDO metadata */ result = write_uint64_t("overheadBlocksUsed : ", stats->overhead_blocks_used, ", ", @@ -994,7 +975,7 @@ int write_vdo_statistics(char *prefix, if (result != VDO_SUCCESS) { return result; } - /** Number of logical blocks that are currently mapped to physical blocks */ + /* Number of logical blocks that are currently mapped to physical blocks */ result = write_uint64_t("logicalBlocksUsed : ", stats->logical_blocks_used, ", ", @@ -1003,7 +984,7 @@ int write_vdo_statistics(char *prefix, if (result != VDO_SUCCESS) { return result; } - /** number of physical blocks */ + /* number of physical blocks */ result = write_block_count_t("physicalBlocks : ", stats->physical_blocks, ", ", @@ -1012,7 +993,7 @@ int write_vdo_statistics(char *prefix, if (result != VDO_SUCCESS) { return result; } - /** number of logical blocks */ + /* number of logical blocks */ result = write_block_count_t("logicalBlocks : ", stats->logical_blocks, ", ", @@ -1021,7 +1002,7 @@ int write_vdo_statistics(char *prefix, if (result != VDO_SUCCESS) { return result; } - /** Size of the block map page cache, in bytes */ + /* Size of the block map page cache, in bytes */ result = write_uint64_t("blockMapCacheSize : ", stats->block_map_cache_size, ", ", @@ -1030,7 +1011,7 @@ int write_vdo_statistics(char *prefix, if (result != VDO_SUCCESS) { return result; } - /** The physical block size */ + /* The physical block size */ result = write_uint64_t("blockSize : ", stats->block_size, ", ", @@ -1039,7 +1020,7 @@ int write_vdo_statistics(char *prefix, if (result != VDO_SUCCESS) { return result; } - /** Number of times the VDO has successfully recovered */ + /* Number of times the VDO has successfully recovered */ result = write_uint64_t("completeRecoveries : ", stats->complete_recoveries, ", ", @@ -1048,7 +1029,7 @@ int write_vdo_statistics(char *prefix, if (result != VDO_SUCCESS) { return result; } - /** Number of times the VDO has recovered from read-only mode */ + /* Number of times the VDO has recovered from read-only mode */ result = write_uint64_t("readOnlyRecoveries : ", stats->read_only_recoveries, ", ", @@ -1057,7 +1038,7 @@ int write_vdo_statistics(char *prefix, if (result != VDO_SUCCESS) { return result; } - /** String describing the operating mode of the VDO */ + /* String describing the operating mode of the VDO */ result = write_string("mode : ", stats->mode, ", ", @@ -1066,7 +1047,7 @@ int write_vdo_statistics(char *prefix, if (result != VDO_SUCCESS) { return result; } - /** Whether the VDO is in recovery mode */ + /* Whether the VDO is in recovery mode */ result = write_bool("inRecoveryMode : ", stats->in_recovery_mode, ", ", @@ -1075,7 +1056,7 @@ int write_vdo_statistics(char *prefix, if (result != VDO_SUCCESS) { return result; } - /** What percentage of recovery mode work has been completed */ + /* What percentage of recovery mode work has been completed */ result = write_uint8_t("recoveryPercentage : ", stats->recovery_percentage, ", ", @@ -1084,7 +1065,7 @@ int write_vdo_statistics(char *prefix, if (result != VDO_SUCCESS) { return result; } - /** The statistics for the compressed block packer */ + /* The statistics for the compressed block packer */ result = write_packer_statistics("packer : ", &stats->packer, ", ", @@ -1093,7 +1074,7 @@ int write_vdo_statistics(char *prefix, if (result != VDO_SUCCESS) { return result; } - /** Counters for events in the block allocator */ + /* Counters 
for events in the block allocator */ result = write_block_allocator_statistics("allocator : ", &stats->allocator, ", ", @@ -1102,7 +1083,7 @@ int write_vdo_statistics(char *prefix, if (result != VDO_SUCCESS) { return result; } - /** Counters for events in the recovery journal */ + /* Counters for events in the recovery journal */ result = write_recovery_journal_statistics("journal : ", &stats->journal, ", ", @@ -1111,7 +1092,7 @@ int write_vdo_statistics(char *prefix, if (result != VDO_SUCCESS) { return result; } - /** The statistics for the slab journals */ + /* The statistics for the slab journals */ result = write_slab_journal_statistics("slabJournal : ", &stats->slab_journal, ", ", @@ -1120,7 +1101,7 @@ int write_vdo_statistics(char *prefix, if (result != VDO_SUCCESS) { return result; } - /** The statistics for the slab summary */ + /* The statistics for the slab summary */ result = write_slab_summary_statistics("slabSummary : ", &stats->slab_summary, ", ", @@ -1129,7 +1110,7 @@ int write_vdo_statistics(char *prefix, if (result != VDO_SUCCESS) { return result; } - /** The statistics for the reference counts */ + /* The statistics for the reference counts */ result = write_ref_counts_statistics("refCounts : ", &stats->ref_counts, ", ", @@ -1138,7 +1119,7 @@ int write_vdo_statistics(char *prefix, if (result != VDO_SUCCESS) { return result; } - /** The statistics for the block map */ + /* The statistics for the block map */ result = write_block_map_statistics("blockMap : ", &stats->block_map, ", ", @@ -1147,7 +1128,7 @@ int write_vdo_statistics(char *prefix, if (result != VDO_SUCCESS) { return result; } - /** The dedupe statistics from hash locks */ + /* The dedupe statistics from hash locks */ result = write_hash_lock_statistics("hashLock : ", &stats->hash_lock, ", ", @@ -1156,7 +1137,7 @@ int write_vdo_statistics(char *prefix, if (result != VDO_SUCCESS) { return result; } - /** Counts of error conditions */ + /* Counts of error conditions */ result = write_error_statistics("errors : ", &stats->errors, ", ", @@ -1165,7 +1146,7 @@ int write_vdo_statistics(char *prefix, if (result != VDO_SUCCESS) { return result; } - /** The VDO instance */ + /* The VDO instance */ result = write_uint32_t("instance : ", stats->instance, ", ", @@ -1174,7 +1155,7 @@ int write_vdo_statistics(char *prefix, if (result != VDO_SUCCESS) { return result; } - /** Current number of active VIOs */ + /* Current number of active VIOs */ result = write_uint32_t("currentVIOsInProgress : ", stats->current_vios_in_progress, ", ", @@ -1183,7 +1164,7 @@ int write_vdo_statistics(char *prefix, if (result != VDO_SUCCESS) { return result; } - /** Maximum number of active VIOs */ + /* Maximum number of active VIOs */ result = write_uint32_t("maxVIOs : ", stats->max_vios, ", ", @@ -1192,7 +1173,7 @@ int write_vdo_statistics(char *prefix, if (result != VDO_SUCCESS) { return result; } - /** Number of times the UDS index was too slow in responding */ + /* Number of times the UDS index was too slow in responding */ result = write_uint64_t("dedupeAdviceTimeouts : ", stats->dedupe_advice_timeouts, ", ", @@ -1201,7 +1182,7 @@ int write_vdo_statistics(char *prefix, if (result != VDO_SUCCESS) { return result; } - /** Number of flush requests submitted to the storage device */ + /* Number of flush requests submitted to the storage device */ result = write_uint64_t("flushOut : ", stats->flush_out, ", ", @@ -1210,7 +1191,7 @@ int write_vdo_statistics(char *prefix, if (result != VDO_SUCCESS) { return result; } - /** Logical block size */ + /* 
Logical block size */ result = write_uint64_t("logicalBlockSize : ", stats->logical_block_size, ", ", @@ -1219,7 +1200,7 @@ int write_vdo_statistics(char *prefix, if (result != VDO_SUCCESS) { return result; } - /** Bios submitted into VDO from above */ + /* Bios submitted into VDO from above */ result = write_bio_stats("biosIn : ", &stats->bios_in, ", ", @@ -1236,7 +1217,7 @@ int write_vdo_statistics(char *prefix, if (result != VDO_SUCCESS) { return result; } - /** Bios submitted onward for user data */ + /* Bios submitted onward for user data */ result = write_bio_stats("biosOut : ", &stats->bios_out, ", ", @@ -1245,7 +1226,7 @@ int write_vdo_statistics(char *prefix, if (result != VDO_SUCCESS) { return result; } - /** Bios submitted onward for metadata */ + /* Bios submitted onward for metadata */ result = write_bio_stats("biosMeta : ", &stats->bios_meta, ", ", @@ -1318,7 +1299,7 @@ int write_vdo_statistics(char *prefix, if (result != VDO_SUCCESS) { return result; } - /** Current number of bios in progress */ + /* Current number of bios in progress */ result = write_bio_stats("biosInProgress : ", &stats->bios_in_progress, ", ", @@ -1327,7 +1308,7 @@ int write_vdo_statistics(char *prefix, if (result != VDO_SUCCESS) { return result; } - /** Memory usage stats. */ + /* Memory usage stats. */ result = write_memory_usage("memoryUsage : ", &stats->memory_usage, ", ", @@ -1336,7 +1317,7 @@ int write_vdo_statistics(char *prefix, if (result != VDO_SUCCESS) { return result; } - /** The statistics for the UDS index */ + /* The statistics for the UDS index */ result = write_index_statistics("index : ", &stats->index, ", ", @@ -1352,8 +1333,7 @@ int write_vdo_statistics(char *prefix, return VDO_SUCCESS; } -/**********************************************************************/ -int write_vdo_stats(struct vdo *vdo, +int vdo_write_stats(struct vdo *vdo, char *buf, unsigned int maxlen) { @@ -1363,7 +1343,7 @@ int write_vdo_stats(struct vdo *vdo, return result; } - fetch_vdo_statistics(vdo, stats); + vdo_fetch_statistics(vdo, stats); result = write_vdo_statistics(NULL, stats, NULL, &buf, &maxlen); UDS_FREE(stats); return result; diff --git a/vdo/messageStats.h b/vdo/messageStats.h index 0d7e52fa..06bef032 100644 --- a/vdo/messageStats.h +++ b/vdo/messageStats.h @@ -1,23 +1,6 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ /* * Copyright Red Hat - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. 
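All of the write_*_statistics() functions in messageStats.c above follow one template: emit a "name : value, " field per statistic through the typed helpers and return on the first result that is not VDO_SUCCESS. A sketch of that template for a hypothetical two-field structure, written as if it sat alongside the existing writers; the enclosing delimiter writes that open and close each group are not visible in these hunks and are omitted here as well.

	/* Hypothetical statistics structure, only to illustrate the writer template. */
	struct sample_statistics {
		uint64_t widgets_seen;
		uint64_t widgets_dropped;
	};

	static int write_sample_statistics(char *prefix,
					   struct sample_statistics *stats,
					   char *suffix,
					   char **buf,
					   unsigned int *maxlen)
	{
		int result;

		/*
		 * The real writers also emit prefix, an opening delimiter, a closing
		 * delimiter, and suffix around the fields; that part is not reproduced.
		 */
		/* Number of widgets examined */
		result = write_uint64_t("widgetsSeen : ",
					stats->widgets_seen,
					", ",
					buf,
					maxlen);
		if (result != VDO_SUCCESS) {
			return result;
		}
		/* Number of widgets discarded */
		result = write_uint64_t("widgetsDropped : ",
					stats->widgets_dropped,
					", ",
					buf,
					maxlen);
		if (result != VDO_SUCCESS) {
			return result;
		}
		return VDO_SUCCESS;
	}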
- * - * $Id: //eng/vdo-releases/sulfur/src/c++/vdo/kernel/messageStats.h#7 $ - * */ #ifndef MESSAGE_STATS_H @@ -25,13 +8,6 @@ #include "types.h" -/** - * Write vdo statistics to a buffer - * - * @param vdo the vdo - * @param buf pointer to the buffer - * @param maxlen the maximum length of the buffer - */ -int write_vdo_stats(struct vdo *vdo, char *buf, unsigned int maxlen); +int vdo_write_stats(struct vdo *vdo, char *buf, unsigned int maxlen); #endif /* MESSAGE_STATS_H */ diff --git a/vdo/murmurhash3.c b/vdo/murmurhash3.c new file mode 100644 index 00000000..a714cfaf --- /dev/null +++ b/vdo/murmurhash3.c @@ -0,0 +1,179 @@ +/* SPDX-License-Identifier: LGPL-2.1+ */ +/* + * MurmurHash3 was written by Austin Appleby, and is placed in the public + * domain. The author hereby disclaims copyright to this source code. + * + * Adapted by John Wiele (jwiele@redhat.com). + */ + +#include +#include + +static inline uint64_t rotl64(uint64_t x, int8_t r) +{ + return (x << r) | (x >> (64 - r)); +} + +#define ROTL64(x, y) rotl64(x, y) +static __always_inline uint64_t getblock64(const uint64_t *p, int i) +{ +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ + return p[i]; +#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ + return __builtin_bswap64(p[i]); +#else +#error "can't figure out byte order" +#endif +} + +static __always_inline void putblock64(uint64_t *p, int i, uint64_t value) +{ +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ + p[i] = value; +#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ + p[i] = __builtin_bswap64(value); +#else +#error "can't figure out byte order" +#endif +} + +/* Finalization mix - force all bits of a hash block to avalanche */ + +static __always_inline uint64_t fmix64(uint64_t k) +{ + k ^= k >> 33; + k *= 0xff51afd7ed558ccdLLU; + k ^= k >> 33; + k *= 0xc4ceb9fe1a85ec53LLU; + k ^= k >> 33; + + return k; +} + +void murmurhash3_128(const void *key, const int len, const uint32_t seed, + void *out) +{ + const uint8_t *data = (const uint8_t *)key; + const int nblocks = len / 16; + + uint64_t h1 = seed; + uint64_t h2 = seed; + + const uint64_t c1 = 0x87c37b91114253d5LLU; + const uint64_t c2 = 0x4cf5ad432745937fLLU; + + /* body */ + + const uint64_t *blocks = (const uint64_t *)(data); + + int i; + + for (i = 0; i < nblocks; i++) { + uint64_t k1 = getblock64(blocks, i * 2 + 0); + uint64_t k2 = getblock64(blocks, i * 2 + 1); + + k1 *= c1; + k1 = ROTL64(k1, 31); + k1 *= c2; + h1 ^= k1; + + h1 = ROTL64(h1, 27); + h1 += h2; + h1 = h1 * 5 + 0x52dce729; + + k2 *= c2; + k2 = ROTL64(k2, 33); + k2 *= c1; + h2 ^= k2; + + h2 = ROTL64(h2, 31); + h2 += h1; + h2 = h2 * 5 + 0x38495ab5; + } + + /* tail */ + + { + const uint8_t *tail = (const uint8_t *)(data + nblocks * 16); + + uint64_t k1 = 0; + uint64_t k2 = 0; + + switch (len & 15) { + case 15: + k2 ^= ((uint64_t)tail[14]) << 48; + fallthrough; + case 14: + k2 ^= ((uint64_t)tail[13]) << 40; + fallthrough; + case 13: + k2 ^= ((uint64_t)tail[12]) << 32; + fallthrough; + case 12: + k2 ^= ((uint64_t)tail[11]) << 24; + fallthrough; + case 11: + k2 ^= ((uint64_t)tail[10]) << 16; + fallthrough; + case 10: + k2 ^= ((uint64_t)tail[9]) << 8; + fallthrough; + case 9: + k2 ^= ((uint64_t)tail[8]) << 0; + k2 *= c2; + k2 = ROTL64(k2, 33); + k2 *= c1; + h2 ^= k2; + fallthrough; + + case 8: + k1 ^= ((uint64_t)tail[7]) << 56; + fallthrough; + case 7: + k1 ^= ((uint64_t)tail[6]) << 48; + fallthrough; + case 6: + k1 ^= ((uint64_t)tail[5]) << 40; + fallthrough; + case 5: + k1 ^= ((uint64_t)tail[4]) << 32; + fallthrough; + case 4: + k1 ^= ((uint64_t)tail[3]) << 24; + 
fallthrough; + case 3: + k1 ^= ((uint64_t)tail[2]) << 16; + fallthrough; + case 2: + k1 ^= ((uint64_t)tail[1]) << 8; + fallthrough; + case 1: + k1 ^= ((uint64_t)tail[0]) << 0; + k1 *= c1; + k1 = ROTL64(k1, 31); + k1 *= c2; + h1 ^= k1; + break; + default: + break; + }; + } + /* finalization */ + + h1 ^= len; + h2 ^= len; + + h1 += h2; + h2 += h1; + + h1 = fmix64(h1); + h2 = fmix64(h2); + + h1 += h2; + h2 += h1; + + putblock64((uint64_t *)out, 0, h1); + putblock64((uint64_t *)out, 1, h2); +} + +EXPORT_SYMBOL(murmurhash3_128); diff --git a/vdo/num-utils.h b/vdo/num-utils.h new file mode 100644 index 00000000..db8fdfc4 --- /dev/null +++ b/vdo/num-utils.h @@ -0,0 +1,18 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright Red Hat + * + * THIS FILE IS A CANDIDATE FOR THE EVENTUAL UTILITY LIBRARY. + */ + +#ifndef NUM_UTILS_H +#define NUM_UTILS_H + +#include "numeric.h" + +#include "types.h" + +#include +#include + +#endif /* NUM_UTILS_H */ diff --git a/vdo/numUtils.h b/vdo/numUtils.h deleted file mode 100644 index 74d793fe..00000000 --- a/vdo/numUtils.h +++ /dev/null @@ -1,116 +0,0 @@ -/* - * Copyright Red Hat - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/sulfur/src/c++/vdo/base/numUtils.h#8 $ - * - * THIS FILE IS A CANDIDATE FOR THE EVENTUAL UTILITY LIBRARY. - */ - -#ifndef NUM_UTILS_H -#define NUM_UTILS_H - -#include "common.h" -#include "numeric.h" - -#include "types.h" - -#include - -/** - * Efficiently calculate the base-2 logarithm of a number truncated to an - * integer value. - * - * This also happens to be the bit index of the highest-order non-zero bit in - * the binary representation of the number, which can easily be used to - * calculate the bit shift corresponding to a bit mask or an array capacity, - * or to calculate the binary floor or ceiling (next lowest or highest power - * of two). - * - * @param n The input value - * - * @return the integer log2 of the value, or -1 if the value is zero - **/ -static inline int log_base_two(uint64_t n) -{ - if (n == 0) { - return -1; - } - // Many CPUs, including x86, directly support this calculation, so use - // the GCC function for counting the number of leading high-order zero - // bits. - return 63 - __builtin_clzll(n); -} - -/** - * Round upward towards the nearest multiple of quantum. - * - * @param number a number - * @param quantum the quantum - * - * @return the least multiple of quantum not less than number - **/ -static inline size_t __must_check -round_up_to_multiple_size_t(size_t number, size_t quantum) -{ - return number + quantum - 1 - ((number + quantum - 1) % quantum); -} - -/** - * Check whether the given value is between the lower and upper bounds, - * within a cyclic range of values from 0 to (modulus - 1). The value - * and both bounds must be smaller than the modulus. 
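/*
 * Usage sketch for the murmurhash3_128() helper added above. Illustrative
 * only: the 4096-byte length matches VDO's block size, but the seed value
 * and the wrapper function are assumptions made for this example, not
 * taken from the patch.
 */
#include <stdint.h>

void murmurhash3_128(const void *key, const int len, const uint32_t seed,
		     void *out);

/* Hash one 4 KB data block into a 128-bit (two 64-bit word) name. */
static void example_block_name(const uint8_t block[4096], uint64_t name[2])
{
	murmurhash3_128(block, 4096, 0, name);
}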
- * - * @param lower The lowest value to accept - * @param value The value to check - * @param upper The highest value to accept - * @param modulus The size of the cyclic space, no more than 2^15 - * - * @return true if the value is in range - **/ -static inline bool in_cyclic_range(uint16_t lower, uint16_t value, - uint16_t upper, uint16_t modulus) -{ - if (value < lower) { - value += modulus; - } - if (upper < lower) { - upper += modulus; - } - return (value <= upper); -} - -/** - * Compute the number of buckets of a given size which are required to hold a - * given number of objects. - * - * @param object_count The number of objects to hold - * @param bucket_size The size of a bucket - * - * @return The number of buckets required - **/ -static inline uint64_t compute_bucket_count(uint64_t object_count, - uint64_t bucket_size) -{ - uint64_t quotient = object_count / bucket_size; - if ((object_count % bucket_size) > 0) { - ++quotient; - } - return quotient; -} - -#endif // NUM_UTILS_H diff --git a/uds/numeric.h b/vdo/numeric.h similarity index 88% rename from uds/numeric.h rename to vdo/numeric.h index 8f1595b9..c4bdaa51 100644 --- a/uds/numeric.h +++ b/vdo/numeric.h @@ -1,22 +1,6 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ /* * Copyright Red Hat - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/uds-releases/krusty/src/uds/numeric.h#13 $ */ #ifndef NUMERIC_H diff --git a/vdo/open-chapter.c b/vdo/open-chapter.c new file mode 100644 index 00000000..bba6311b --- /dev/null +++ b/vdo/open-chapter.c @@ -0,0 +1,560 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright Red Hat + */ + +#include "open-chapter.h" + +#include "compiler.h" +#include "config.h" +#include "hash-utils.h" +#include "logger.h" +#include "memory-alloc.h" +#include "numeric.h" +#include "permassert.h" + +/* + * The open chapter tracks the newest records in memory. Although it is + * notionally a single collection, each index zone has a dedicated open chapter + * zone structure and an equal share of the available record space. Records are + * assigned to zones based on their chunk name. + * + * Within each zone, records are stored in an array in the order they arrive. + * Additionally, a reference to each record is stored in a hash table to help + * determine if a new record duplicates an existing one. If new metadata for an + * existing name arrives, the record is altered in place. The array of records + * is 1-based so that record number 0 can be used to indicate an unused hash + * slot. + * + * Deleted records are marked with a flag rather than actually removed to + * simplify hash table management. The array of deleted flags overlays the + * array of hash slots, but the flags are indexed by record number instead of + * by chunk name. 
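/*
 * Simplified sketch (not part of the patch) of the indirection just
 * described: hash slots hold 1-based record numbers, so record number 0
 * doubles as "empty slot", and the deleted flag is kept in the slot entry
 * indexed by record number rather than by name. Collision probing is
 * omitted here; see probe_chapter_slots() below for the real lookup.
 */
#include <stdbool.h>
#include <stdint.h>
#include <string.h>

struct demo_slot {
	unsigned int record_number;	/* 0 means the slot is unused */
	bool record_deleted;		/* indexed by record number */
};

struct demo_record {
	uint8_t name[16];
	uint8_t data[16];
};

/* Follow one hash slot to its record, honoring the overlaid flags. */
static struct demo_record *demo_lookup(const struct demo_slot *slots,
				       struct demo_record *records, /* 1-based */
				       unsigned int slot,
				       const uint8_t name[16])
{
	unsigned int number = slots[slot].record_number;

	if (number == 0 || slots[number].record_deleted)
		return NULL;
	if (memcmp(records[number].name, name, sizeof(records[number].name)) != 0)
		return NULL;
	return &records[number];
}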
The number of hash slots will always be a power of two that + * is greater than the number of records to be indexed, guaranteeing that hash + * insertion cannot fail, and that there are sufficient flags for all records. + * + * Once any open chapter zone fills its available space, the chapter is + * closed. The records from each zone are interleaved to attempt to preserve + * temporal locality and assigned to record pages. Empty or deleted records + * are replaced by copies of a valid record so that the record pages only + * contain valid records. The chapter then constructs a delta index which maps + * each chunk name to the record page on which that record can be found, which + * is split into index pages. These structures are then passed to the volume to + * be recorded on storage. + * + * When the index is saved, the open chapter records are saved in a single + * array, once again interleaved to attempt to preserve temporal locality. When + * the index is reloaded, there may be a different number of zones than + * previously, so the records must be parcelled out to their new zones. In + * addition, depending on the distribution of chunk names, a new zone may have + * more records than it has space. In this case, the latest records for that + * zone will be discarded. + */ + +static const byte OPEN_CHAPTER_MAGIC[] = "ALBOC"; +static const byte OPEN_CHAPTER_VERSION[] = "02.00"; + +enum { + OPEN_CHAPTER_MAGIC_LENGTH = sizeof(OPEN_CHAPTER_MAGIC) - 1, + OPEN_CHAPTER_VERSION_LENGTH = sizeof(OPEN_CHAPTER_VERSION) - 1 +}; + +static INLINE size_t records_size(const struct open_chapter_zone *open_chapter) +{ + return (sizeof(struct uds_chunk_record) * + (1 + open_chapter->capacity)); +} + +static INLINE size_t slots_size(size_t slot_count) +{ + return (sizeof(struct open_chapter_zone_slot) * slot_count); +} + +static INLINE size_t next_power_of_two(size_t val) +{ + if (val == 0) { + return 1; + } + return (1 << compute_bits(val - 1)); +} + +int make_open_chapter(const struct geometry *geometry, + unsigned int zone_count, + struct open_chapter_zone **open_chapter_ptr) +{ + struct open_chapter_zone *open_chapter; + size_t capacity, slot_count; + int result = ASSERT(zone_count > 0, "zone count must be > 0"); + + if (result != UDS_SUCCESS) { + return result; + } + result = + ASSERT_WITH_ERROR_CODE(geometry->open_chapter_load_ratio > 1, + UDS_BAD_STATE, + "Open chapter hash table is too small"); + if (result != UDS_SUCCESS) { + return result; + } + result = ASSERT_WITH_ERROR_CODE((geometry->records_per_chapter <= + OPEN_CHAPTER_MAX_RECORD_NUMBER), + UDS_BAD_STATE, + "Too many records (%u) for a single chapter", + geometry->records_per_chapter); + if (result != UDS_SUCCESS) { + return result; + } + + if (geometry->records_per_chapter < zone_count) { + return uds_log_error_strerror(UDS_INVALID_ARGUMENT, + "zone count: %u is larger than the records per chapter %u", + zone_count, + geometry->records_per_chapter); + } + capacity = geometry->records_per_chapter / zone_count; + + slot_count = next_power_of_two(capacity * + geometry->open_chapter_load_ratio); + result = UDS_ALLOCATE_EXTENDED(struct open_chapter_zone, + slot_count, + struct open_chapter_zone_slot, + "open chapter", + &open_chapter); + if (result != UDS_SUCCESS) { + return result; + } + open_chapter->slot_count = slot_count; + open_chapter->capacity = capacity; + result = uds_allocate_cache_aligned(records_size(open_chapter), + "record pages", + &open_chapter->records); + if (result != UDS_SUCCESS) { + free_open_chapter(open_chapter); + return 
result; + } + + *open_chapter_ptr = open_chapter; + return UDS_SUCCESS; +} + +/* Compute the number of valid records. */ +size_t open_chapter_size(const struct open_chapter_zone *open_chapter) +{ + return open_chapter->size - open_chapter->deleted; +} + +void reset_open_chapter(struct open_chapter_zone *open_chapter) +{ + open_chapter->size = 0; + open_chapter->deleted = 0; + + memset(open_chapter->records, 0, records_size(open_chapter)); + memset(open_chapter->slots, 0, slots_size(open_chapter->slot_count)); +} + +static struct uds_chunk_record * +probe_chapter_slots(struct open_chapter_zone *open_chapter, + const struct uds_chunk_name *name, + unsigned int *slot_ptr, + unsigned int *record_number_ptr) +{ + unsigned int slots = open_chapter->slot_count; + unsigned int probe = name_to_hash_slot(name, slots); + unsigned int first_slot = 0; + + struct uds_chunk_record *record; + unsigned int probe_slot; + unsigned int record_number; + unsigned int probe_attempts; + + for (probe_attempts = 1;; ++probe_attempts) { + probe_slot = first_slot + probe; + record_number = open_chapter->slots[probe_slot].record_number; + + /* + * If the hash slot is empty, we've reached the end of a chain + * without finding the record and should terminate the search. + */ + if (record_number == 0) { + record = NULL; + break; + } + + /* + * If the name of the record referenced by the slot matches and + * has not been deleted, then we've found the requested name. + */ + record = &open_chapter->records[record_number]; + if ((memcmp(&record->name, name, UDS_CHUNK_NAME_SIZE) == 0) && + !open_chapter->slots[record_number].record_deleted) { + break; + } + + /* + * Quadratic probing: advance the probe by 1, 2, 3, etc. and + * try again. This performs better than linear probing and + * works best for 2^N slots. + */ + probe += probe_attempts; + if (probe >= slots) { + probe = probe % slots; + } + } + + /* + * These NULL checks will be optimized away in callers who don't care + * about the values when this function is inlined. + */ + if (slot_ptr != NULL) { + *slot_ptr = probe_slot; + } + if (record_number_ptr != NULL) { + *record_number_ptr = record_number; + } + + return record; +} + +void search_open_chapter(struct open_chapter_zone *open_chapter, + const struct uds_chunk_name *name, + struct uds_chunk_data *metadata, + bool *found) +{ + struct uds_chunk_record *record = + probe_chapter_slots(open_chapter, name, NULL, NULL); + + if (record == NULL) { + *found = false; + } else { + *found = true; + if (metadata != NULL) { + *metadata = record->data; + } + } +} + +/* Add a record to the open chapter zone and return the remaining space. 
*/ +int put_open_chapter(struct open_chapter_zone *open_chapter, + const struct uds_chunk_name *name, + const struct uds_chunk_data *metadata, + unsigned int *remaining) +{ + unsigned int slot, record_number; + struct uds_chunk_record *record = + probe_chapter_slots(open_chapter, name, &slot, NULL); + + if (record != NULL) { + record->data = *metadata; + *remaining = open_chapter->capacity - open_chapter->size; + return UDS_SUCCESS; + } + + if (open_chapter->size >= open_chapter->capacity) { + return UDS_VOLUME_OVERFLOW; + } + + record_number = ++open_chapter->size; + open_chapter->slots[slot].record_number = record_number; + record = &open_chapter->records[record_number]; + record->name = *name; + record->data = *metadata; + + *remaining = open_chapter->capacity - open_chapter->size; + return UDS_SUCCESS; +} + +void remove_from_open_chapter(struct open_chapter_zone *open_chapter, + const struct uds_chunk_name *name) +{ + unsigned int record_number; + struct uds_chunk_record *record = + probe_chapter_slots(open_chapter, name, NULL, &record_number); + + if (record == NULL) { + return; + } + + /* + * Set the deleted flag on the record_number in the slot array so + * search won't find it and close won't index it. + */ + open_chapter->slots[record_number].record_deleted = true; + open_chapter->deleted += 1; +} + +void free_open_chapter(struct open_chapter_zone *open_chapter) +{ + if (open_chapter != NULL) { + UDS_FREE(open_chapter->records); + UDS_FREE(open_chapter); + } +} + +/* Map each record name to its record page number in the delta chapter index. */ +static int fill_delta_chapter_index(struct open_chapter_zone **chapter_zones, + unsigned int zone_count, + struct open_chapter_index *index, + struct uds_chunk_record *collated_records) +{ + /* + * The record pages should not have any empty space, so find a record + * with which to fill the chapter zone if it was closed early, and also + * to replace any deleted records. The last record in any filled zone + * is guaranteed to not have been deleted, so use one of those. + */ + struct open_chapter_zone *fill_chapter_zone = NULL; + struct uds_chunk_record *fill_record = NULL; + unsigned int z, pages_per_chapter, records_per_page, page; + unsigned int records_added = 0, zone = 0; + int result, overflow_count = 0; + const struct geometry *geometry; + + for (z = 0; z < zone_count; ++z) { + fill_chapter_zone = chapter_zones[z]; + if (fill_chapter_zone->size == fill_chapter_zone->capacity) { + fill_record = + &fill_chapter_zone + ->records[fill_chapter_zone->size]; + break; + } + } + result = + ASSERT((fill_record != NULL), "some open chapter zone filled"); + if (result != UDS_SUCCESS) { + return result; + } + result = ASSERT(!fill_chapter_zone->slots[fill_chapter_zone->size] + .record_deleted, + "chapter fill record not deleted"); + if (result != UDS_SUCCESS) { + return result; + } + + geometry = index->geometry; + pages_per_chapter = geometry->record_pages_per_chapter; + records_per_page = geometry->records_per_page; + + for (page = 0; page < pages_per_chapter; page++) { + unsigned int i; + + for (i = 0; i < records_per_page; + i++, records_added++, zone = (zone + 1) % zone_count) { + struct uds_chunk_record *next_record; + /* The record arrays are 1-based. */ + unsigned int record_number = + 1 + (records_added / zone_count); + + /* Use the fill record in place of an unused record. 
*/ + if (record_number > chapter_zones[zone]->size || + chapter_zones[zone] + ->slots[record_number] + .record_deleted) { + collated_records[1 + records_added] = + *fill_record; + continue; + } + + next_record = + &chapter_zones[zone]->records[record_number]; + collated_records[1 + records_added] = *next_record; + + result = put_open_chapter_index_record(index, + &next_record->name, + page); + switch (result) { + case UDS_SUCCESS: + break; + case UDS_OVERFLOW: + overflow_count++; + break; + default: + uds_log_error_strerror(result, + "failed to build open chapter index"); + return result; + } + } + } + if (overflow_count > 0) { + uds_log_warning("Failed to add %d entries to chapter index", + overflow_count); + } + return UDS_SUCCESS; +} + +int close_open_chapter(struct open_chapter_zone **chapter_zones, + unsigned int zone_count, + struct volume *volume, + struct open_chapter_index *chapter_index, + struct uds_chunk_record *collated_records, + uint64_t virtual_chapter_number) +{ + int result; + + empty_open_chapter_index(chapter_index, virtual_chapter_number); + + result = fill_delta_chapter_index(chapter_zones, zone_count, + chapter_index, collated_records); + if (result != UDS_SUCCESS) { + return result; + } + + return write_chapter(volume, chapter_index, collated_records); +} + +int save_open_chapters(struct uds_index *index, struct buffered_writer *writer) +{ + uint32_t total_records = 0, records_added = 0; + unsigned int i, record_index; + byte total_record_data[sizeof(total_records)]; + int result = write_to_buffered_writer(writer, OPEN_CHAPTER_MAGIC, + OPEN_CHAPTER_MAGIC_LENGTH); + if (result != UDS_SUCCESS) { + return result; + } + + result = write_to_buffered_writer(writer, OPEN_CHAPTER_VERSION, + OPEN_CHAPTER_VERSION_LENGTH); + if (result != UDS_SUCCESS) { + return result; + } + + for (i = 0; i < index->zone_count; i++) { + total_records += + open_chapter_size(index->zones[i]->open_chapter); + } + + put_unaligned_le32(total_records, total_record_data); + + result = write_to_buffered_writer(writer, total_record_data, + sizeof(total_record_data)); + if (result != UDS_SUCCESS) { + return result; + } + + record_index = 1; + while (records_added < total_records) { + unsigned int i; + + for (i = 0; i < index->zone_count; i++) { + struct open_chapter_zone *open_chapter = + index->zones[i]->open_chapter; + struct uds_chunk_record *record; + + if (record_index > open_chapter->size) { + continue; + } + if (open_chapter->slots[record_index].record_deleted) { + continue; + } + record = &open_chapter->records[record_index]; + result = write_to_buffered_writer(writer, + record, + sizeof(struct uds_chunk_record)); + if (result != UDS_SUCCESS) { + return result; + } + records_added++; + } + record_index++; + } + + return flush_buffered_writer(writer); +} + +uint64_t compute_saved_open_chapter_size(struct geometry *geometry) +{ + return OPEN_CHAPTER_MAGIC_LENGTH + OPEN_CHAPTER_VERSION_LENGTH + + sizeof(uint32_t) + + geometry->records_per_chapter * sizeof(struct uds_chunk_record); +} + +static int read_version(struct buffered_reader *reader, const byte **version) +{ + byte buffer[OPEN_CHAPTER_VERSION_LENGTH]; + int result = read_from_buffered_reader(reader, buffer, sizeof(buffer)); + + if (result != UDS_SUCCESS) { + return result; + } + if (memcmp(OPEN_CHAPTER_VERSION, buffer, sizeof(buffer)) != 0) { + return uds_log_error_strerror(UDS_CORRUPT_DATA, + "Invalid open chapter version: %.*s", + (int) sizeof(buffer), + buffer); + } + *version = OPEN_CHAPTER_VERSION; + return UDS_SUCCESS; +} + +static 
int load_version20(struct uds_index *index, + struct buffered_reader *reader) +{ + uint32_t num_records, records; + byte num_records_data[sizeof(uint32_t)]; + struct uds_chunk_record record; + + /* + * Track which zones cannot accept any more records. If the open + * chapter had a different number of zones previously, some new zones + * may have more records than they have space for. These overflow + * records will be discarded. + */ + bool full_flags[MAX_ZONES] = { + false, + }; + + int result = read_from_buffered_reader(reader, &num_records_data, + sizeof(num_records_data)); + if (result != UDS_SUCCESS) { + return result; + } + num_records = get_unaligned_le32(num_records_data); + + for (records = 0; records < num_records; records++) { + unsigned int zone = 0; + + result = read_from_buffered_reader(reader, &record, + sizeof(struct uds_chunk_record)); + if (result != UDS_SUCCESS) { + return result; + } + + if (index->zone_count > 1) { + zone = get_volume_index_zone(index->volume_index, + &record.name); + } + + if (!full_flags[zone]) { + unsigned int remaining; + + result = put_open_chapter(index->zones[zone]->open_chapter, + &record.name, + &record.data, + &remaining); + /* Do not allow any zone to fill completely. */ + full_flags[zone] = (remaining <= 1); + if (result != UDS_SUCCESS) { + return result; + } + } + } + + return UDS_SUCCESS; +} + +int load_open_chapters(struct uds_index *index, struct buffered_reader *reader) +{ + const byte *version = NULL; + int result = verify_buffered_data(reader, OPEN_CHAPTER_MAGIC, + OPEN_CHAPTER_MAGIC_LENGTH); + if (result != UDS_SUCCESS) { + return result; + } + + result = read_version(reader, &version); + if (result != UDS_SUCCESS) { + return result; + } + + return load_version20(index, reader); +} diff --git a/vdo/open-chapter.h b/vdo/open-chapter.h new file mode 100644 index 00000000..b1239b99 --- /dev/null +++ b/vdo/open-chapter.h @@ -0,0 +1,83 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright Red Hat + */ + +#ifndef OPENCHAPTER_H +#define OPENCHAPTER_H 1 + +#include "chapter-index.h" +#include "common.h" +#include "geometry.h" +#include "index.h" +#include "volume.h" + +enum { + OPEN_CHAPTER_RECORD_NUMBER_BITS = 23, + OPEN_CHAPTER_MAX_RECORD_NUMBER = + (1 << OPEN_CHAPTER_RECORD_NUMBER_BITS) - 1 +}; + +struct open_chapter_zone_slot { + /* If non-zero, the record number addressed by this hash slot */ + unsigned int record_number : OPEN_CHAPTER_RECORD_NUMBER_BITS; + /* If true, the record at the index of this hash slot was deleted */ + bool record_deleted : 1; +} __packed; + +struct open_chapter_zone { + /* The maximum number of records that can be stored */ + unsigned int capacity; + /* The number of records stored */ + unsigned int size; + /* The number of deleted records */ + unsigned int deleted; + /* Array of chunk records, 1-based */ + struct uds_chunk_record *records; + /* The number of slots in the hash table */ + unsigned int slot_count; + /* The hash table slots, referencing virtual record numbers */ + struct open_chapter_zone_slot slots[]; +}; + +int __must_check +make_open_chapter(const struct geometry *geometry, + unsigned int zone_count, + struct open_chapter_zone **open_chapter_ptr); + +size_t __must_check +open_chapter_size(const struct open_chapter_zone *open_chapter); + +void reset_open_chapter(struct open_chapter_zone *open_chapter); + +void search_open_chapter(struct open_chapter_zone *open_chapter, + const struct uds_chunk_name *name, + struct uds_chunk_data *metadata, + bool *found); + +int __must_check 
put_open_chapter(struct open_chapter_zone *open_chapter, + const struct uds_chunk_name *name, + const struct uds_chunk_data *metadata, + unsigned int *remaining); + +void remove_from_open_chapter(struct open_chapter_zone *open_chapter, + const struct uds_chunk_name *name); + +void free_open_chapter(struct open_chapter_zone *open_chapter); + +int __must_check close_open_chapter(struct open_chapter_zone **chapter_zones, + unsigned int zone_count, + struct volume *volume, + struct open_chapter_index *chapter_index, + struct uds_chunk_record *collated_records, + uint64_t virtual_chapter_number); + +int __must_check save_open_chapters(struct uds_index *index, + struct buffered_writer *writer); + +int __must_check load_open_chapters(struct uds_index *index, + struct buffered_reader *reader); + +uint64_t compute_saved_open_chapter_size(struct geometry *geometry); + +#endif /* OPENCHAPTER_H */ diff --git a/vdo/packedRecoveryJournalBlock.h b/vdo/packed-recovery-journal-block.h similarity index 50% rename from vdo/packedRecoveryJournalBlock.h rename to vdo/packed-recovery-journal-block.h index 35f092a4..a29f3e3b 100644 --- a/vdo/packedRecoveryJournalBlock.h +++ b/vdo/packed-recovery-journal-block.h @@ -1,22 +1,6 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ /* * Copyright Red Hat - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/sulfur/src/c++/vdo/base/packedRecoveryJournalBlock.h#7 $ */ #ifndef PACKED_RECOVERY_JOURNAL_BLOCK_H @@ -25,107 +9,109 @@ #include "numeric.h" #include "constants.h" -#include "recoveryJournalEntry.h" +#include "recovery-journal-entry.h" #include "types.h" struct recovery_block_header { - sequence_number_t block_map_head; // Block map head sequence number - sequence_number_t slab_journal_head; // Slab journal head seq. number - sequence_number_t sequence_number; // Sequence number for this block - nonce_t nonce; // A given VDO instance's nonce - block_count_t logical_blocks_used; // Logical blocks in use - block_count_t block_map_data_blocks; // Allocated block map pages - journal_entry_count_t entry_count; // Number of entries written - uint8_t check_byte; // The protection check byte - uint8_t recovery_count; // Number of recoveries completed - enum vdo_metadata_type metadata_type; // Metadata type + sequence_number_t block_map_head; /* Block map head sequence number */ + sequence_number_t slab_journal_head; /* Slab journal head seq. 
number */ + sequence_number_t sequence_number; /* Sequence number for this block */ + nonce_t nonce; /* A given VDO instance's nonce */ + block_count_t logical_blocks_used; /* Logical blocks in use */ + block_count_t block_map_data_blocks; /* Allocated block map pages */ + journal_entry_count_t entry_count; /* Number of entries written */ + uint8_t check_byte; /* The protection check byte */ + uint8_t recovery_count; /* Number of recoveries completed */ + enum vdo_metadata_type metadata_type; /* Metadata type */ }; -/** +/* * The packed, on-disk representation of a recovery journal block header. * All fields are kept in little-endian byte order. - **/ + */ struct packed_journal_header { - /** Block map head 64-bit sequence number */ + /* Block map head 64-bit sequence number */ __le64 block_map_head; - /** Slab journal head 64-bit sequence number */ + /* Slab journal head 64-bit sequence number */ __le64 slab_journal_head; - /** The 64-bit sequence number for this block */ + /* The 64-bit sequence number for this block */ __le64 sequence_number; - /** A given VDO instance's 64-bit nonce */ + /* A given VDO instance's 64-bit nonce */ __le64 nonce; - /** + /* * 8-bit metadata type (should always be one for the recovery * journal) */ uint8_t metadata_type; - /** 16-bit count of the entries encoded in the block */ + /* 16-bit count of the entries encoded in the block */ __le16 entry_count; - /** + /* * 64-bit count of the logical blocks used when this block was * opened */ __le64 logical_blocks_used; - /** + /* * 64-bit count of the block map blocks used when this block * was opened */ __le64 block_map_data_blocks; - /** The protection check byte */ + /* The protection check byte */ uint8_t check_byte; - /** The number of recoveries completed */ + /* The number of recoveries completed */ uint8_t recovery_count; } __packed; struct packed_journal_sector { - /** The protection check byte */ + /* The protection check byte */ uint8_t check_byte; - /** The number of recoveries completed */ + /* The number of recoveries completed */ uint8_t recovery_count; - /** The number of entries in this sector */ + /* The number of entries in this sector */ uint8_t entry_count; - /** Journal entries for this sector */ + /* Journal entries for this sector */ struct packed_recovery_journal_entry entries[]; } __packed; enum { - // Allowing more than 311 entries in each block changes the math - // concerning the amortization of metadata writes and recovery speed. + /* + * Allowing more than 311 entries in each block changes the math + * concerning the amortization of metadata writes and recovery speed. + */ RECOVERY_JOURNAL_ENTRIES_PER_BLOCK = 311, - /** + /* * The number of entries in each sector (except the last) when filled */ RECOVERY_JOURNAL_ENTRIES_PER_SECTOR = ((VDO_SECTOR_SIZE - sizeof(struct packed_journal_sector)) / sizeof(struct packed_recovery_journal_entry)), - /** The number of entries in the last sector when a block is full */ + /* The number of entries in the last sector when a block is full */ RECOVERY_JOURNAL_ENTRIES_PER_LAST_SECTOR = (RECOVERY_JOURNAL_ENTRIES_PER_BLOCK % RECOVERY_JOURNAL_ENTRIES_PER_SECTOR), }; /** - * Find the recovery journal sector from the block header and sector number. + * vdo_get_journal_block_sector() - Find the recovery journal sector from the + * block header and sector number. + * @header: The header of the recovery journal block. + * @sector_number: The index of the sector (1-based). 
* - * @param header The header of the recovery journal block - * @param sector_number The index of the sector (1-based) - * - * @return A packed recovery journal sector - **/ + * Return: A packed recovery journal sector. + */ static inline struct packed_journal_sector * __must_check -get_vdo_journal_block_sector(struct packed_journal_header *header, +vdo_get_journal_block_sector(struct packed_journal_header *header, int sector_number) { char *sector_data = @@ -134,13 +120,13 @@ get_vdo_journal_block_sector(struct packed_journal_header *header, } /** - * Generate the packed representation of a recovery block header. - * - * @param header The header containing the values to encode - * @param packed The header into which to pack the values - **/ + * vdo_pack_recovery_block_header() - Generate the packed representation of a + * recovery block header. + * @header: The header containing the values to encode. + * @packed: The header into which to pack the values. + */ static inline void -pack_vdo_recovery_block_header(const struct recovery_block_header *header, +vdo_pack_recovery_block_header(const struct recovery_block_header *header, struct packed_journal_header *packed) { *packed = (struct packed_journal_header) { @@ -160,13 +146,13 @@ pack_vdo_recovery_block_header(const struct recovery_block_header *header, } /** - * Decode the packed representation of a recovery block header. - * - * @param packed The packed header to decode - * @param header The header into which to unpack the values - **/ + * vdo_unpack_recovery_block_header() - Decode the packed representation of a + * recovery block header. + * @packed: The packed header to decode. + * @header: The header into which to unpack the values. + */ static inline void -unpack_vdo_recovery_block_header(const struct packed_journal_header *packed, +vdo_unpack_recovery_block_header(const struct packed_journal_header *packed, struct recovery_block_header *header) { *header = (struct recovery_block_header) { @@ -185,4 +171,4 @@ unpack_vdo_recovery_block_header(const struct packed_journal_header *packed, }; } -#endif // PACKED_RECOVERY_JOURNAL_BLOCK_H +#endif /* PACKED_RECOVERY_JOURNAL_BLOCK_H */ diff --git a/vdo/packed-reference-block.h b/vdo/packed-reference-block.h new file mode 100644 index 00000000..0ea5c772 --- /dev/null +++ b/vdo/packed-reference-block.h @@ -0,0 +1,46 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright Red Hat + */ + +#ifndef PACKED_REFERENCE_BLOCK_H +#define PACKED_REFERENCE_BLOCK_H + +#include "constants.h" +#include "journal-point.h" +#include "types.h" + +/* + * A type representing a reference count of a block. + */ +typedef uint8_t vdo_refcount_t; + +/* + * Special vdo_refcount_t values. + */ +#define EMPTY_REFERENCE_COUNT 0 +enum { + MAXIMUM_REFERENCE_COUNT = 254, + PROVISIONAL_REFERENCE_COUNT = 255, +}; + +enum { + COUNTS_PER_SECTOR = + ((VDO_SECTOR_SIZE - sizeof(struct packed_journal_point)) + / sizeof(vdo_refcount_t)), + COUNTS_PER_BLOCK = COUNTS_PER_SECTOR * VDO_SECTORS_PER_BLOCK, +}; + +/* + * The format of each sector of a reference_block on disk. 
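/*
 * Worked sizing example (assumptions: VDO_SECTOR_SIZE is 512 bytes, struct
 * packed_journal_point packs to 8 bytes, and each count is one byte).
 * Under those assumptions the definitions above work out to:
 *
 *   COUNTS_PER_SECTOR = (512 - 8) / 1 = 504
 *   COUNTS_PER_BLOCK  = 504 * 8      = 4032
 *
 * so one 4096-byte packed_reference_block covers 4032 data blocks.
 */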
+ */ +struct packed_reference_sector { + struct packed_journal_point commit_point; + vdo_refcount_t counts[COUNTS_PER_SECTOR]; +} __packed; + +struct packed_reference_block { + struct packed_reference_sector sectors[VDO_SECTORS_PER_BLOCK]; +}; + +#endif /* PACKED_REFERENCE_BLOCK_H */ diff --git a/vdo/packedReferenceBlock.h b/vdo/packedReferenceBlock.h deleted file mode 100644 index c4a8436c..00000000 --- a/vdo/packedReferenceBlock.h +++ /dev/null @@ -1,62 +0,0 @@ -/* - * Copyright Red Hat - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/sulfur/src/c++/vdo/base/packedReferenceBlock.h#5 $ - */ - -#ifndef PACKED_REFERENCE_BLOCK_H -#define PACKED_REFERENCE_BLOCK_H - -#include "constants.h" -#include "journalPoint.h" -#include "types.h" - -/** - * A type representing a reference count of a block. - **/ -typedef uint8_t vdo_refcount_t; - -/** - * Special vdo_refcount_t values. - **/ -enum { - EMPTY_REFERENCE_COUNT = 0, - MAXIMUM_REFERENCE_COUNT = 254, - PROVISIONAL_REFERENCE_COUNT = 255, -}; - -enum { - COUNTS_PER_SECTOR = - ((VDO_SECTOR_SIZE - sizeof(struct packed_journal_point)) - / sizeof(vdo_refcount_t)), - COUNTS_PER_BLOCK = COUNTS_PER_SECTOR * VDO_SECTORS_PER_BLOCK, -}; - -/** - * The format of each sector of a reference_block on disk. - **/ -struct packed_reference_sector { - struct packed_journal_point commit_point; - vdo_refcount_t counts[COUNTS_PER_SECTOR]; -} __packed; - -struct packed_reference_block { - struct packed_reference_sector sectors[VDO_SECTORS_PER_BLOCK]; -}; - -#endif // PACKED_REFERENCE_BLOCK_H diff --git a/vdo/packer.c b/vdo/packer.c index 6b8f0334..8ce16639 100644 --- a/vdo/packer.c +++ b/vdo/packer.c @@ -1,46 +1,39 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * Copyright Red Hat - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. 
- * - * $Id: //eng/vdo-releases/sulfur/src/c++/vdo/base/packer.c#48 $ */ -#include "packerInternals.h" +#include "packer.h" + +#include #include "logger.h" -#include "memoryAlloc.h" +#include "memory-alloc.h" #include "permassert.h" - -#include "adminState.h" -#include "allocatingVIO.h" -#include "allocationSelector.h" -#include "compressionState.h" -#include "dataVIO.h" -#include "hashLock.h" -#include "pbnLock.h" +#include "string-utils.h" + +#include "admin-state.h" +#include "allocation-selector.h" +#include "completion.h" +#include "constants.h" +#include "compressed-block.h" +#include "compression-state.h" +#include "data-vio.h" +#include "hash-lock.h" +#include "io-submitter.h" +#include "pbn-lock.h" +#include "read-only-notifier.h" +#include "status-codes.h" +#include "thread-config.h" #include "vdo.h" -#include "vdoInternal.h" +#include "vio.h" +#include "vio-write.h" /** - * Check that we are on the packer thread. - * - * @param packer The packer - * @param caller The function which is asserting - **/ + * assert_on_packer_thread() - Check that we are on the packer thread. + * @packer: The packer. + * @caller: The function which is asserting. + */ static inline void assert_on_packer_thread(struct packer *packer, const char *caller) { @@ -49,64 +42,63 @@ static inline void assert_on_packer_thread(struct packer *packer, } /** - * This returns the next bin in the free_space-sorted list. - **/ -static struct input_bin * __must_check -next_vdo_packer_bin(const struct packer *packer, struct input_bin *bin) + * vdo_next_packer_bin() - Return the next bin in the free_space-sorted list. + */ +static struct packer_bin * __must_check +vdo_next_packer_bin(const struct packer *packer, struct packer_bin *bin) { - if (bin->list.next == &packer->input_bins) { + if (bin->list.next == &packer->bins) { return NULL; } else { - return list_entry(bin->list.next, struct input_bin, list); + return list_entry(bin->list.next, struct packer_bin, list); } } /** - * This returns the first bin in the free_space-sorted list. - **/ -static struct input_bin * __must_check -get_vdo_packer_fullest_bin(const struct packer *packer) + * vdo_get_packer_fullest_bin() - Return the first bin in the + * free_space-sorted list. + */ +static struct packer_bin * __must_check +vdo_get_packer_fullest_bin(const struct packer *packer) { - if (list_empty(&packer->input_bins)) { - return NULL; - } else { - return list_entry(packer->input_bins.next, - struct input_bin, list); - } + return (list_empty(&packer->bins) ? + NULL : + list_entry(packer->bins.next, struct packer_bin, list)); } /** - * Insert an input bin to the list, which is in ascending order of free space. - * Since all bins are already in the list, this actually moves the bin to the - * correct position in the list. + * insert_in_sorted_list() - Insert a bin to the list. + * @packer: The packer. + * @bin: The bin to move to its sorted position. * - * @param packer The packer - * @param bin The input bin to move to its sorted position - **/ -static void insert_in_sorted_list(struct packer *packer, struct input_bin *bin) + * The list is in ascending order of free space. Since all bins are already in + * the list, this actually moves the bin to the correct position in the list. 
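/*
 * Illustrative sketch (not from the patch) of why the bins are kept in
 * ascending order of free space: scanning the list for the first bin that
 * can still hold a fragment selects the fullest such bin, which tends to
 * fill compressed blocks as completely as possible. How the packer really
 * chooses a bin is not shown in this hunk; the helper below is only an
 * assumption-laden stand-in.
 */
#include <stddef.h>

struct demo_bin {
	size_t free_space;
};

/* bins[] is assumed to be sorted by ascending free space. */
static struct demo_bin *demo_select_bin(struct demo_bin *bins, int count,
					size_t fragment_size)
{
	int i;

	for (i = 0; i < count; i++) {
		if (bins[i].free_space >= fragment_size)
			return &bins[i];
	}
	return NULL;	/* no partial bin has room for this fragment */
}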
+ */ +static void insert_in_sorted_list(struct packer *packer, + struct packer_bin *bin) { - struct input_bin *active_bin; - for (active_bin = get_vdo_packer_fullest_bin(packer); + struct packer_bin *active_bin; + + for (active_bin = vdo_get_packer_fullest_bin(packer); active_bin != NULL; - active_bin = next_vdo_packer_bin(packer, active_bin)) { + active_bin = vdo_next_packer_bin(packer, active_bin)) { if (active_bin->free_space > bin->free_space) { list_move_tail(&bin->list, &active_bin->list); return; } } - list_move_tail(&bin->list, &packer->input_bins); + list_move_tail(&bin->list, &packer->bins); } /** - * Allocate an input bin and put it into the packer's list. - * - * @param packer The packer - **/ -static int __must_check make_input_bin(struct packer *packer) + * make_bin() - Allocate a bin and put it into the packer's list. + * @packer: The packer. + */ +static int __must_check make_bin(struct packer *packer) { - struct input_bin *bin; - int result = UDS_ALLOCATE_EXTENDED(struct input_bin, + struct packer_bin *bin; + int result = UDS_ALLOCATE_EXTENDED(struct packer_bin, VDO_MAX_COMPRESSION_SLOTS, struct vio *, __func__, &bin); if (result != VDO_SUCCESS) { @@ -115,140 +107,46 @@ static int __must_check make_input_bin(struct packer *packer) bin->free_space = packer->bin_data_size; INIT_LIST_HEAD(&bin->list); - list_add_tail(&bin->list, &packer->input_bins); + list_add_tail(&bin->list, &packer->bins); return VDO_SUCCESS; } /** - * Push an output bin onto the stack of idle bins. + * vdo_make_packer() - Make a new block packer. * - * @param packer The packer - * @param bin The output bin - **/ -static void push_output_bin(struct packer *packer, struct output_bin *bin) -{ - ASSERT_LOG_ONLY(!has_waiters(&bin->outgoing), - "idle output bin has no waiters"); - packer->idle_output_bins[packer->idle_output_bin_count++] = bin; -} - -/** - * Pop an output bin off the end of the stack of idle bins. - * - * @param packer The packer + * @vdo: The vdo to which this packer belongs. + * @bin_count: The number of partial bins to keep in memory. + * @packer_ptr: A pointer to hold the new packer. * - * @return an idle output bin, or NULL if there are no idle bins - **/ -static struct output_bin * __must_check pop_output_bin(struct packer *packer) -{ - size_t index; - struct output_bin *bin; - - if (packer->idle_output_bin_count == 0) { - return NULL; - } - - index = --packer->idle_output_bin_count; - bin = packer->idle_output_bins[index]; - packer->idle_output_bins[index] = NULL; - return bin; -} - -/** - * Allocate a new output bin and push it onto the packer's stack of idle bins. - * - * @param packer The packer - * @param vdo The vdo to which this packer belongs - * - * @return VDO_SUCCESS or an error code - **/ -static int __must_check -make_output_bin(struct packer *packer, struct vdo *vdo) -{ - struct output_bin *output; - int result = UDS_ALLOCATE(1, struct output_bin, __func__, &output); - if (result != VDO_SUCCESS) { - return result; - } - - // Add the bin to the stack even before it's fully initialized so it - // will be freed even if we fail to initialize it below. 
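/*
 * Sketch of what the UDS_ALLOCATE_EXTENDED() call in make_bin() above
 * amounts to, assuming it behaves like a zeroing allocation of the
 * structure plus a trailing flexible array of the second type (shown here
 * with plain calloc() rather than the UDS allocator):
 */
#include <stdlib.h>

struct demo_flex_bin {
	size_t free_space;
	size_t slots_used;
	void *incoming[];	/* flexible array member, one entry per slot */
};

static struct demo_flex_bin *demo_make_flex_bin(size_t max_slots)
{
	return calloc(1, sizeof(struct demo_flex_bin) +
			 max_slots * sizeof(void *));
}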
- INIT_LIST_HEAD(&output->list); - list_add_tail(&output->list, &packer->output_bins); - push_output_bin(packer, output); - - result = UDS_ALLOCATE_EXTENDED(struct compressed_block, - packer->bin_data_size, - char, "compressed block", - &output->block); - if (result != VDO_SUCCESS) { - return result; - } - - return create_compressed_write_vio(vdo, - output, - (char *) output->block, - &output->writer); -} - -/** - * Free an idle output bin. - * - * @param bin The output bin to free - **/ -static void free_output_bin(struct output_bin *bin) -{ - if (bin == NULL) { - return; - } - - list_del_init(&bin->list); - - free_vio(allocating_vio_as_vio(UDS_FORGET(bin->writer))); - UDS_FREE(UDS_FORGET(bin->block)); - UDS_FREE(bin); -} - -/**********************************************************************/ -int make_vdo_packer(struct vdo *vdo, - block_count_t input_bin_count, - block_count_t output_bin_count, + * Return: VDO_SUCCESS or an error + */ +int vdo_make_packer(struct vdo *vdo, + block_count_t bin_count, struct packer **packer_ptr) { - const struct thread_config *thread_config = get_vdo_thread_config(vdo); - struct packer *packer; block_count_t i; - int result = UDS_ALLOCATE_EXTENDED(struct packer, output_bin_count, - struct output_bin *, __func__, - &packer); + int result = UDS_ALLOCATE(1, struct packer, __func__, &packer); + if (result != VDO_SUCCESS) { return result; } - packer->thread_id = thread_config->packer_thread; + packer->thread_id = vdo->thread_config->packer_thread; packer->bin_data_size = (VDO_BLOCK_SIZE - sizeof(struct compressed_block_header)); - packer->size = input_bin_count; + packer->size = bin_count; packer->max_slots = VDO_MAX_COMPRESSION_SLOTS; - packer->output_bin_count = output_bin_count; - INIT_LIST_HEAD(&packer->input_bins); - INIT_LIST_HEAD(&packer->output_bins); - set_vdo_admin_state_code(&packer->state, + INIT_LIST_HEAD(&packer->bins); + vdo_set_admin_state_code(&packer->state, VDO_ADMIN_STATE_NORMAL_OPERATION); - result = make_vdo_allocation_selector(thread_config->physical_zone_count, - packer->thread_id, &packer->selector); - if (result != VDO_SUCCESS) { - free_vdo_packer(packer); - return result; - } + for (i = 0; i < bin_count; i++) { + int result = make_bin(packer); - for (i = 0; i < input_bin_count; i++) { - int result = make_input_bin(packer); if (result != VDO_SUCCESS) { - free_vdo_packer(packer); + vdo_free_packer(packer); return result; } } @@ -258,81 +156,83 @@ int make_vdo_packer(struct vdo *vdo, * canceled vio in the bin must have a canceler for which it is waiting, * and any canceler will only have canceled one lock holder at a time. */ - result = UDS_ALLOCATE_EXTENDED(struct input_bin, + result = UDS_ALLOCATE_EXTENDED(struct packer_bin, MAXIMUM_VDO_USER_VIOS / 2, struct vio *, __func__, &packer->canceled_bin); if (result != VDO_SUCCESS) { - free_vdo_packer(packer); + vdo_free_packer(packer); return result; } - for (i = 0; i < output_bin_count; i++) { - int result = make_output_bin(packer, vdo); - if (result != VDO_SUCCESS) { - free_vdo_packer(packer); - return result; - } + result = vdo_make_default_thread(vdo, packer->thread_id); + if (result != VDO_SUCCESS) { + vdo_free_packer(packer); + return result; } *packer_ptr = packer; return VDO_SUCCESS; } -/**********************************************************************/ -void free_vdo_packer(struct packer *packer) +/** + * vdo_free_packer() - Free a block packer. + * @packer: The packer to free. 
+ */ +void vdo_free_packer(struct packer *packer) { - struct input_bin *input; - struct output_bin *output; + struct packer_bin *bin; if (packer == NULL) { return; } - while ((input = get_vdo_packer_fullest_bin(packer)) != NULL) { - list_del_init(&input->list); - UDS_FREE(input); + while ((bin = vdo_get_packer_fullest_bin(packer)) != NULL) { + list_del_init(&bin->list); + UDS_FREE(bin); } UDS_FREE(UDS_FORGET(packer->canceled_bin)); - - while ((output = pop_output_bin(packer)) != NULL) { - free_output_bin(output); - } - - UDS_FREE(UDS_FORGET(packer->selector)); UDS_FREE(packer); } /** - * Get the packer from a data_vio. + * get_packer_from_data_vio() - Get the packer from a data_vio. + * @data_vio: The data_vio. * - * @param data_vio The data_vio - * - * @return The packer from the VDO to which the data_vio belongs - **/ -static inline struct packer *get_packer_from_data_vio(struct data_vio *data_vio) + * Return: The packer from the VDO to which the data_vio belongs. + */ +static inline struct packer * +get_packer_from_data_vio(struct data_vio *data_vio) { - return get_vdo_from_data_vio(data_vio)->packer; + return vdo_from_data_vio(data_vio)->packer; } -/**********************************************************************/ +/** + * vdo_data_is_sufficiently_compressible() - Check whether the compressed data + * in a data_vio will fit in a + * packer bin. + * @data_vio: The data_vio. + * + * Return: true if the data_vio will fit in a bin. + */ bool vdo_data_is_sufficiently_compressible(struct data_vio *data_vio) { struct packer *packer = get_packer_from_data_vio(data_vio); - return (data_vio->compression.size < packer->bin_data_size); -} -/**********************************************************************/ -thread_id_t get_vdo_packer_thread_id(struct packer *packer) -{ - return packer->thread_id; + return (data_vio->compression.size < packer->bin_data_size); } -/**********************************************************************/ -struct packer_statistics get_vdo_packer_statistics(const struct packer *packer) +/** + * vdo_get_packer_statistics() - Get the current statistics from the packer. + * @packer: The packer to query. + * + * Return: a copy of the current statistics for the packer. + */ +struct packer_statistics vdo_get_packer_statistics(const struct packer *packer) { const struct packer_statistics *stats = &packer->statistics; + return (struct packer_statistics) { .compressed_fragments_written = READ_ONCE(stats->compressed_fragments_written), @@ -344,10 +244,9 @@ struct packer_statistics get_vdo_packer_statistics(const struct packer *packer) } /** - * Abort packing a data_vio. - * - * @param data_vio The data_vio to abort - **/ + * abort_packing() - Abort packing a data_vio. + * @data_vio: The data_vio to abort. + */ static void abort_packing(struct data_vio *data_vio) { struct packer *packer = get_packer_from_data_vio(data_vio); @@ -357,465 +256,309 @@ static void abort_packing(struct data_vio *data_vio) WRITE_ONCE(packer->statistics.compressed_fragments_in_packer, packer->statistics.compressed_fragments_in_packer - 1); - continue_data_vio(data_vio, VDO_SUCCESS); + continue_write_after_compression(data_vio); } /** - * This continues the vio completion without packing the vio. 
- * - * @param waiter The wait queue entry of the vio to continue - * @param unused An argument required so this function may be called - * from notify_all_waiters - **/ -static void continue_vio_without_packing(struct waiter *waiter, - void *unused __always_unused) + * release_compressed_write_waiter() - Update a data_vio for which a + * successful compressed write has + * completed and send it on its way. + * @data_vio: The data_vio to release. + * @allocation: The allocation to which the compressed block was written. + */ +static void release_compressed_write_waiter(struct data_vio *data_vio, + struct allocation *allocation) { - abort_packing(waiter_as_data_vio(waiter)); -} + data_vio->new_mapped = (struct zoned_pbn) { + .pbn = allocation->pbn, + .zone = allocation->zone, + .state = vdo_get_state_for_slot(data_vio->compression.slot), + }; -/** - * Check whether the packer has drained. - * - * @param packer The packer - **/ -static void vdo_check_for_drain_complete(struct packer *packer) -{ - if (is_vdo_state_draining(&packer->state) && - (packer->canceled_bin->slots_used == 0) && - (packer->idle_output_bin_count == packer->output_bin_count)) { - finish_vdo_draining(&packer->state); - } + vdo_share_compressed_write_lock(data_vio, allocation->lock); + continue_write_after_compression(data_vio); } -/**********************************************************************/ -static void write_pending_batches(struct packer *packer); - /** - * Ensure that a completion is running on the packer thread. + * finish_compressed_write() - Finish a compressed block write. + * @completion: The compressed write completion. * - * @param completion The compressed write vio - * - * @return true if the completion is on the packer thread - **/ -static bool __must_check -switch_to_packer_thread(struct vdo_completion *completion) + * This callback is registered in continue_after_allocation(). + */ +static void finish_compressed_write(struct vdo_completion *completion) { - struct vio *vio = as_vio(completion); - thread_id_t thread_id = vio->vdo->packer->thread_id; - if (completion->callback_thread_id == thread_id) { - return true; - } + struct data_vio *agent = as_data_vio(completion); + struct data_vio *client, *next; - completion->callback_thread_id = thread_id; - invoke_vdo_completion_callback(completion); - return false; -} + assert_data_vio_in_allocated_zone(agent); -/** - * Finish processing an output bin whose write has completed. If there was - * an error, any data_vios waiting on the bin write will be notified. - * - * @param packer The packer which owns the bin - * @param bin The bin which has finished - **/ -static void finish_output_bin(struct packer *packer, struct output_bin *bin) -{ - if (has_waiters(&bin->outgoing)) { - notify_all_waiters(&bin->outgoing, continue_vio_without_packing, - NULL); - } else { - // No waiters implies no error, so the compressed block was - // written. - struct packer_statistics *stats = &packer->statistics; - WRITE_ONCE(stats->compressed_fragments_in_packer, - stats->compressed_fragments_in_packer - - bin->slots_used); - WRITE_ONCE(stats->compressed_fragments_written, - stats->compressed_fragments_written - + bin->slots_used); - WRITE_ONCE(stats->compressed_blocks_written, - stats->compressed_blocks_written + 1); + /* + * Process all the non-agent waiters first to ensure that the pbn + * lock can not be released until all of them have had a chance to + * journal their increfs. 
+ */ + for (client = agent->compression.next_in_batch; + client != NULL; + client = next) { + next = client->compression.next_in_batch; + release_compressed_write_waiter(client, &agent->allocation); } - bin->slots_used = 0; - push_output_bin(packer, bin); + completion->error_handler = NULL; + release_compressed_write_waiter(agent, &agent->allocation); } -/** - * This finishes the bin write process after the bin is written to disk. This - * is the vio callback function registered by launch_compressed_write(). - * - * @param completion The compressed write vio - **/ -static void complete_output_bin(struct vdo_completion *completion) +static void handle_compressed_write_error(struct vdo_completion *completion) { - struct vio *vio = as_vio(completion); - struct packer *packer = vio->vdo->packer; + struct data_vio *agent = as_data_vio(completion); + struct allocation *allocation = &agent->allocation; + struct data_vio *client, *next; - if (!switch_to_packer_thread(completion)) { + if (vdo_get_callback_thread_id() != allocation->zone->thread_id) { + completion->callback_thread_id = allocation->zone->thread_id; + vdo_continue_completion(completion, VDO_SUCCESS); return; } - if (completion->result != VDO_SUCCESS) { - update_vio_error_stats(vio, - "Completing compressed write vio for physical block %llu with error", - (unsigned long long) vio->physical); - } + update_vio_error_stats(as_vio(completion), + "Completing compressed write vio for physical block %llu with error", + (unsigned long long) allocation->pbn); - finish_output_bin(packer, completion->parent); - write_pending_batches(packer); - vdo_check_for_drain_complete(packer); -} + for (client = agent->compression.next_in_batch; + client != NULL; + client = next) { + next = client->compression.next_in_batch; + continue_write_after_compression(client); + } -/** - * Implements waiter_callback. Continues the data_vio waiter. - **/ -static void continue_waiter(struct waiter *waiter, - void *context __always_unused) -{ - struct data_vio *data_vio = waiter_as_data_vio(waiter); - continue_data_vio(data_vio, VDO_SUCCESS); + /* + * Now that we've released the batch from the packer, forget the error + * and continue on. + */ + vdo_reset_completion(completion); + completion->error_handler = NULL; + continue_write_after_compression(agent); } /** - * Implements waiter_callback. Updates the data_vio waiter to refer to its slot - * in the compressed block, gives the data_vio a share of the PBN lock on that - * block, and reserves a reference count increment on the lock. - **/ -static void share_compressed_block(struct waiter *waiter, void *context) + * add_to_bin() - Put a data_vio in a specific packer_bin in which it will + * definitely fit. + * @bin: The bin in which to put the data_vio. + * @data_vio: The data_vio to add. + */ +static void add_to_bin(struct packer_bin *bin, struct data_vio *data_vio) { - int result; - struct data_vio *data_vio = waiter_as_data_vio(waiter); - struct output_bin *bin = context; - - data_vio->new_mapped = (struct zoned_pbn) { - .pbn = bin->writer->allocation, - .zone = bin->writer->zone, - .state = vdo_get_state_for_slot(data_vio->compression.slot), - }; - data_vio_as_vio(data_vio)->physical = data_vio->new_mapped.pbn; - - share_compressed_vdo_write_lock(data_vio, bin->writer->allocation_lock); - - // Wait again for all the waiters to get a share. - result = enqueue_waiter(&bin->outgoing, waiter); - // Cannot fail since this waiter was just dequeued. 
- ASSERT_LOG_ONLY(result == VDO_SUCCESS, - "impossible enqueue_waiter error"); + data_vio->compression.bin = bin; + data_vio->compression.slot = bin->slots_used; + bin->incoming[bin->slots_used++] = data_vio; } /** - * Finish a compressed block write. This callback is registered in - * continue_after_allocation(). + * remove_from_bin() - Get the next data_vio whose compression has not been + * canceled from a bin. + * @packer: The packer. + * @bin: The bin from which to get a data_vio. * - * @param completion The compressed write completion - **/ -static void finish_compressed_write(struct vdo_completion *completion) + * Any canceled data_vios will be moved to the canceled bin. + * Return: An uncanceled data_vio from the bin or NULL if there are none. + */ +static struct data_vio *remove_from_bin(struct packer *packer, + struct packer_bin *bin) { - struct output_bin *bin = completion->parent; - assert_vio_in_physical_zone(bin->writer); - - if (completion->result != VDO_SUCCESS) { - vio_release_allocation_lock(bin->writer); - // Invokes complete_output_bin() on the packer thread, which - // will deal with the waiters. - vio_done_callback(completion); - return; - } - - // First give every data_vio/hash_lock a share of the PBN lock to - // ensure it can't be released until they've all done their incRefs. - notify_all_waiters(&bin->outgoing, share_compressed_block, bin); - - // The waiters now hold the (downgraded) PBN lock. - bin->writer->allocation_lock = NULL; + while (bin->slots_used > 0) { + struct data_vio *data_vio = bin->incoming[--bin->slots_used]; - // Invokes the callbacks registered before entering the packer. - notify_all_waiters(&bin->outgoing, continue_waiter, NULL); - - // Invokes complete_output_bin() on the packer thread. - vio_done_callback(completion); -} + if (may_write_compressed_data_vio(data_vio)) { + data_vio->compression.bin = NULL; + return data_vio; + } -/** - * Continue the write path for a compressed write allocating_vio now that block - * allocation is complete (the allocating_vio may or may not have actually - * received an allocation). - * - * @param allocating_vio The allocating_vio which has finished the allocation - * process - **/ -static void continue_after_allocation(struct allocating_vio *allocating_vio) -{ - struct vio *vio = allocating_vio_as_vio(allocating_vio); - struct vdo_completion *completion = vio_as_completion(vio); - if (allocating_vio->allocation == VDO_ZERO_BLOCK) { - completion->requeue = true; - set_vdo_completion_result(completion, VDO_NO_SPACE); - vio_done_callback(completion); - return; + add_to_bin(packer->canceled_bin, data_vio); } - vio_set_physical_zone_callback(allocating_vio, - finish_compressed_write); - write_compressed_block_vio(vio); + /* The bin is now empty. */ + bin->free_space = packer->bin_data_size; + return NULL; } /** - * Launch an output bin. 
- *
- * @param packer  The packer which owns the bin
- * @param bin     The output bin to launch
- **/
-static void launch_compressed_write(struct packer *packer,
-				     struct output_bin *bin)
-{
-	struct vio *vio;
-
-	if (vdo_is_read_only(get_vdo_from_allocating_vio(bin->writer)->read_only_notifier)) {
-		finish_output_bin(packer, bin);
-		return;
-	}
-
-	vio = allocating_vio_as_vio(bin->writer);
-	reset_vdo_completion(vio_as_completion(vio));
-	vio->callback = complete_output_bin;
-	vio->priority = VIO_PRIORITY_COMPRESSED_DATA;
-	vio_allocate_data_block(bin->writer, packer->selector,
-				VIO_COMPRESSED_WRITE_LOCK,
-				continue_after_allocation);
+ * pack_fragment() - Pack a data_vio's fragment into the compressed block in
+ *                   which it is already known to fit.
+ * @compression: The agent's compression_state to pack into.
+ * @data_vio: The data_vio to pack.
+ * @offset: The offset into the compressed block at which to pack the fragment.
+ * @slot: The slot in the compressed block at which to pack the fragment.
+ * @block: The compressed block to write out when the batch is complete.
+ *
+ * Return: The new amount of space used.
+ */
+static block_size_t pack_fragment(struct compression_state *compression,
+				  struct data_vio *data_vio,
+				  block_size_t offset,
+				  slot_number_t slot,
+				  struct compressed_block *block)
+{
+	struct compression_state *to_pack = &data_vio->compression;
+	char *fragment = to_pack->block->data;
+
+	to_pack->next_in_batch = compression->next_in_batch;
+	compression->next_in_batch = data_vio;
+	to_pack->slot = slot;
+	vdo_put_compressed_block_fragment(block,
+					  slot,
+					  offset,
+					  fragment,
+					  to_pack->size);
+	return (offset + to_pack->size);
 }
 
 /**
- * Consume from the pending queue the next batch of vios that can be packed
- * together in a single compressed block. vios that have been mooted since
- * being placed in the pending queue will not be returned.
- *
- * @param packer  The packer
- * @param batch   The counted array to fill with the next batch of vios
- **/
-static void get_next_batch(struct packer *packer, struct output_batch *batch)
+ * compressed_write_end_io() - The bio_end_io for a compressed block write.
+ * @bio: The bio for the compressed write.
+ */
+static void compressed_write_end_io(struct bio *bio)
 {
-	struct data_vio *data_vio;
-
-	block_size_t space_remaining = packer->bin_data_size;
-	batch->slots_used = 0;
-
-	while ((data_vio =
-		waiter_as_data_vio(get_first_waiter(&packer->batched_data_vios)))
-	       != NULL) {
-		// If there's not enough space for the next data_vio, the batch
-		// is done.
-		if ((data_vio->compression.size > space_remaining) ||
-		    (batch->slots_used == packer->max_slots)) {
-			break;
-		}
+	struct data_vio *data_vio = vio_as_data_vio(bio->bi_private);
 
-		// Remove the next data_vio from the queue and put it in the
-		// output batch.
-		dequeue_next_waiter(&packer->batched_data_vios);
-		batch->slots[batch->slots_used++] = data_vio;
-		space_remaining -= data_vio->compression.size;
-	}
+	vdo_count_completed_bios(bio);
+	set_data_vio_allocated_zone_callback(data_vio,
+					     finish_compressed_write);
+	continue_data_vio(data_vio, vdo_get_bio_result(bio));
 }
 
 /**
- * Pack the next batch of compressed vios from the batched queue into an
- * output bin and write the output bin.
- *
- * @param packer  The packer
- * @param output  The output bin to fill
- *
- * @return true if a write was issued for the output bin
- **/
-static bool __must_check
-write_next_batch(struct packer *packer, struct output_bin *output)
+ * write_bin() - Write out a bin.
+ * @packer: The packer.
+ * @bin: The bin to write.
+ */ +static void write_bin(struct packer *packer, struct packer_bin *bin) { - size_t space_used = 0; - struct output_batch batch; - slot_number_t slot; int result; + block_size_t offset; + slot_number_t slot = 1; + struct compression_state *compression; + struct compressed_block *block; + struct data_vio *agent = remove_from_bin(packer, bin); + struct data_vio *client; + struct packer_statistics *stats; + struct vdo *vdo; + + if (agent == NULL) { + return; + } - get_next_batch(packer, &batch); + compression = &agent->compression; + compression->slot = 0; + block = compression->block; + vdo_initialize_compressed_block(block, compression->size); + offset = compression->size; - if (batch.slots_used == 0) { - // The pending queue must now be empty (there may have been - // mooted vios). - return false; + while ((client = remove_from_bin(packer, bin)) != NULL) { + offset = pack_fragment(compression, + client, + offset, + slot++, + block); } - // If the batch contains only a single vio, then we save nothing by - // saving the compressed form. Continue processing the single vio in - // the batch. - if (batch.slots_used == 1) { - abort_packing(batch.slots[0]); - return false; + /* + * If the batch contains only a single vio, then we save nothing by + * saving the compressed form. Continue processing the single vio in + * the batch. + */ + if (slot == 1) { + abort_packing(agent); + return; } - reset_vdo_compressed_block_header(&output->block->header); - - for (slot = 0; slot < batch.slots_used; slot++) { - struct data_vio *data_vio = batch.slots[slot]; - data_vio->compression.slot = slot; - put_vdo_compressed_block_fragment(output->block, slot, - space_used, - data_vio->compression.data, - data_vio->compression.size); - space_used += data_vio->compression.size; - - result = enqueue_data_vio(&output->outgoing, data_vio); - if (result != VDO_SUCCESS) { - abort_packing(data_vio); - continue; - } - - output->slots_used += 1; + vdo_clear_unused_compression_slots(block, slot); + data_vio_as_completion(agent)->error_handler = + handle_compressed_write_error; + vdo = vdo_from_data_vio(agent); + if (vdo_is_read_only(vdo->read_only_notifier)) { + continue_data_vio(agent, VDO_READ_ONLY); + return; } - launch_compressed_write(packer, output); - return true; -} - -/** - * Put a data_vio in a specific input_bin in which it will definitely fit. - * - * @param bin The bin in which to put the data_vio - * @param data_vio The data_vio to add - **/ -static void add_to_input_bin(struct input_bin *bin, struct data_vio *data_vio) -{ - data_vio->compression.bin = bin; - data_vio->compression.slot = bin->slots_used; - bin->incoming[bin->slots_used++] = data_vio; -} - -/** - * Start a new batch of vios in an input_bin, moving the existing batch, if - * any, to the queue of pending batched vios in the packer. - * - * @param packer The packer - * @param bin The bin to prepare - **/ -static void start_new_batch(struct packer *packer, struct input_bin *bin) -{ - slot_number_t slot; - int result; - // Move all the data_vios in the current batch to the batched queue so - // they will get packed into the next free output bin. - for (slot = 0; slot < bin->slots_used; slot++) { - struct data_vio *data_vio = bin->incoming[slot]; - data_vio->compression.bin = NULL; - - if (!may_write_compressed_data_vio(data_vio)) { - /* - * Compression of this data_vio was canceled while it - * was waiting; put it in the canceled bin so it can be - * rendezvous with the canceling data_vio. 
- */ - add_to_input_bin(packer->canceled_bin, data_vio); - continue; - } - - result = enqueue_data_vio(&packer->batched_data_vios, - data_vio); - if (result != VDO_SUCCESS) { - // Impossible but we're required to check the result - // from enqueue. - abort_packing(data_vio); - } + result = prepare_data_vio_for_io(agent, + (char *) block, + compressed_write_end_io, + REQ_OP_WRITE, + agent->allocation.pbn); + if (result != VDO_SUCCESS) { + continue_data_vio(agent, result); + return; } - // The bin is now empty. - bin->slots_used = 0; - bin->free_space = packer->bin_data_size; + /* + * Once the compressed write is submitted, the fragments are no longer + * in the packer, so update stats now. + */ + stats = &packer->statistics; + WRITE_ONCE(stats->compressed_fragments_in_packer, + (stats->compressed_fragments_in_packer - slot)); + WRITE_ONCE(stats->compressed_fragments_written, + (stats->compressed_fragments_written + slot)); + WRITE_ONCE(stats->compressed_blocks_written, + stats->compressed_blocks_written + 1); + + submit_data_vio_io(agent); } /** - * Add a data_vio to a bin's incoming queue, handle logical space change, and - * call physical space processor. + * add_data_vio_to_packer_bin() - Add a data_vio to a bin's incoming queue + * @packer: The packer. + * @bin: The bin to which to add the data_vio. + * @data_vio: The data_vio to add to the bin's queue. * - * @param packer The packer - * @param bin The bin to which to add the the data_vio - * @param data_vio The data_vio to add to the bin's queue - **/ -static void add_data_vio_to_input_bin(struct packer *packer, - struct input_bin *bin, - struct data_vio *data_vio) + * Adds a data_vio to a bin's incoming queue, handles logical space change, + * and calls physical space processor. + */ +static void add_data_vio_to_packer_bin(struct packer *packer, + struct packer_bin *bin, + struct data_vio *data_vio) { - // If the selected bin doesn't have room, start a new batch to make - // room. + /* + * If the selected bin doesn't have room, start a new batch to make + * room. + */ if (bin->free_space < data_vio->compression.size) { - start_new_batch(packer, bin); + write_bin(packer, bin); } - add_to_input_bin(bin, data_vio); + add_to_bin(bin, data_vio); bin->free_space -= data_vio->compression.size; - // If we happen to exactly fill the bin, start a new input batch. + /* If we happen to exactly fill the bin, start a new batch. */ if ((bin->slots_used == packer->max_slots) || (bin->free_space == 0)) { - start_new_batch(packer, bin); + write_bin(packer, bin); } - // Now that we've finished changing the free space, restore the sort - // order. + /* + * Now that we've finished changing the free space, restore the sort + * order. + */ insert_in_sorted_list(packer, bin); } /** - * Move data_vios in pending batches from the batched_data_vios to all free - * output bins, issuing writes for the output bins as they are packed. This - * will loop until either the pending queue is drained or all output bins are - * busy writing a compressed block. - * - * @param packer The packer - **/ -static void write_pending_batches(struct packer *packer) + * select_bin() - Select the bin that should be used to pack the compressed + * data in a data_vio with other data_vios. + * @packer: The packer. + * @data_vio: The data_vio. 
+ */ +static struct packer_bin * __must_check +select_bin(struct packer *packer, struct data_vio *data_vio) { - struct output_bin *output; - - if (packer->writing_batches) { - /* - * We've attempted to re-enter this function recursively due to - * completion handling, which can lead to kernel stack overflow - * as in VDO-1340. It's perfectly safe to break the recursion - * and do nothing since we know any pending batches will - * eventually be handled by the earlier call. - */ - return; - } - - // Record that we are in this function for the above check. IMPORTANT: - // never return from this function without clearing this flag. - packer->writing_batches = true; - - while (has_waiters(&packer->batched_data_vios) - && ((output = pop_output_bin(packer)) != NULL)) { - if (!write_next_batch(packer, output)) { - // We didn't use the output bin to write, so push it - // back on the stack. - push_output_bin(packer, output); - } - } - - packer->writing_batches = false; -} + /* + * First best fit: select the bin with the least free space that has + * enough room for the compressed data in the data_vio. + */ + struct packer_bin *fullest_bin = vdo_get_packer_fullest_bin(packer); + struct packer_bin *bin; -/** - * Select the input bin that should be used to pack the compressed data in a - * data_vio with other data_vios. - * - * @param packer The packer - * @param data_vio The data_vio - **/ -static struct input_bin * __must_check -select_input_bin(struct packer *packer, struct data_vio *data_vio) -{ - // First best fit: select the bin with the least free space that has - // enough room for the compressed data in the data_vio. - struct input_bin *fullest_bin = get_vdo_packer_fullest_bin(packer); - struct input_bin *bin; for (bin = fullest_bin; bin != NULL; - bin = next_vdo_packer_bin(packer, bin)) { + bin = vdo_next_packer_bin(packer, bin)) { if (bin->free_space >= data_vio->compression.size) { return bin; } @@ -829,7 +572,7 @@ select_input_bin(struct packer *packer, struct data_vio *data_vio) * compressed block. But if the space currently used in the fullest * bin is smaller than the compressed size of the incoming block, it * seems wrong to force that bin to write when giving up on - * compressing the incoming data_vio would likewise "waste" the the + * compressing the incoming data_vio would likewise "waste" the * least amount of free space. */ if (data_vio->compression.size @@ -837,25 +580,32 @@ select_input_bin(struct packer *packer, struct data_vio *data_vio) return NULL; } - // The fullest bin doesn't have room, but writing it out and starting a - // new batch with the incoming data_vio will increase the packer's free - // space. + /* + * The fullest bin doesn't have room, but writing it out and starting a + * new batch with the incoming data_vio will increase the packer's free + * space. + */ return fullest_bin; } -/**********************************************************************/ +/** + * vdo_attempt_packing() - Attempt to rewrite the data in this data_vio as + * part of a compressed block. + * @data_vio: The data_vio to pack. 
+ */ void vdo_attempt_packing(struct data_vio *data_vio) { int result; - struct input_bin *bin; + struct packer_bin *bin; struct vio_compression_state state = get_vio_compression_state(data_vio); struct packer *packer = get_packer_from_data_vio(data_vio); + assert_on_packer_thread(packer, __func__); result = ASSERT((state.status == VIO_COMPRESSING), - "attempt to pack data_vio not ready for packing, state: %u", - state.status); + "attempt to pack data_vio not ready for packing, state: %u", + state.status); if (result != VDO_SUCCESS) { return; } @@ -867,9 +617,11 @@ void vdo_attempt_packing(struct data_vio *data_vio) WRITE_ONCE(packer->statistics.compressed_fragments_in_packer, packer->statistics.compressed_fragments_in_packer + 1); - // If packing of this data_vio is disallowed for administrative - // reasons, give up before making any state changes. - if (!is_vdo_state_normal(&packer->state) || + /* + * If packing of this data_vio is disallowed for administrative + * reasons, give up before making any state changes. + */ + if (!vdo_is_state_normal(&packer->state) || (data_vio->flush_generation < packer->flush_generation)) { abort_packing(data_vio); return; @@ -880,192 +632,206 @@ void vdo_attempt_packing(struct data_vio *data_vio) * compression state to VIO_PACKING if the data_vio is allowed to be * compressed (if it has already been canceled, we'll fall out here). * Once the data_vio is in the VIO_PACKING state, it must be guaranteed - * to be put in an input bin before any more requests can be processed - * by the packer thread. Otherwise, a canceling data_vio could attempt - * to remove the canceled data_vio from the packer and fail to - * rendezvous with it (VDO-2809). We must also make sure that we will - * actually bin the data_vio and not give up on it as being larger than - * the space used in the fullest bin. Hence we must call - * select_input_bin() before calling may_vio_block_in_packer() - * (VDO-2826). + * to be put in a bin before any more requests can be processed by the + * packer thread. Otherwise, a canceling data_vio could attempt to + * remove the canceled data_vio from the packer and fail to rendezvous + * with it (VDO-2809). We must also make sure that we will actually bin + * the data_vio and not give up on it as being larger than the space + * used in the fullest bin. Hence we must call select_bin() before + * calling may_vio_block_in_packer() (VDO-2826). */ - bin = select_input_bin(packer, data_vio); + bin = select_bin(packer, data_vio); if ((bin == NULL) || !may_vio_block_in_packer(data_vio)) { abort_packing(data_vio); return; } - add_data_vio_to_input_bin(packer, bin, data_vio); - write_pending_batches(packer); + add_data_vio_to_packer_bin(packer, bin, data_vio); } /** - * Force a pending write for all non-empty bins on behalf of a flush or - * suspend. - * - * @param packer The packer being flushed - **/ + * check_for_drain_complete() - Check whether the packer has drained. + * @packer: The packer. + */ +static void check_for_drain_complete(struct packer *packer) +{ + if (vdo_is_state_draining(&packer->state) && + (packer->canceled_bin->slots_used == 0)) { + vdo_finish_draining(&packer->state); + } +} + +/** + * write_all_non_empty_bins() - Write out all non-empty bins on behalf of a + * flush or suspend. + * @packer: The packer being flushed. 
+ */ static void write_all_non_empty_bins(struct packer *packer) { - struct input_bin *bin; - for (bin = get_vdo_packer_fullest_bin(packer); bin != NULL; - bin = next_vdo_packer_bin(packer, bin)) { - start_new_batch(packer, bin); - // We don't need to re-sort the bin here since this loop will - // make every bin have the same amount of free space, so every - // ordering is sorted. + struct packer_bin *bin; + + for (bin = vdo_get_packer_fullest_bin(packer); + bin != NULL; + bin = vdo_next_packer_bin(packer, bin)) { + write_bin(packer, bin); + /* + * We don't need to re-sort the bin here since this loop will + * make every bin have the same amount of free space, so every + * ordering is sorted. + */ } - write_pending_batches(packer); + check_for_drain_complete(packer); } -/**********************************************************************/ -void flush_vdo_packer(struct packer *packer) +/** + * vdo_flush_packer() - Request that the packer flush asynchronously. + * @packer: The packer to flush. + * + * All bins with at least two compressed data blocks will be written out, and + * any solitary pending VIOs will be released from the packer. While flushing + * is in progress, any VIOs submitted to vdo_attempt_packing() will be + * continued immediately without attempting to pack them. + */ +void vdo_flush_packer(struct packer *packer) { assert_on_packer_thread(packer, __func__); - if (is_vdo_state_normal(&packer->state)) { + if (vdo_is_state_normal(&packer->state)) { write_all_non_empty_bins(packer); } } /** - * Remove a data_vio from the packer. - * - * @param data_vio The data_vio to remove - **/ -static void remove_from_vdo_packer(struct data_vio *data_vio) + * vdo_remove_lock_holder_from_packer() - Remove a lock holder from the packer. + * @completion: The data_vio which needs a lock held by a data_vio in the + * packer. The data_vio's compression.lock_holder field will + * point to the data_vio to remove. 
+ */ +void vdo_remove_lock_holder_from_packer(struct vdo_completion *completion) { + struct data_vio *data_vio = as_data_vio(completion); struct packer *packer = get_packer_from_data_vio(data_vio); - struct input_bin *bin = data_vio->compression.bin; - slot_number_t slot = data_vio->compression.slot; - ASSERT_LOG_ONLY((bin != NULL), "data_vio in packer has an input bin"); + struct data_vio *lock_holder; + struct packer_bin *bin; + slot_number_t slot; + + assert_data_vio_in_packer_zone(data_vio); + lock_holder = UDS_FORGET(data_vio->compression.lock_holder); + bin = lock_holder->compression.bin; + ASSERT_LOG_ONLY((bin != NULL), "data_vio in packer has a bin"); + + slot = lock_holder->compression.slot; bin->slots_used--; if (slot < bin->slots_used) { bin->incoming[slot] = bin->incoming[bin->slots_used]; bin->incoming[slot]->compression.slot = slot; } - data_vio->compression.bin = NULL; - data_vio->compression.slot = 0; + lock_holder->compression.bin = NULL; + lock_holder->compression.slot = 0; if (bin != packer->canceled_bin) { - bin->free_space += data_vio->compression.size; + bin->free_space += lock_holder->compression.size; insert_in_sorted_list(packer, bin); } - abort_packing(data_vio); - vdo_check_for_drain_complete(packer); + abort_packing(lock_holder); + check_for_drain_complete(packer); } -/**********************************************************************/ -void remove_lock_holder_from_vdo_packer(struct vdo_completion *completion) -{ - struct data_vio *data_vio = as_data_vio(completion); - struct data_vio *lock_holder; - assert_data_vio_in_packer_zone(data_vio); - - lock_holder = data_vio->compression.lock_holder; - data_vio->compression.lock_holder = NULL; - remove_from_vdo_packer(lock_holder); -} - -/**********************************************************************/ -void increment_vdo_packer_flush_generation(struct packer *packer) +/** + * vdo_increment_packer_flush_generation() - Increment the flush generation + * in the packer. + * @packer: The packer. + * + * This will also cause the packer to flush so that any VIOs from previous + * generations will exit the packer. + */ +void vdo_increment_packer_flush_generation(struct packer *packer) { assert_on_packer_thread(packer, __func__); packer->flush_generation++; - flush_vdo_packer(packer); + vdo_flush_packer(packer); } /** - * Initiate a drain. + * initiate_drain() - Initiate a drain. * * Implements vdo_admin_initiator. - **/ + */ static void initiate_drain(struct admin_state *state) { struct packer *packer = container_of(state, struct packer, state); + write_all_non_empty_bins(packer); - vdo_check_for_drain_complete(packer); } -/**********************************************************************/ -void drain_vdo_packer(struct packer *packer, struct vdo_completion *completion) +/** + * vdo_drain_packer() - Drain the packer by preventing any more VIOs from + * entering the packer and then flushing. + * @packer: The packer to drain. + * @completion: The completion to finish when the packer has drained. 
+ */ +void vdo_drain_packer(struct packer *packer, struct vdo_completion *completion) { assert_on_packer_thread(packer, __func__); - start_vdo_draining(&packer->state, VDO_ADMIN_STATE_SUSPENDING, - completion, initiate_drain); + vdo_start_draining(&packer->state, + VDO_ADMIN_STATE_SUSPENDING, + completion, + initiate_drain); } -/**********************************************************************/ -void resume_vdo_packer(struct packer *packer, struct vdo_completion *parent) +/** + * vdo_resume_packer() - Resume a packer which has been suspended. + * @packer: The packer to resume. + * @parent: The completion to finish when the packer has resumed. + */ +void vdo_resume_packer(struct packer *packer, struct vdo_completion *parent) { assert_on_packer_thread(packer, __func__); - finish_vdo_completion(parent, resume_vdo_if_quiescent(&packer->state)); + vdo_finish_completion(parent, vdo_resume_if_quiescent(&packer->state)); } -/**********************************************************************/ -static void dump_input_bin(const struct input_bin *bin, bool canceled) +static void dump_packer_bin(const struct packer_bin *bin, bool canceled) { if (bin->slots_used == 0) { - // Don't dump empty input bins. + /* Don't dump empty bins. */ return; } uds_log_info(" %sBin slots_used=%u free_space=%zu", - (canceled ? "Canceled" : "Input"), bin->slots_used, + (canceled ? "Canceled" : ""), bin->slots_used, bin->free_space); - // XXX dump vios in bin->incoming? The vios should have been dumped - // from the vio pool. Maybe just dump their addresses so it's clear - // they're here? -} - -/**********************************************************************/ -static void dump_output_bin(const struct output_bin *bin) -{ - size_t count = count_waiters(&bin->outgoing); - if (bin->slots_used == 0) { - // Don't dump empty output bins. - return; - } - - uds_log_info(" struct output_bin contains %zu outgoing waiters", - count); - - // XXX dump vios in bin->outgoing? The vios should have been dumped - // from the vio pool. Maybe just dump their addresses so it's clear - // they're here? - - // XXX dump writer vio? + /* + * FIXME: dump vios in bin->incoming? The vios should have been dumped + * from the vio pool. Maybe just dump their addresses so it's clear + * they're here? + */ } -/**********************************************************************/ -void dump_vdo_packer(const struct packer *packer) +/** + * vdo_dump_packer() - Dump the packer. + * @packer: The packer. + * + * Context: dumps in a thread-unsafe fashion. 
+ */
+void vdo_dump_packer(const struct packer *packer)
 {
-	struct input_bin *input;
-	struct output_bin *output;
+	struct packer_bin *bin;
 
 	uds_log_info("packer");
-	uds_log_info("  flushGeneration=%llu state %s  writing_batches=%s",
+	uds_log_info("  flushGeneration=%llu state %s  packer_bin_count=%llu",
		     (unsigned long long) packer->flush_generation,
-		     get_vdo_admin_state_name(&packer->state),
-		     uds_bool_to_string(packer->writing_batches));
-
-	uds_log_info("  input_bin_count=%llu",
+		     vdo_get_admin_state_code(&packer->state)->name,
		     (unsigned long long) packer->size);
-	for (input = get_vdo_packer_fullest_bin(packer); input != NULL;
-	     input = next_vdo_packer_bin(packer, input)) {
-		dump_input_bin(input, false);
+	for (bin = vdo_get_packer_fullest_bin(packer);
+	     bin != NULL;
+	     bin = vdo_next_packer_bin(packer, bin)) {
+		dump_packer_bin(bin, false);
 	}
 
-	dump_input_bin(packer->canceled_bin, true);
-
-	uds_log_info("  output_bin_count=%zu idle_output_bin_count=%zu",
-		     packer->output_bin_count, packer->idle_output_bin_count);
-	list_for_each_entry(output, &packer->output_bins, list) {
-		dump_output_bin(output);
-	}
+	dump_packer_bin(packer->canceled_bin, true);
 }
diff --git a/vdo/packer.h b/vdo/packer.h
index 583d0c3b..1c3932f4 100644
--- a/vdo/packer.h
+++ b/vdo/packer.h
@@ -1,150 +1,113 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
 /*
  * Copyright Red Hat
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version 2
- * of the License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
- * 02110-1301, USA.
- *
- * $Id: //eng/vdo-releases/sulfur/src/c++/vdo/base/packer.h#13 $
  */
 
 #ifndef PACKER_H
 #define PACKER_H
 
-#include "completion.h"
+#include 
+
+#include "compiler.h"
+
+#include "admin-state.h"
+#include "block-mapping-state.h"
 #include "statistics.h"
-#include "threadConfig.h"
 #include "types.h"
+#include "wait-queue.h"
 
 enum {
-	DEFAULT_PACKER_INPUT_BINS = 16,
-	DEFAULT_PACKER_OUTPUT_BINS = 256,
+	DEFAULT_PACKER_BINS = 16,
+};
+
+/*
+ * Each packer_bin holds an incomplete batch of data_vios that only partially
+ * fill a compressed block. The bins are kept in a ring sorted by the amount of
+ * unused space so the first bin with enough space to hold a newly-compressed
+ * data_vio can easily be found. When the bin fills up or is flushed, the first
+ * uncanceled data_vio in the bin is selected to be the agent for that
+ * bin. Upon entering the packer, each data_vio already has its compressed data
+ * in the first slot of the data_vio's compressed_block (overlaid on the
+ * data_vio's scratch_block). So the agent's fragment is already in place. The
+ * fragments for the other uncanceled data_vios in the bin are packed into the
+ * agent's compressed block. The agent then writes out the compressed block.
+ * If the write is successful, the agent shares its pbn lock with each of the
+ * other data_vios in its compressed block and sends each on its way. Finally
+ * the agent itself continues on the write path as before.
+ * + * There is one special bin which is used to hold data_vios which have been + * canceled and removed from their bin by the packer. These data_vios need to + * wait for the canceller to rendezvous with them (VDO-2809) and so they sit in + * this special bin. + */ +struct packer_bin { + /* List links for packer.packer_bins */ + struct list_head list; + /* The number of items in the bin */ + slot_number_t slots_used; + /* + * The number of compressed block bytes remaining in the current batch + */ + size_t free_space; + /* The current partial batch of data_vios, waiting for more */ + struct data_vio *incoming[]; }; -struct packer; +struct packer { + /* The ID of the packer's callback thread */ + thread_id_t thread_id; + /* The number of bins */ + block_count_t size; + /* The block size minus header size */ + size_t bin_data_size; + /* The number of compression slots */ + size_t max_slots; + /* A list of all packer_bins, kept sorted by free_space */ + struct list_head bins; + /* + * A bin to hold data_vios which were canceled out of the packer and + * are waiting to rendezvous with the canceling data_vio. + */ + struct packer_bin *canceled_bin; + + /* The current flush generation */ + sequence_number_t flush_generation; + + /* The administrative state of the packer */ + struct admin_state state; + + /* + * Statistics are only updated on the packer thread, but are + * accessed from other threads. + */ + struct packer_statistics statistics; +}; -/** - * Make a new block packer. - * - * @param [in] vdo The vdo to which this packer belongs - * @param [in] input_bin_count The number of partial bins to keep in memory - * @param [in] output_bin_count The number of compressed blocks that can be - * written concurrently - * @param [out] packer_ptr A pointer to hold the new packer - * - * @return VDO_SUCCESS or an error - **/ -int __must_check make_vdo_packer(struct vdo *vdo, - block_count_t input_bin_count, - block_count_t output_bin_count, +int __must_check vdo_make_packer(struct vdo *vdo, + block_count_t bin_count, struct packer **packer_ptr); -/** - * Free a block packer. - * - * @param packer The packer to free - **/ -void free_vdo_packer(struct packer *packer); +void vdo_free_packer(struct packer *packer); -/** - * Check whether the compressed data in a data_vio will fit in a packer bin. - * - * @param data_vio The data_vio - * - * @return true if the data_vio will fit in a bin - **/ bool __must_check vdo_data_is_sufficiently_compressible(struct data_vio *data_vio); -/** - * Get the thread ID of the packer's zone. - * - * @param packer The packer - * - * @return The packer's thread ID - **/ -thread_id_t get_vdo_packer_thread_id(struct packer *packer); - -/** - * Get the current statistics from the packer. - * - * @param packer The packer to query - * - * @return a copy of the current statistics for the packer - **/ struct packer_statistics __must_check -get_vdo_packer_statistics(const struct packer *packer); +vdo_get_packer_statistics(const struct packer *packer); -/** - * Attempt to rewrite the data in this data_vio as part of a compressed block. - * - * @param data_vio The data_vio to pack - **/ void vdo_attempt_packing(struct data_vio *data_vio); -/** - * Request that the packer flush asynchronously. All bins with at least two - * compressed data blocks will be written out, and any solitary pending VIOs - * will be released from the packer. While flushing is in progress, any VIOs - * submitted to vdo_attempt_packing() will be continued immediately without - * attempting to pack them. 
- * - * @param packer The packer to flush - **/ -void flush_vdo_packer(struct packer *packer); +void vdo_flush_packer(struct packer *packer); -/** - * Remove a lock holder from the packer. - * - * @param completion The data_vio which needs a lock held by a data_vio in the - * packer. The data_vio's compression.lock_holder field will - * point to the data_vio to remove. - **/ -void remove_lock_holder_from_vdo_packer(struct vdo_completion *completion); - -/** - * Increment the flush generation in the packer. This will also cause the - * packer to flush so that any VIOs from previous generations will exit the - * packer. - * - * @param packer The packer - **/ -void increment_vdo_packer_flush_generation(struct packer *packer); +void vdo_remove_lock_holder_from_packer(struct vdo_completion *completion); + +void vdo_increment_packer_flush_generation(struct packer *packer); -/** - * Drain the packer by preventing any more VIOs from entering the packer and - * then flushing. - * - * @param packer The packer to drain - * @param completion The completion to finish when the packer has drained - **/ void -drain_vdo_packer(struct packer *packer, struct vdo_completion *completion); +vdo_drain_packer(struct packer *packer, struct vdo_completion *completion); -/** - * Resume a packer which has been suspended. - * - * @param packer The packer to resume - * @param parent The completion to finish when the packer has resumed - **/ -void resume_vdo_packer(struct packer *packer, struct vdo_completion *parent); +void vdo_resume_packer(struct packer *packer, struct vdo_completion *parent); -/** - * Dump the packer, in a thread-unsafe fashion. - * - * @param packer The packer - **/ -void dump_vdo_packer(const struct packer *packer); +void vdo_dump_packer(const struct packer *packer); #endif /* PACKER_H */ diff --git a/vdo/packerInternals.h b/vdo/packerInternals.h deleted file mode 100644 index 2abda247..00000000 --- a/vdo/packerInternals.h +++ /dev/null @@ -1,140 +0,0 @@ -/* - * Copyright Red Hat - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/sulfur/src/c++/vdo/base/packerInternals.h#5 $ - */ - -#ifndef PACKER_INTERNALS_H -#define PACKER_INTERNALS_H - -#include - -#include "packer.h" - -#include "adminState.h" -#include "compressedBlock.h" -#include "header.h" -#include "statistics.h" -#include "types.h" -#include "waitQueue.h" - -/** - * Each input_bin holds an incomplete batch of data_vios that only partially - * fill a compressed block. The input bins are kept in a ring sorted by the - * amount of unused space so the first bin with enough space to hold a - * newly-compressed data_vio can easily be found. When the bin fills up or is - * flushed, the incoming data_vios are moved to the packer's batched_data_vios - * queue, from which they will eventually be routed to an idle output_bin. 
- * - * There is one special input bin which is used to hold data_vios which have - * been canceled and removed from their input bin by the packer. These - * data_vios need to wait for the canceller to rendezvous with them (VDO-2809) - * and so they sit in this special bin. - **/ -struct input_bin { - /** List links for packer.input_bins */ - struct list_head list; - /** The number of items in the bin */ - slot_number_t slots_used; - /** - * The number of compressed block bytes remaining in the current batch - */ - size_t free_space; - /** The current partial batch of data_vios, waiting for more */ - struct data_vio *incoming[]; -}; - -/** - * Each output_bin allows a single compressed block to be packed and written. - * When it is not idle, it holds a batch of data_vios that have been packed - * into the compressed block, written asynchronously, and are waiting for the - * write to complete. - **/ -struct output_bin { - /** List links for packer.output_bins */ - struct list_head list; - /** The storage for encoding the compressed block representation */ - struct compressed_block *block; - /** - * The struct allocating_vio wrapping the compressed block for writing - */ - struct allocating_vio *writer; - /** The number of compression slots used in the compressed block */ - slot_number_t slots_used; - /** The data_vios packed into the block, waiting for the write to - * complete */ - struct wait_queue outgoing; -}; - -/** - * A counted array holding a batch of data_vios that should be packed into an - * output bin. - **/ -struct output_batch { - size_t slots_used; - struct data_vio *slots[VDO_MAX_COMPRESSION_SLOTS]; -}; - -struct packer { - /** The ID of the packer's callback thread */ - thread_id_t thread_id; - /** The selector determining which physical zone to allocate from */ - struct allocation_selector *selector; - /** The number of input bins */ - block_count_t size; - /** The block size minus header size */ - size_t bin_data_size; - /** The number of compression slots */ - size_t max_slots; - /** A list of all input_bins, kept sorted by free_space */ - struct list_head input_bins; - /** A list of all output_bins */ - struct list_head output_bins; - /** - * A bin to hold data_vios which were canceled out of the packer and - * are waiting to rendezvous with the canceling data_vio. - **/ - struct input_bin *canceled_bin; - - /** The current flush generation */ - sequence_number_t flush_generation; - - /** The administrative state of the packer */ - struct admin_state state; - /** True when writing batched data_vios */ - bool writing_batches; - - /** - * Statistics are only updated on the packer thread, but are - * accessed from other threads. 
- **/ - struct packer_statistics statistics; - - /** Queue of batched data_vios waiting to be packed */ - struct wait_queue batched_data_vios; - - /** The total number of output bins allocated */ - size_t output_bin_count; - /** The number of idle output bins on the stack */ - size_t idle_output_bin_count; - /** The stack of idle output bins (0 = bottom) */ - struct output_bin *idle_output_bins[]; -}; - - -#endif /* PACKER_INTERNALS_H */ diff --git a/uds/pageCache.c b/vdo/page-cache.c similarity index 69% rename from uds/pageCache.c rename to vdo/page-cache.c index 6489ab9e..0d9d4c7d 100644 --- a/uds/pageCache.c +++ b/vdo/page-cache.c @@ -1,44 +1,25 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * Copyright Red Hat - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/uds-releases/krusty/src/uds/pageCache.c#35 $ */ -#include "pageCache.h" +#include "page-cache.h" #include -#include "cacheCounters.h" -#include "chapterIndex.h" +#include "chapter-index.h" #include "compiler.h" +#include "config.h" #include "errors.h" #include "geometry.h" -#include "hashUtils.h" -#include "indexConfig.h" +#include "hash-utils.h" #include "logger.h" -#include "memoryAlloc.h" +#include "memory-alloc.h" #include "permassert.h" -#include "recordPage.h" -#include "stringUtils.h" +#include "record-page.h" +#include "string-utils.h" #include "uds-threads.h" -#include "zone.h" -/**********************************************************************/ int assert_page_in_cache(struct page_cache *cache, struct cached_page *page) { uint16_t page_index; @@ -87,10 +68,10 @@ static void clear_cache_page(struct page_cache *cache, * * @return UDS_SUCCESS or an error code **/ -static int __must_check get_page_no_stats(struct page_cache *cache, - unsigned int physical_page, - int *queue_index, - struct cached_page **page_ptr) +static int __must_check get_page_and_index(struct page_cache *cache, + unsigned int physical_page, + int *queue_index, + struct cached_page **page_ptr) { uint16_t index_value, index; bool queued; @@ -132,9 +113,8 @@ static int __must_check get_page_no_stats(struct page_cache *cache, } else { *page_ptr = NULL; } - if (queue_index != NULL) { - *queue_index = queued ? index : -1; - } + + *queue_index = queued ? index : -1; return UDS_SUCCESS; } @@ -165,8 +145,10 @@ static void wait_for_pending_searches(struct page_cache *cache, if (search_pending(initial_counters[i]) && (page_being_searched(initial_counters[i]) == physical_page)) { - // There is an active search using the physical page. - // We need to wait for the search to finish. + /* + * There is an active search using the physical page. + * We need to wait for the search to finish. 
+ */ while (initial_counters[i] == get_invalidate_counter(cache, i)) { uds_yield_scheduler(); @@ -178,39 +160,26 @@ static void wait_for_pending_searches(struct page_cache *cache, /** * Invalidate a cache page * - * @param cache the cache - * @param page the cached page - * @param reason the reason for invalidation, for stats + * @param cache the cache + * @param page the cached page * * @return UDS_SUCCESS or an error code **/ -static int __must_check -invalidate_page_in_cache(struct page_cache *cache, - struct cached_page *page, - enum invalidation_reason reason) +static int __must_check invalidate_page_in_cache(struct page_cache *cache, + struct cached_page *page) { - // We hold the readThreadsMutex. + int result; + + /* We hold the readThreadsMutex. */ if (page == NULL) { return UDS_SUCCESS; } if (page->cp_physical_page != cache->num_index_entries) { - switch (reason) { - case INVALIDATION_EVICT: - cache->counters.evictions++; - break; - case INVALIDATION_EXPIRE: - cache->counters.expirations++; - break; - default: - break; - } + result = assert_page_in_cache(cache, page); - if (reason != INVALIDATION_ERROR) { - int result = assert_page_in_cache(cache, page); - if (result != UDS_SUCCESS) { - return result; - } + if (result != UDS_SUCCESS) { + return result; } WRITE_ONCE(cache->index[page->cp_physical_page], @@ -223,26 +192,21 @@ invalidate_page_in_cache(struct page_cache *cache, return UDS_SUCCESS; } -/**********************************************************************/ +static int find_invalidate_and_make_least_recent(struct page_cache *cache, unsigned int physical_page, - struct queued_read *read_queue, - enum invalidation_reason reason, bool must_find) { struct cached_page *page; - int queued_index = -1; + int queue_index = -1; int result; - // We hold the readThreadsMutex. + /* We hold the readThreadsMutex. */ if (cache == NULL) { return UDS_SUCCESS; } - result = get_page_no_stats(cache, - physical_page, - ((read_queue != NULL) ? &queued_index : NULL), - &page); + result = get_page_and_index(cache, physical_page, &queue_index, &page); if (result != UDS_SUCCESS) { return result; } @@ -253,44 +217,44 @@ int find_invalidate_and_make_least_recent(struct page_cache *cache, return result; } - if (queued_index > -1) { + if (queue_index > -1) { uds_log_debug("setting pending read to invalid"); - read_queue[queued_index].invalid = true; + cache->read_queue[queue_index].invalid = true; } return UDS_SUCCESS; } - // Invalidate the page and unmap it from the cache. - result = invalidate_page_in_cache(cache, page, reason); + /* Invalidate the page and unmap it from the cache. */ + result = invalidate_page_in_cache(cache, page); if (result != UDS_SUCCESS) { return result; } - // Move the cached page to the least recently used end of the list - // so it will be replaced before any page with valid data. + /* + * Move the cached page to the least recently used end of the list + * so it will be replaced before any page with valid data. 
+ */ WRITE_ONCE(page->cp_last_used, 0); return UDS_SUCCESS; } -/**********************************************************************/ static int __must_check initialize_page_cache(struct page_cache *cache, const struct geometry *geometry, unsigned int chapters_in_cache, - unsigned int read_queue_max_size, unsigned int zone_count) { int result; unsigned int i; + cache->geometry = geometry; cache->num_index_entries = geometry->pages_per_volume + 1; cache->num_cache_entries = chapters_in_cache * geometry->record_pages_per_chapter; - cache->read_queue_max_size = read_queue_max_size; cache->zone_count = zone_count; atomic64_set(&cache->clock, 1); - result = UDS_ALLOCATE(read_queue_max_size, + result = UDS_ALLOCATE(VOLUME_CACHE_MAX_QUEUED_READS, struct queued_read, "volume read queue", &cache->read_queue); @@ -322,7 +286,7 @@ static int __must_check initialize_page_cache(struct page_cache *cache, return result; } - // Initialize index values to invalid values. + /* Initialize index values to invalid values. */ for (i = 0; i < cache->num_index_entries; i++) { cache->index[i] = cache->num_cache_entries; } @@ -337,7 +301,9 @@ static int __must_check initialize_page_cache(struct page_cache *cache, for (i = 0; i < cache->num_cache_entries; i++) { struct cached_page *page = &cache->cache[i]; - result = initialize_volume_page(geometry, &page->cp_page_data); + + result = initialize_volume_page(geometry->bytes_per_page, + &page->cp_page_data); if (result != UDS_SUCCESS) { return result; } @@ -347,10 +313,8 @@ static int __must_check initialize_page_cache(struct page_cache *cache, return UDS_SUCCESS; } -/**********************************************************************/ int make_page_cache(const struct geometry *geometry, unsigned int chapters_in_cache, - unsigned int read_queue_max_size, unsigned int zone_count, struct page_cache **cache_ptr) { @@ -361,10 +325,7 @@ int make_page_cache(const struct geometry *geometry, return uds_log_warning_strerror(UDS_BAD_STATE, "cache size must be at least one chapter"); } - if (read_queue_max_size <= 0) { - return uds_log_warning_strerror(UDS_INVALID_ARGUMENT, - "read queue max size must be greater than 0"); - } + if (zone_count < 1) { return uds_log_warning_strerror(UDS_INVALID_ARGUMENT, "cache must have at least one zone"); @@ -378,7 +339,6 @@ int make_page_cache(const struct geometry *geometry, result = initialize_page_cache(cache, geometry, chapters_in_cache, - read_queue_max_size, zone_count); if (result != UDS_SUCCESS) { free_page_cache(cache); @@ -389,7 +349,6 @@ int make_page_cache(const struct geometry *geometry, return UDS_SUCCESS; } -/**********************************************************************/ void free_page_cache(struct page_cache *cache) { if (cache == NULL) { @@ -397,6 +356,7 @@ void free_page_cache(struct page_cache *cache) } if (cache->cache != NULL) { unsigned int i; + for (i = 0; i < cache->num_cache_entries; i++) { destroy_volume_page(&cache->cache[i].cp_page_data); } @@ -408,30 +368,29 @@ void free_page_cache(struct page_cache *cache) UDS_FREE(cache); } -/**********************************************************************/ void invalidate_page_cache(struct page_cache *cache) { unsigned int i; + for (i = 0; i < cache->num_index_entries; i++) { cache->index[i] = cache->num_cache_entries; } for (i = 0; i < cache->num_cache_entries; i++) { struct cached_page *page = &cache->cache[i]; + release_volume_page(&page->cp_page_data); clear_cache_page(cache, page); } } 
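The recency bookkeeping in this file (find_invalidate_and_make_least_recent() above, make_page_most_recent() and get_least_recent_page() below) is a clock scheme rather than a linked LRU list: each access stamps a page with the next value of a monotonically increasing counter, invalidation resets the stamp to 0 so the page becomes the first eviction candidate, and victim selection scans for the smallest stamp. A minimal standalone sketch of that idea, using invented toy_* names that are not part of the VDO/UDS sources (the real code additionally uses atomics and skips pages with reads pending)::

    #include <stdint.h>
    #include <stddef.h>

    /* Toy model of the clock-stamped recency tracking used by the page cache. */
    struct toy_page {
        uint64_t last_used;    /* 0 means "evict me first" */
    };

    struct toy_cache {
        uint64_t clock;        /* monotonically increasing access clock */
        struct toy_page pages[16];
    };

    /* Stamp a page with the next clock value on every access. */
    void toy_make_most_recent(struct toy_cache *cache, struct toy_page *page)
    {
        page->last_used = ++cache->clock;
    }

    /* Invalidation: make the page the least recently used without moving it. */
    void toy_make_least_recent(struct toy_page *page)
    {
        page->last_used = 0;
    }

    /* Victim selection: a linear scan for the smallest stamp. */
    struct toy_page *toy_select_victim(struct toy_cache *cache)
    {
        size_t i, oldest = 0;

        for (i = 1; i < 16; i++)
            if (cache->pages[i].last_used < cache->pages[oldest].last_used)
                oldest = i;
        return &cache->pages[oldest];
    }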
-/**********************************************************************/
 int invalidate_page_cache_for_chapter(struct page_cache *cache,
 				      unsigned int chapter,
-				      unsigned int pages_per_chapter,
-				      enum invalidation_reason reason)
+				      unsigned int pages_per_chapter)
 {
 	int result;
 	unsigned int i;
 
-	// We hold the readThreadsMutex.
+	/* We hold the readThreadsMutex. */
 	if ((cache == NULL) || (cache->cache == NULL)) {
 		return UDS_SUCCESS;
 	}
@@ -439,12 +398,9 @@ int invalidate_page_cache_for_chapter(struct page_cache *cache,
 	for (i = 0; i < pages_per_chapter; i++) {
 		unsigned int physical_page =
 			1 + (pages_per_chapter * chapter) + i;
-		result =
-			find_invalidate_and_make_least_recent(cache,
-							      physical_page,
-							      cache->read_queue,
-							      reason,
-							      false);
+		result = find_invalidate_and_make_least_recent(cache,
+							       physical_page,
+							       false);
 		if (result != UDS_SUCCESS) {
 			return result;
 		}
@@ -453,12 +409,13 @@ int invalidate_page_cache_for_chapter(struct page_cache *cache,
 	return UDS_SUCCESS;
 }
 
-/**********************************************************************/
 void make_page_most_recent(struct page_cache *cache, struct cached_page *page)
 {
-	// ASSERTION: We are either a zone thread holding a
-	// search_pending_counter, or we are any thread holding the
-	// readThreadsMutex.
+	/*
+	 * ASSERTION: We are either a zone thread holding a
+	 * search_pending_counter, or we are any thread holding the
+	 * readThreadsMutex.
+	 */
 	if (atomic64_read(&cache->clock) != READ_ONCE(page->cp_last_used)) {
 		WRITE_ONCE(page->cp_last_used,
 			   atomic64_inc_return(&cache->clock));
@@ -477,15 +434,18 @@ void make_page_most_recent(struct page_cache *cache, struct cached_page *page)
 static int __must_check get_least_recent_page(struct page_cache *cache,
 					      struct cached_page **page_ptr)
 {
-	// We hold the readThreadsMutex.
+	/* We hold the readThreadsMutex. */
 	int oldest_index = 0;
-	// Our first candidate is any page that does have a pending read. We
-	// ensure above that there are more entries than read threads, so there
-	// must be one.
+	/*
+	 * Our first candidate is any page that does not have a pending read. We
+	 * ensure above that there are more entries than read threads, so there
+	 * must be one.
+	 */
 	unsigned int i;
+
 	for (i = 0;; i++) {
 		if (i >= cache->num_cache_entries) {
-			// This should never happen.
+			/* This should never happen. */
 			return ASSERT(false, "oldest page is not NULL");
 		}
 		if (!cache->cache[i].cp_read_pending) {
@@ -493,8 +453,10 @@ static int __must_check get_least_recent_page(struct page_cache *cache,
 			break;
 		}
 	}
-	// Now find the least recently used page that does not have a pending
-	// read.
+	/*
+	 * Now find the least recently used page that does not have a pending
+	 * read.
+	 */
 	for (i = 0; i < cache->num_cache_entries; i++) {
 		if (!cache->cache[i].cp_read_pending &&
 		    (READ_ONCE(cache->cache[i].cp_last_used) <=
@@ -506,53 +468,30 @@ static int __must_check get_least_recent_page(struct page_cache *cache,
 	return UDS_SUCCESS;
 }
 
-/**********************************************************************/
 int get_page_from_cache(struct page_cache *cache,
 			unsigned int physical_page,
-			int probe_type,
-			struct cached_page **page_ptr)
+			struct cached_page **page)
 {
-	// ASSERTION: We are in a zone thread.
-	// ASSERTION: We holding a search_pending_counter or the
-	// readThreadsMutex.
- enum cache_result_kind cache_result; - struct cached_page *page; - int result, queue_index = -1; - - if (cache == NULL) { - return uds_log_warning_strerror(UDS_BAD_STATE, - "cannot get page with NULL cache"); - } - - // Get the cache page from the index - result = get_page_no_stats(cache, physical_page, &queue_index, &page); - if (result != UDS_SUCCESS) { - return result; - } - - cache_result = ((page != NULL) ? CACHE_RESULT_HIT : - ((queue_index != -1) ? - CACHE_RESULT_QUEUED : - CACHE_RESULT_MISS)); - increment_cache_counter(&cache->counters, probe_type, cache_result); + /* + * ASSERTION: We are in a zone thread. + * ASSERTION: We holding a search_pending_counter or the + * readThreadsMutex. + */ + int queue_index = -1; - if (page_ptr != NULL) { - *page_ptr = page; - } - return UDS_SUCCESS; + return get_page_and_index(cache, physical_page, &queue_index, page); } -/**********************************************************************/ int enqueue_read(struct page_cache *cache, struct uds_request *request, unsigned int physical_page) { int result; - // We hold the readThreadsMutex. + /* We hold the readThreadsMutex. */ uint16_t first = cache->read_queue_first; uint16_t last = cache->read_queue_last; - uint16_t next = (last + 1) % cache->read_queue_max_size; + uint16_t next = next_read_queue_position(last); uint16_t read_queue_pos; if ((cache->index[physical_page] & VOLUME_CACHE_QUEUED_FLAG) == 0) { @@ -580,7 +519,7 @@ int enqueue_read(struct page_cache *cache, cache->index[physical_page] & ~VOLUME_CACHE_QUEUED_FLAG; } - result = ASSERT((read_queue_pos < cache->read_queue_max_size), + result = ASSERT((read_queue_pos < VOLUME_CACHE_MAX_QUEUED_READS), "queue is not overfull"); if (result != UDS_SUCCESS) { return result; @@ -596,20 +535,19 @@ int enqueue_read(struct page_cache *cache, return UDS_QUEUED; } -/**********************************************************************/ bool reserve_read_queue_entry(struct page_cache *cache, unsigned int *queue_pos, struct uds_request **first_request, unsigned int *physical_page, bool *invalid) { - // We hold the readThreadsMutex. + /* We hold the readThreadsMutex. */ uint16_t last_read = cache->read_queue_last_read; unsigned int page_no; uint16_t index_value; bool is_invalid, queued; - // No items to dequeue + /* No items to dequeue */ if (last_read == cache->read_queue_last) { return false; } @@ -620,15 +558,19 @@ bool reserve_read_queue_entry(struct page_cache *cache, index_value = cache->index[page_no]; queued = (index_value & VOLUME_CACHE_QUEUED_FLAG) != 0; - // ALB-1429 ... need to check to see if its still queued before - // resetting + /* + * ALB-1429 ... need to check to see if its still queued before + * resetting + */ if (is_invalid && queued) { - // invalidate cache index slot + /* invalidate cache index slot */ WRITE_ONCE(cache->index[page_no], cache->num_cache_entries); } - // If a sync read has taken this page, set invalid to true so we don't - // overwrite, we simply just requeue requests. + /* + * If a sync read has taken this page, set invalid to true so we don't + * overwrite, we simply just requeue requests. 
+ */ if (!queued) { is_invalid = true; } @@ -639,35 +581,32 @@ bool reserve_read_queue_entry(struct page_cache *cache, *first_request = cache->read_queue[last_read].request_list.first; *physical_page = page_no; *invalid = is_invalid; - cache->read_queue_last_read = - (last_read + 1) % cache->read_queue_max_size; + cache->read_queue_last_read = next_read_queue_position(last_read); return true; } -/**********************************************************************/ void release_read_queue_entry(struct page_cache *cache, unsigned int queue_pos) { - // We hold the readThreadsMutex. + /* We hold the readThreadsMutex. */ uint16_t last_read = cache->read_queue_last_read; cache->read_queue[queue_pos].reserved = false; - // Move the read_queue_first pointer along when we can + /* Move the read_queue_first pointer along when we can */ while ((cache->read_queue_first != last_read) && (!cache->read_queue[cache->read_queue_first].reserved)) { - cache->read_queue_first = (cache->read_queue_first + 1) % - cache->read_queue_max_size; + cache->read_queue_first = + next_read_queue_position(cache->read_queue_first); } } -/**********************************************************************/ int select_victim_in_cache(struct page_cache *cache, struct cached_page **page_ptr) { struct cached_page *page = NULL; int result; - // We hold the readThreadsMutex. + /* We hold the readThreadsMutex. */ if (cache == NULL) { return uds_log_warning_strerror(UDS_BAD_STATE, "cannot put page in NULL cache"); @@ -683,10 +622,11 @@ int select_victim_in_cache(struct page_cache *cache, return result; } - // If the page is currently being pointed to by the page map, clear - // it from the page map, and update cache stats + /* + * If the page is currently being pointed to by the page map, clear + * it from the page map. + */ if (page->cp_physical_page != cache->num_index_entries) { - cache->counters.evictions++; WRITE_ONCE(cache->index[page->cp_physical_page], cache->num_cache_entries); wait_for_pending_searches(cache, page->cp_physical_page); @@ -699,7 +639,6 @@ int select_victim_in_cache(struct page_cache *cache, return UDS_SUCCESS; } -/**********************************************************************/ int put_page_in_cache(struct page_cache *cache, unsigned int physical_page, struct cached_page *page) @@ -707,7 +646,7 @@ int put_page_in_cache(struct page_cache *cache, uint16_t value; int result; - // We hold the readThreadsMutex. + /* We hold the readThreadsMutex. */ if (cache == NULL) { return uds_log_warning_strerror(UDS_BAD_STATE, "cannot complete page in NULL cache"); @@ -728,7 +667,7 @@ int put_page_in_cache(struct page_cache *cache, page->cp_physical_page = physical_page; - // Figure out the index into the cache array using pointer arithmetic + /* Figure out the index into the cache array using pointer arithmetic */ value = page - cache->cache; result = ASSERT((value < cache->num_cache_entries), "cache index is valid"); @@ -744,23 +683,22 @@ int put_page_in_cache(struct page_cache *cache, * We hold the readThreadsMutex, but we must have a write memory * barrier before making the cached_page available to the readers * that do not hold the mutex. The corresponding read memory - * barrier is in get_page_no_stats. + * barrier is in get_page_and_index(). */ smp_wmb(); - // Point the page map to the new page. Will clear queued flag + /* Point the page map to the new page. 
Will clear queued flag */ WRITE_ONCE(cache->index[physical_page], value); return UDS_SUCCESS; } -/**********************************************************************/ void cancel_page_in_cache(struct page_cache *cache, unsigned int physical_page, struct cached_page *page) { int result; - // We hold the readThreadsMutex. + /* We hold the readThreadsMutex. */ if (cache == NULL) { uds_log_warning("cannot cancel page in NULL cache"); return; @@ -780,11 +718,10 @@ void cancel_page_in_cache(struct page_cache *cache, clear_cache_page(cache, page); page->cp_read_pending = false; - // Clear the page map for the new page. Will clear queued flag + /* Clear the page map for the new page. Will clear queued flag */ WRITE_ONCE(cache->index[physical_page], cache->num_cache_entries); } -/**********************************************************************/ size_t get_page_cache_size(struct page_cache *cache) { if (cache == NULL) { @@ -792,4 +729,3 @@ size_t get_page_cache_size(struct page_cache *cache) } return sizeof(struct delta_index_page) * cache->num_cache_entries; } - diff --git a/uds/pageCache.h b/vdo/page-cache.h similarity index 75% rename from uds/pageCache.h rename to vdo/page-cache.h index 64070f75..6cb3a5c0 100644 --- a/uds/pageCache.h +++ b/vdo/page-cache.h @@ -1,22 +1,6 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ /* * Copyright Red Hat - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/uds-releases/krusty/src/uds/pageCache.h#21 $ */ #ifndef PAGE_CACHE_H @@ -24,14 +8,12 @@ #include -#include "cacheCounters.h" -#include "chapterIndex.h" +#include "chapter-index.h" #include "common.h" #include "compiler.h" -#include "indexConfig.h" +#include "geometry.h" #include "permassert.h" -#include "request.h" -#include "volumeStore.h" +#include "volume-store.h" struct request_list { struct uds_request *first; @@ -54,7 +36,7 @@ struct cached_page { enum { VOLUME_CACHE_MAX_ENTRIES = (UINT16_MAX >> 1), VOLUME_CACHE_QUEUED_FLAG = (1 << 15), - VOLUME_CACHE_DEFAULT_MAX_QUEUED_READS = 4096 + VOLUME_CACHE_MAX_QUEUED_READS = 4096, }; struct queued_read { @@ -68,14 +50,6 @@ struct queued_read { struct request_list request_list; }; -// Reason for invalidating a cache entry, used for gathering statistics -enum invalidation_reason { - INVALIDATION_EVICT, // cache is full, goodbye - INVALIDATION_EXPIRE, // your chapter is being overwritten - INVALIDATION_ERROR, // error happened; don't try to use data - INVALIDATION_INIT_SHUTDOWN -}; - /* * Value stored atomically in a search_pending_counter. The low order * 32 bits is the physical page number of the cached page being read. @@ -88,38 +62,45 @@ enum invalidation_reason { * the value in the wait_for_pending_searches method. */ typedef int64_t invalidate_counter_t; -// Fields of invalidate_counter_t. -// These must be 64 bit, so an enum cannot be not used. 
-#define PAGE_FIELD ((long) UINT_MAX) // The page number field -#define COUNTER_LSB (PAGE_FIELD + 1L) // The LSB of the counter field +/* + * Fields of invalidate_counter_t. + * These must be 64 bit, so an enum cannot be not used. + */ +#define PAGE_FIELD ((long) UINT_MAX) /* The page number field */ +#define COUNTER_LSB (PAGE_FIELD + 1L) /* The LSB of the counter field */ struct __attribute__((aligned(CACHE_LINE_BYTES))) search_pending_counter { atomic64_t atomic_value; }; struct page_cache { - // Geometry governing the volume + /* Geometry governing the volume */ const struct geometry *geometry; - // The number of zones + /* The number of zones */ unsigned int zone_count; - // The number of index entries + /* The number of index entries */ unsigned int num_index_entries; - // The max number of cached entries + /* The max number of cached entries */ uint16_t num_cache_entries; - // The index used to quickly access page in cache - top bit is a - // 'queued' flag + /* + * The index used to quickly access page in cache - top bit is a + * 'queued' flag + */ uint16_t *index; - // The cache + /* The cache */ struct cached_page *cache; - // A counter for each zone to keep track of when a search is occurring - // within that zone. + /* + * A counter for each zone to keep track of when a search is occurring + * within that zone. + */ struct search_pending_counter *search_pending_counters; - // Queued reads, as a circular array, with first and last indexes + /* Queued reads, as a circular array, with first and last indexes */ struct queued_read *read_queue; - // Cache counters for stats. This is the first field of a - // page_cache that is not constant after the struct is - // initialized. - struct cache_counters counters; + /* + * All entries above this point are constant once the structure has + * been initialized. + */ + /** * Entries are enqueued at read_queue_last. * To 'reserve' entries, we get the entry pointed to by @@ -146,26 +127,22 @@ struct page_cache { uint16_t read_queue_first; uint16_t read_queue_last_read; uint16_t read_queue_last; - // The size of the read queue - unsigned int read_queue_max_size; - // Page access counter + /* Page access counter */ atomic64_t clock; }; /** * Allocate a cache for a volume. 
* - * @param geometry The geometry governing the volume - * @param chapters_in_cache The size (in chapters) of the page cache - * @param read_queue_max_size The maximum size of the read queue - * @param zone_count The number of zones in the index - * @param cache_ptr A pointer to hold the new page cache + * @param geometry The geometry governing the volume + * @param chapters_in_cache The size (in chapters) of the page cache + * @param zone_count The number of zones in the index + * @param cache_ptr A pointer to hold the new page cache * * @return UDS_SUCCESS or an error code **/ int __must_check make_page_cache(const struct geometry *geometry, unsigned int chapters_in_cache, - unsigned int read_queue_max_size, unsigned int zone_count, struct page_cache **cache_ptr); @@ -189,34 +166,13 @@ void invalidate_page_cache(struct page_cache *cache); * @param cache the page cache * @param chapter the chapter * @param pages_per_chapter the number of pages per chapter - * @param reason the reason for invalidation * * @return UDS_SUCCESS or an error code **/ int __must_check invalidate_page_cache_for_chapter(struct page_cache *cache, unsigned int chapter, - unsigned int pages_per_chapter, - enum invalidation_reason reason); - -/** - * Find a page, invalidate it, and make its memory the least recent. This - * method is only exposed for the use of unit tests. - * - * @param cache The cache containing the page - * @param physical_page The id of the page to invalidate - * @param read_queue The queue of pending reads (may be NULL) - * @param reason The reason for the invalidation, for stats - * @param must_find If true, it is an error if the page - * can't be found - * - * @return UDS_SUCCESS or an error code - **/ -int find_invalidate_and_make_least_recent(struct page_cache *cache, - unsigned int physical_page, - struct queued_read *read_queue, - enum invalidation_reason reason, - bool must_find); + unsigned int pages_per_chapter); /** * Make the page the most recent in the cache @@ -244,17 +200,13 @@ int __must_check assert_page_in_cache(struct page_cache *cache, * * @param [in] cache the page cache * @param [in] physical_page the page number - * @param [in] probe_type the type of cache access being done - * (cache_probe_type optionally OR'ed with - * CACHE_PROBE_IGNORE_FAILURE) - * @param [out] page_ptr the found page + * @param [out] page the found page * * @return UDS_SUCCESS or an error code **/ int __must_check get_page_from_cache(struct page_cache *cache, unsigned int physical_page, - int probe_type, - struct cached_page **page_ptr); + struct cached_page **page); /** * Enqueue a read request @@ -299,6 +251,18 @@ bool reserve_read_queue_entry(struct page_cache *cache, void release_read_queue_entry(struct page_cache *cache, unsigned int queue_pos); +/** + * Return the next read queue entry position after the given position. + * + * @param position The read queue entry position to increment + * + * @return the position of the next read queue entry + **/ +static INLINE uint16_t next_read_queue_position(uint16_t position) +{ + return (position + 1) % VOLUME_CACHE_MAX_QUEUED_READS; +} + /** * Check for the page cache read queue being full. 
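The next_read_queue_position() helper added above, together with read_queue_is_full() just below, treats the read queue as an ordinary ring buffer: an index advances modulo VOLUME_CACHE_MAX_QUEUED_READS, and the queue is full when advancing the tail would collide with the head. A minimal userspace sketch of the same arithmetic follows; struct read_queue, next_position() and queue_is_full() are illustrative stand-ins for the patch's page_cache fields and helpers, not the kernel code itself.

::

  #include <stdbool.h>
  #include <stdint.h>
  #include <stdio.h>

  #define MAX_QUEUED_READS 4096

  struct read_queue {
          uint16_t first;     /* oldest entry that is still reserved */
          uint16_t last_read; /* next entry to hand to a reader thread */
          uint16_t last;      /* next free slot for an incoming request */
  };

  static uint16_t next_position(uint16_t position)
  {
          return (position + 1) % MAX_QUEUED_READS;
  }

  static bool queue_is_full(const struct read_queue *queue)
  {
          return queue->first == next_position(queue->last);
  }

  int main(void)
  {
          struct read_queue queue = { 0, 0, 0 };
          unsigned int enqueued = 0;

          while (!queue_is_full(&queue)) {
                  queue.last = next_position(queue.last);
                  enqueued++;
          }
          printf("%u entries fit before the queue reports full\n", enqueued);
          return 0;
  }

Leaving one slot unused is the usual price of telling a full ring from an empty one without keeping a separate element count.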
* @@ -309,7 +273,7 @@ void release_read_queue_entry(struct page_cache *cache, static INLINE bool read_queue_is_full(struct page_cache *cache) { return (cache->read_queue_first == - (cache->read_queue_last + 1) % cache->read_queue_max_size); + next_read_queue_position(cache->read_queue_last)); } /** @@ -368,7 +332,6 @@ void cancel_page_in_cache(struct page_cache *cache, **/ size_t __must_check get_page_cache_size(struct page_cache *cache); - /** * Read the invalidate counter for the given zone. * @@ -469,9 +432,11 @@ static INLINE void end_pending_search(struct page_cache *cache, unsigned int zone_number) { invalidate_counter_t invalidate_counter; - // This memory barrier ensures that this thread completes reads of the - // cached page before other threads see the write to the invalidate - // counter. + /* + * This memory barrier ensures that this thread completes reads of the + * cached page before other threads see the write to the invalidate + * counter. + */ smp_mb(); invalidate_counter = get_invalidate_counter(cache, zone_number); diff --git a/vdo/partitionCopy.c b/vdo/partitionCopy.c deleted file mode 100644 index d812fa74..00000000 --- a/vdo/partitionCopy.c +++ /dev/null @@ -1,262 +0,0 @@ -/* - * Copyright Red Hat - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/sulfur/src/c++/vdo/base/partitionCopy.c#18 $ - */ - -#include "partitionCopy.h" - -#include "memoryAlloc.h" -#include "permassert.h" - -#include "completion.h" -#include "constants.h" -#include "extent.h" -#include "numUtils.h" - -enum { - STRIDE_LENGTH = 2048 -}; - -/** - * A partition copy completion. - **/ -struct copy_completion { - /** completion header */ - struct vdo_completion completion; - /** the source partition to copy from */ - struct partition *source; - /** the target partition to copy to */ - struct partition *target; - /** the current in-partition PBN the copy is beginning at */ - physical_block_number_t current_index; - /** the last block to copy */ - physical_block_number_t ending_index; - /** the backing data used by the extent */ - char *data; - /** the extent being used to copy */ - struct vdo_extent *extent; -}; - -/** - * Convert a vdo_completion to a copy_completion. - * - * @param completion The completion to convert - * - * @return the completion as a copy_completion - **/ -static inline struct copy_completion * __must_check -as_copy_completion(struct vdo_completion *completion) -{ - assert_vdo_completion_type(completion->type, - VDO_PARTITION_COPY_COMPLETION); - return container_of(completion, struct copy_completion, completion); -} - -/** - * Free a copy completion. 
- * - * @param copy The copy completion to free - **/ -static void free_copy_completion(struct copy_completion *copy) -{ - free_vdo_extent(UDS_FORGET(copy->extent)); - UDS_FREE(copy->data); - UDS_FREE(copy); -} - -/**********************************************************************/ -int make_vdo_copy_completion(struct vdo *vdo, - struct vdo_completion **completion_ptr) -{ - struct copy_completion *copy; - int result = UDS_ALLOCATE(1, struct copy_completion, __func__, ©); - if (result != VDO_SUCCESS) { - return result; - } - - initialize_vdo_completion(©->completion, vdo, - VDO_PARTITION_COPY_COMPLETION); - - result = UDS_ALLOCATE((VDO_BLOCK_SIZE * STRIDE_LENGTH), - char, - "partition copy extent", - ©->data); - if (result != VDO_SUCCESS) { - free_copy_completion(UDS_FORGET(copy)); - return result; - } - - result = create_vdo_extent(vdo, - VIO_TYPE_PARTITION_COPY, - VIO_PRIORITY_HIGH, - STRIDE_LENGTH, - copy->data, - ©->extent); - if (result != VDO_SUCCESS) { - free_copy_completion(copy); - return result; - } - - *completion_ptr = ©->completion; - return VDO_SUCCESS; -} - -/**********************************************************************/ -void free_vdo_copy_completion(struct vdo_completion *completion) -{ - if (completion == NULL) { - return; - } - - free_copy_completion(as_copy_completion(UDS_FORGET(completion))); -} - -/**********************************************************************/ -static void copy_partition_stride(struct copy_completion *copy); - -/** - * Determine the number of blocks to copy in the current stride. - * - * @param copy The copy completion - * - * @return The number of blocks to copy in the current stride - **/ -static inline block_count_t get_stride_size(struct copy_completion *copy) -{ - return min((block_count_t) STRIDE_LENGTH, - copy->ending_index - copy->current_index); -} - -/** - * Process a completed write during a partition copy. - * - * @param completion The extent which has just completed writing - **/ -static void complete_write_for_copy(struct vdo_completion *completion) -{ - struct copy_completion *copy = as_copy_completion(completion->parent); - copy->current_index += get_stride_size(copy); - if (copy->current_index >= copy->ending_index) { - // We're done. - finish_vdo_completion(completion->parent, VDO_SUCCESS); - return; - } - copy_partition_stride(copy); -} - -/** - * Process a completed read during a partition copy, and launch the - * corresponding write to the new partition. - * - * @param completion The extent which has just completed reading - **/ -static void complete_read_for_copy(struct vdo_completion *completion) -{ - struct copy_completion *copy = as_copy_completion(completion->parent); - physical_block_number_t layer_start_block; - int result = vdo_translate_to_pbn(copy->target, copy->current_index, - &layer_start_block); - if (result != VDO_SUCCESS) { - finish_vdo_completion(completion->parent, result); - return; - } - - completion->callback = complete_write_for_copy; - write_partial_vdo_metadata_extent(vdo_completion_as_extent(completion), - layer_start_block, - get_stride_size(copy)); -} - -/** - * Copy a stride from one partition to the new partition. 
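The partitionCopy.c code being removed here walked the source partition STRIDE_LENGTH blocks at a time, reading one stride and then writing it before advancing current_index toward ending_index. The control flow roughly amounts to the following synchronous sketch (the real code drove the same steps through vdo completions); stride_size() mirrors get_stride_size() and the block counts are made up for illustration.

::

  #include <stdio.h>

  #define STRIDE_LENGTH 2048

  static unsigned long stride_size(unsigned long current, unsigned long ending)
  {
          unsigned long remaining = ending - current;

          return remaining < STRIDE_LENGTH ? remaining : STRIDE_LENGTH;
  }

  int main(void)
  {
          unsigned long current = 0, ending = 5000, strides = 0;

          while (current < ending) {
                  /* read stride_size() blocks from the source partition,
                     then write them to the target at the same offset */
                  current += stride_size(current, ending);
                  strides++;
          }
          printf("copied %lu blocks in %lu strides\n", ending, strides);
          return 0;
  }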
- * - * @param copy The copy_completion - **/ -static void copy_partition_stride(struct copy_completion *copy) -{ - physical_block_number_t layer_start_block; - int result = vdo_translate_to_pbn(copy->source, copy->current_index, - &layer_start_block); - if (result != VDO_SUCCESS) { - finish_vdo_completion(©->completion, result); - return; - } - - prepare_vdo_completion(©->extent->completion, - complete_read_for_copy, - finish_vdo_completion_parent_callback, - copy->completion.callback_thread_id, - ©->completion); - read_partial_vdo_metadata_extent(copy->extent, layer_start_block, - get_stride_size(copy)); -} - -/** - * Verify that the source can be copied to the target safely. - * - * @param source The source partition - * @param target The target partition - * - * @return VDO_SUCCESS or an error code - **/ -static int validate_partition_copy(struct partition *source, - struct partition *target) -{ - block_count_t source_size = get_vdo_fixed_layout_partition_size(source); - block_count_t target_size = get_vdo_fixed_layout_partition_size(target); - - physical_block_number_t source_start = - get_vdo_fixed_layout_partition_offset(source); - physical_block_number_t source_end = source_start + source_size; - physical_block_number_t target_start = - get_vdo_fixed_layout_partition_offset(target); - physical_block_number_t target_end = target_start + target_size; - - int result = ASSERT(source_size <= target_size, - "target partition must be not smaller than source partition"); - if (result != UDS_SUCCESS) { - return result; - } - - return ASSERT(((source_end <= target_start) || - (target_end <= source_start)), - "target partition must not overlap source partition"); -} - -/**********************************************************************/ -void copy_vdo_partition(struct vdo_completion *completion, - struct partition *source, - struct partition *target, - struct vdo_completion *parent) -{ - struct copy_completion *copy = as_copy_completion(completion); - - int result = validate_partition_copy(source, target); - if (result != VDO_SUCCESS) { - finish_vdo_completion(parent, result); - return; - } - - prepare_vdo_completion_to_finish_parent(©->completion, parent); - copy->source = source; - copy->target = target; - copy->current_index = 0; - copy->ending_index = get_vdo_fixed_layout_partition_size(source); - copy_partition_stride(copy); -} diff --git a/vdo/partitionCopy.h b/vdo/partitionCopy.h deleted file mode 100644 index be01733e..00000000 --- a/vdo/partitionCopy.h +++ /dev/null @@ -1,60 +0,0 @@ -/* - * Copyright Red Hat - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/sulfur/src/c++/vdo/base/partitionCopy.h#9 $ - */ - -#ifndef PARTITION_COPY_H -#define PARTITION_COPY_H - -#include "fixedLayout.h" -#include "types.h" - -/** - * Make a copy completion. 
- * - * @param [in] vdo The VDO on which the partitions reside - * @param [out] completion_ptr A pointer to hold the copy completion - * - * @return VDO_SUCCESS or an error - **/ -int __must_check -make_vdo_copy_completion(struct vdo *vdo, - struct vdo_completion **completion_ptr); - -/** - * Free a copy completion. - * - * @param completion The completion to free - **/ -void free_vdo_copy_completion(struct vdo_completion *completion); - -/** - * Copy a partition. - * - * @param completion The copy completion to use - * @param source The partition to copy from - * @param target The partition to copy to - * @param parent The parent to finish when the copy is complete - **/ -void copy_vdo_partition(struct vdo_completion *completion, - struct partition *source, - struct partition *target, - struct vdo_completion *parent); - -#endif /* PARTITION_COPY_H */ diff --git a/vdo/pbn-lock-pool.c b/vdo/pbn-lock-pool.c new file mode 100644 index 00000000..e677f1ae --- /dev/null +++ b/vdo/pbn-lock-pool.c @@ -0,0 +1,165 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright Red Hat + */ + +#include "pbn-lock-pool.h" + +#include + +#include "logger.h" +#include "memory-alloc.h" +#include "permassert.h" + +#include "pbn-lock.h" +#include "status-codes.h" + +/** + * union idle_pbn_lock - PBN lock list entries. + * + * Unused (idle) PBN locks are kept in a list. Just like in a malloc + * implementation, the lock structure is unused memory, so we can save a bit + * of space (and not pollute the lock structure proper) by using a union to + * overlay the lock structure with the free list. + */ +typedef union { + /** @entry: Only used while locks are in the pool. */ + struct list_head entry; + /** @lock: Only used while locks are not in the pool. */ + struct pbn_lock lock; +} idle_pbn_lock; + +/** + * struct pbn_lock_pool - list of PBN locks. + * + * The lock pool is little more than the memory allocated for the locks. + */ +struct pbn_lock_pool { + /** @capacity: The number of locks allocated for the pool. */ + size_t capacity; + /** @borrowed: The number of locks currently borrowed from the pool. */ + size_t borrowed; + /** @idle_list: A list containing all idle PBN lock instances. */ + struct list_head idle_list; + /** @locks: The memory for all the locks allocated by this pool. */ + idle_pbn_lock locks[]; +}; + +/** + * vdo_make_pbn_lock_pool() - Create a new PBN lock pool and all the lock + * instances it can loan out. + * @capacity: The number of PBN locks to allocate for the pool. + * @pool_ptr: A pointer to receive the new pool. + * + * Return: VDO_SUCCESS or an error code. + */ +int vdo_make_pbn_lock_pool(size_t capacity, struct pbn_lock_pool **pool_ptr) +{ + size_t i; + struct pbn_lock_pool *pool; + int result = UDS_ALLOCATE_EXTENDED(struct pbn_lock_pool, capacity, + idle_pbn_lock, __func__, &pool); + if (result != VDO_SUCCESS) { + return result; + } + + pool->capacity = capacity; + pool->borrowed = capacity; + INIT_LIST_HEAD(&pool->idle_list); + + for (i = 0; i < capacity; i++) { + vdo_return_pbn_lock_to_pool(pool, &pool->locks[i].lock); + } + + *pool_ptr = pool; + return VDO_SUCCESS; +} + +/** + * vdo_free_pbn_lock_pool() - Free a PBN lock pool. + * @pool: The lock pool to free. + * + * This also frees all the PBN locks it allocated, so the caller must ensure + * that all locks have been returned to the pool. 
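The new pbn_lock_pool above relies on the idle_pbn_lock union trick: while a lock is idle its memory doubles as a free-list node, so the pool needs no storage beyond the lock array itself. Below is a minimal, self-contained userspace sketch of the same pattern; struct pool, pool_borrow() and pool_return() are illustrative stand-ins for vdo_make_pbn_lock_pool(), vdo_borrow_pbn_lock_from_pool() and vdo_return_pbn_lock_to_pool(), and the tiny list helpers replace <linux/list.h>.

::

  #include <stddef.h>
  #include <stdio.h>
  #include <string.h>

  struct list_head { struct list_head *prev, *next; };

  static void list_init(struct list_head *head) { head->prev = head->next = head; }
  static int list_empty(const struct list_head *head) { return head->next == head; }

  static void list_add_tail(struct list_head *node, struct list_head *head)
  {
          node->prev = head->prev;
          node->next = head;
          head->prev->next = node;
          head->prev = node;
  }

  static void list_del(struct list_head *node)
  {
          node->prev->next = node->next;
          node->next->prev = node->prev;
  }

  struct lock { unsigned int holder_count; };

  /* While a lock is idle its memory doubles as the free-list node. */
  typedef union {
          struct list_head entry; /* valid only while in the pool */
          struct lock lock;       /* valid only while borrowed */
  } idle_lock;

  struct pool {
          size_t capacity;
          size_t borrowed;
          struct list_head idle_list;
          idle_lock locks[8];
  };

  static void pool_return(struct pool *pool, struct lock *lock)
  {
          /* The lock overlays the union, so the cast recovers the node. */
          idle_lock *idle = (idle_lock *) lock;

          memset(lock, 0, sizeof(*lock)); /* catch use-after-return quickly */
          list_add_tail(&idle->entry, &pool->idle_list);
          pool->borrowed--;
  }

  static struct lock *pool_borrow(struct pool *pool)
  {
          struct list_head *entry;

          if (pool->borrowed >= pool->capacity || list_empty(&pool->idle_list))
                  return NULL; /* the pool never grows on demand */
          pool->borrowed++;
          entry = pool->idle_list.prev;
          list_del(entry);
          return &((idle_lock *) entry)->lock;
  }

  int main(void)
  {
          struct pool pool = { .capacity = 8, .borrowed = 8 };
          struct lock *lock;
          size_t i;

          list_init(&pool.idle_list);
          for (i = 0; i < pool.capacity; i++)
                  pool_return(&pool, &pool.locks[i].lock);

          lock = pool_borrow(&pool);
          printf("borrowed %p, %zu of %zu on loan\n",
                 (void *) lock, pool.borrowed, pool.capacity);
          pool_return(&pool, lock);
          return 0;
  }

Starting with borrowed equal to capacity and "returning" every lock during construction, as the patch does, keeps a single code path for putting locks on the idle list.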
+ */ +void vdo_free_pbn_lock_pool(struct pbn_lock_pool *pool) +{ + if (pool == NULL) { + return; + } + + ASSERT_LOG_ONLY(pool->borrowed == 0, + "All PBN locks must be returned to the pool before it is freed, but %zu locks are still on loan", + pool->borrowed); + UDS_FREE(pool); +} + +/** + * vdo_borrow_pbn_lock_from_pool() - Borrow a PBN lock from the pool and + * initialize it with the provided type. + * @pool: The pool from which to borrow. + * @type: The type with which to initialize the lock. + * @lock_ptr: A pointer to receive the borrowed lock. + * + * Pools do not grow on demand or allocate memory, so this will fail if the + * pool is empty. Borrowed locks are still associated with this pool and must + * be returned to only this pool. + * + * Return: VDO_SUCCESS, or VDO_LOCK_ERROR if the pool is empty. + */ +int vdo_borrow_pbn_lock_from_pool(struct pbn_lock_pool *pool, + enum pbn_lock_type type, + struct pbn_lock **lock_ptr) +{ + int result; + struct list_head *idle_entry; + idle_pbn_lock *idle; + + if (pool->borrowed >= pool->capacity) { + return uds_log_error_strerror(VDO_LOCK_ERROR, + "no free PBN locks left to borrow"); + } + pool->borrowed += 1; + + result = ASSERT(!list_empty(&pool->idle_list), + "idle list should not be empty if pool not at capacity"); + if (result != VDO_SUCCESS) { + return result; + } + + idle_entry = pool->idle_list.prev; + list_del(idle_entry); + memset(idle_entry, 0, sizeof(*idle_entry)); + + idle = list_entry(idle_entry, idle_pbn_lock, entry); + vdo_initialize_pbn_lock(&idle->lock, type); + + *lock_ptr = &idle->lock; + return VDO_SUCCESS; +} + +/** + * vdo_return_pbn_lock_to_pool() - Return to the pool a lock that was borrowed + * from it. + * @pool: The pool from which the lock was borrowed. + * @lock: The last reference to the lock being returned. + * + * It must be the last live reference, as if the memory were being freed (the + * lock memory will re-initialized or zeroed). + */ +void vdo_return_pbn_lock_to_pool(struct pbn_lock_pool *pool, + struct pbn_lock *lock) +{ + idle_pbn_lock *idle; + + /* A bit expensive, but will promptly catch some use-after-free errors. 
*/ + memset(lock, 0, sizeof(*lock)); + + idle = container_of(lock, idle_pbn_lock, lock); + INIT_LIST_HEAD(&idle->entry); + list_add_tail(&idle->entry, &pool->idle_list); + + ASSERT_LOG_ONLY(pool->borrowed > 0, + "shouldn't return more than borrowed"); + pool->borrowed -= 1; +} diff --git a/vdo/pbn-lock-pool.h b/vdo/pbn-lock-pool.h new file mode 100644 index 00000000..bb2e5992 --- /dev/null +++ b/vdo/pbn-lock-pool.h @@ -0,0 +1,27 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright Red Hat + */ + +#ifndef PBN_LOCK_POOL_H +#define PBN_LOCK_POOL_H + +#include "pbn-lock.h" +#include "types.h" + +struct pbn_lock_pool; + +int __must_check +vdo_make_pbn_lock_pool(size_t capacity, struct pbn_lock_pool **pool_ptr); + +void vdo_free_pbn_lock_pool(struct pbn_lock_pool *pool); + +int __must_check +vdo_borrow_pbn_lock_from_pool(struct pbn_lock_pool *pool, + enum pbn_lock_type type, + struct pbn_lock **lock_ptr); + +void vdo_return_pbn_lock_to_pool(struct pbn_lock_pool *pool, + struct pbn_lock *lock); + +#endif /* PBN_LOCK_POOL_H */ diff --git a/vdo/pbn-lock.c b/vdo/pbn-lock.c new file mode 100644 index 00000000..36924c0c --- /dev/null +++ b/vdo/pbn-lock.c @@ -0,0 +1,174 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright Red Hat + */ + +#include "pbn-lock.h" + +#include "logger.h" +#include "permassert.h" + +#include "block-allocator.h" +#include "packed-reference-block.h" + +struct pbn_lock_implementation { + enum pbn_lock_type type; + const char *name; + const char *release_reason; +}; + +/* + * This array must have an entry for every pbn_lock_type value. + */ +static const struct pbn_lock_implementation LOCK_IMPLEMENTATIONS[] = { + [VIO_READ_LOCK] = { + .type = VIO_READ_LOCK, + .name = "read", + .release_reason = "candidate duplicate", + }, + [VIO_WRITE_LOCK] = { + .type = VIO_WRITE_LOCK, + .name = "write", + .release_reason = "newly allocated", + }, + [VIO_BLOCK_MAP_WRITE_LOCK] = { + .type = VIO_BLOCK_MAP_WRITE_LOCK, + .name = "block map write", + .release_reason = "block map write", + }, +}; + +static inline bool has_lock_type(const struct pbn_lock *lock, + enum pbn_lock_type type) +{ + return (lock->implementation == &LOCK_IMPLEMENTATIONS[type]); +} + +/** + * vdo_is_pbn_read_lock() - Check whether a pbn_lock is a read lock. + * @lock: The lock to check. + * + * Return: true if the lock is a read lock. + */ +bool vdo_is_pbn_read_lock(const struct pbn_lock *lock) +{ + return has_lock_type(lock, VIO_READ_LOCK); +} + +static inline void set_pbn_lock_type(struct pbn_lock *lock, + enum pbn_lock_type type) +{ + lock->implementation = &LOCK_IMPLEMENTATIONS[type]; +} + +/** + * vdo_initialize_pbn_lock() - Initialize a pbn_lock. + * @lock: The lock to initialize. + * @type: The type of the lock. + */ +void vdo_initialize_pbn_lock(struct pbn_lock *lock, enum pbn_lock_type type) +{ + lock->holder_count = 0; + set_pbn_lock_type(lock, type); +} + +/** + * vdo_downgrade_pbn_write_lock() - Downgrade a PBN write lock to a + * PBN read lock. + * @lock: The PBN write lock to downgrade. + * + * The lock holder count is cleared and the caller is responsible for + * setting the new count. 
+ */ +void vdo_downgrade_pbn_write_lock(struct pbn_lock *lock, bool compressed_write) +{ + ASSERT_LOG_ONLY(!vdo_is_pbn_read_lock(lock), + "PBN lock must not already have been downgraded"); + ASSERT_LOG_ONLY(!has_lock_type(lock, VIO_BLOCK_MAP_WRITE_LOCK), + "must not downgrade block map write locks"); + ASSERT_LOG_ONLY(lock->holder_count == 1, + "PBN write lock should have one holder but has %u", + lock->holder_count); + /* + * data_vio write locks are downgraded in place--the writer + * retains the hold on the lock. If this was a compressed write, the + * holder has not yet journaled its own inc ref, otherwise, it has. + */ + lock->increment_limit = (compressed_write ? + MAXIMUM_REFERENCE_COUNT : + MAXIMUM_REFERENCE_COUNT - 1); + set_pbn_lock_type(lock, VIO_READ_LOCK); +} + +/** + * vdo_claim_pbn_lock_increment() - Try to claim one of the available + * reference count increments on a read lock. + * @lock: The PBN read lock from which to claim an increment. + * + * Claims may be attempted from any thread. A claim is only valid + * until the PBN lock is released. + * + * Return: true if the claim succeeded, guaranteeing one increment can + * be made without overflowing the PBN's reference count. + */ +bool vdo_claim_pbn_lock_increment(struct pbn_lock *lock) +{ + /* + * Claim the next free reference atomically since hash locks from + * multiple hash zone threads might be concurrently deduplicating + * against a single PBN lock on compressed block. As long as hitting + * the increment limit will lead to the PBN lock being released in a + * sane time-frame, we won't overflow a 32-bit claim counter, allowing + * a simple add instead of a compare-and-swap. + */ + uint32_t claim_number = + (uint32_t) atomic_add_return(1, &lock->increments_claimed); + return (claim_number <= lock->increment_limit); +} + +/** + * vdo_assign_pbn_lock_provisional_reference() - Inform a PBN lock that it is + * responsible for a provisional + * reference. + * @lock: The PBN lock. + */ +void vdo_assign_pbn_lock_provisional_reference(struct pbn_lock *lock) +{ + ASSERT_LOG_ONLY(!lock->has_provisional_reference, + "lock does not have a provisional reference"); + lock->has_provisional_reference = true; +} + +/** + * vdo_unassign_pbn_lock_provisional_reference() - Inform a PBN lock that it + * is no longer responsible + * for a provisional + * reference. + * @lock: The PBN lock. + */ +void vdo_unassign_pbn_lock_provisional_reference(struct pbn_lock *lock) +{ + lock->has_provisional_reference = false; +} + +/** + * vdo_release_pbn_lock_provisional_reference() - If the lock is responsible + * for a provisional reference, + * release that reference. + * @lock: The lock. + * @locked_pbn: The PBN covered by the lock. + * @allocator: The block allocator from which to release the reference. + * + * This method is called when the lock is released. 
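vdo_claim_pbn_lock_increment() above leans on the observation that a plain atomic add is enough: claims past the limit simply fail, and the counter is wide enough that it cannot wrap before the lock is released. Here is a small standalone sketch of that claim scheme, using C11 atomics in place of the kernel's atomic_t; struct read_lock and claim_increment() are illustrative names only.

::

  #include <stdatomic.h>
  #include <stdbool.h>
  #include <stdint.h>
  #include <stdio.h>

  struct read_lock {
          uint8_t increment_limit;        /* references known to be free */
          atomic_uint increments_claimed; /* may run past the limit */
  };

  static bool claim_increment(struct read_lock *lock)
  {
          /* A plain add; the +1 converts the fetched old value into the new one. */
          uint32_t claim_number =
                  atomic_fetch_add(&lock->increments_claimed, 1) + 1;

          return claim_number <= lock->increment_limit;
  }

  int main(void)
  {
          struct read_lock lock = { .increment_limit = 2 };

          atomic_init(&lock.increments_claimed, 0);
          printf("%d %d %d\n", claim_increment(&lock),
                 claim_increment(&lock), claim_increment(&lock));
          /* prints "1 1 0": the third claim exceeds the limit and fails */
          return 0;
  }

The add-then-compare shape avoids a compare-and-swap loop; over-claiming is harmless because a failed claim changes nothing the lock relies on.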
+ */ +void vdo_release_pbn_lock_provisional_reference(struct pbn_lock *lock, + physical_block_number_t locked_pbn, + struct block_allocator *allocator) +{ + if (vdo_pbn_lock_has_provisional_reference(lock)) { + vdo_release_block_reference(allocator, + locked_pbn, + lock->implementation->release_reason); + vdo_unassign_pbn_lock_provisional_reference(lock); + } +} diff --git a/vdo/pbn-lock.h b/vdo/pbn-lock.h new file mode 100644 index 00000000..e9df2c9b --- /dev/null +++ b/vdo/pbn-lock.h @@ -0,0 +1,90 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright Red Hat + */ + +#ifndef PBN_LOCK_H +#define PBN_LOCK_H + +#include + +#include "kernel-types.h" +#include "types.h" + +/* + * The type of a PBN lock. + */ +enum pbn_lock_type { + VIO_READ_LOCK, + VIO_WRITE_LOCK, + VIO_BLOCK_MAP_WRITE_LOCK, +}; + +struct pbn_lock_implementation; + +/* + * A PBN lock. + */ +struct pbn_lock { + /* The implementation of the lock */ + const struct pbn_lock_implementation *implementation; + + /* The number of VIOs holding or sharing this lock */ + vio_count_t holder_count; + /* + * The number of compressed block writers holding a share of this lock + * while they are acquiring a reference to the PBN. + */ + uint8_t fragment_locks; + + /* + * Whether the locked PBN has been provisionally referenced on behalf of + * the lock holder. + */ + bool has_provisional_reference; + + /* + * For read locks, the number of references that were known to be + * available on the locked block at the time the lock was acquired. + */ + uint8_t increment_limit; + + /* + * For read locks, the number of data_vios that have tried to claim one + * of the available increments during the lifetime of the lock. Each + * claim will first increment this counter, so it can exceed the + * increment limit. + */ + atomic_t increments_claimed; +}; + +void vdo_initialize_pbn_lock(struct pbn_lock *lock, enum pbn_lock_type type); + +bool __must_check vdo_is_pbn_read_lock(const struct pbn_lock *lock); + +void +vdo_downgrade_pbn_write_lock(struct pbn_lock *lock, bool compressed_write); + +bool __must_check vdo_claim_pbn_lock_increment(struct pbn_lock *lock); + +/** + * vdo_pbn_lock_has_provisional_reference() - Check whether a PBN lock + * has a provisional reference. + * @lock: The PBN lock. + */ +static inline bool +vdo_pbn_lock_has_provisional_reference(struct pbn_lock *lock) +{ + return ((lock != NULL) && lock->has_provisional_reference); +} + +void vdo_assign_pbn_lock_provisional_reference(struct pbn_lock *lock); + +void vdo_unassign_pbn_lock_provisional_reference(struct pbn_lock *lock); + +void +vdo_release_pbn_lock_provisional_reference(struct pbn_lock *lock, + physical_block_number_t locked_pbn, + struct block_allocator *allocator); + +#endif /* PBN_LOCK_H */ diff --git a/vdo/pbnLock.c b/vdo/pbnLock.c deleted file mode 100644 index 101b06b8..00000000 --- a/vdo/pbnLock.c +++ /dev/null @@ -1,155 +0,0 @@ -/* - * Copyright Red Hat - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. 
- * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/sulfur/src/c++/vdo/base/pbnLock.c#9 $ - */ - -#include "pbnLock.h" - -#include "logger.h" -#include "permassert.h" - -#include "blockAllocator.h" -#include "packedReferenceBlock.h" - -struct pbn_lock_implementation { - enum pbn_lock_type type; - const char *name; - const char *release_reason; -}; - -/** - * This array must have an entry for every pbn_lock_type value. - **/ -static const struct pbn_lock_implementation LOCK_IMPLEMENTATIONS[] = { - [VIO_READ_LOCK] = { - .type = VIO_READ_LOCK, - .name = "read", - .release_reason = "candidate duplicate", - }, - [VIO_WRITE_LOCK] = { - .type = VIO_WRITE_LOCK, - .name = "write", - .release_reason = "newly allocated", - }, - [VIO_COMPRESSED_WRITE_LOCK] = { - .type = VIO_COMPRESSED_WRITE_LOCK, - .name = "compressed write", - .release_reason = "failed compression", - }, - [VIO_BLOCK_MAP_WRITE_LOCK] = { - .type = VIO_BLOCK_MAP_WRITE_LOCK, - .name = "block map write", - .release_reason = "block map write", - }, -}; - -/**********************************************************************/ -static inline bool has_lock_type(const struct pbn_lock *lock, - enum pbn_lock_type type) -{ - return (lock->implementation == &LOCK_IMPLEMENTATIONS[type]); -} - -/**********************************************************************/ -bool is_vdo_pbn_read_lock(const struct pbn_lock *lock) -{ - return has_lock_type(lock, VIO_READ_LOCK); -} - -/**********************************************************************/ -static inline void set_pbn_lock_type(struct pbn_lock *lock, - enum pbn_lock_type type) -{ - lock->implementation = &LOCK_IMPLEMENTATIONS[type]; -} - -/**********************************************************************/ -void initialize_vdo_pbn_lock(struct pbn_lock *lock, enum pbn_lock_type type) -{ - lock->holder_count = 0; - set_pbn_lock_type(lock, type); -} - -/**********************************************************************/ -void downgrade_vdo_pbn_write_lock(struct pbn_lock *lock) -{ - ASSERT_LOG_ONLY(!is_vdo_pbn_read_lock(lock), - "PBN lock must not already have been downgraded"); - ASSERT_LOG_ONLY(!has_lock_type(lock, VIO_BLOCK_MAP_WRITE_LOCK), - "must not downgrade block map write locks"); - ASSERT_LOG_ONLY(lock->holder_count == 1, - "PBN write lock should have one holder but has %u", - lock->holder_count); - if (has_lock_type(lock, VIO_WRITE_LOCK)) { - // data_vio write locks are downgraded in place--the writer - // retains the hold on the lock. They've already had a single - // incRef journaled. - lock->increment_limit = MAXIMUM_REFERENCE_COUNT - 1; - } else { - // Compressed block write locks are downgraded when they are - // shared with all their hash locks. The writer is releasing - // its hold on the lock. - lock->holder_count = 0; - lock->increment_limit = MAXIMUM_REFERENCE_COUNT; - } - set_pbn_lock_type(lock, VIO_READ_LOCK); -} - -/**********************************************************************/ -bool claim_vdo_pbn_lock_increment(struct pbn_lock *lock) -{ - /* - * Claim the next free reference atomically since hash locks from - * multiple hash zone threads might be concurrently deduplicating - * against a single PBN lock on compressed block. 
As long as hitting - * the increment limit will lead to the PBN lock being released in a - * sane time-frame, we won't overflow a 32-bit claim counter, allowing - * a simple add instead of a compare-and-swap. - */ - uint32_t claim_number = - (uint32_t) atomic_add_return(1, &lock->increments_claimed); - return (claim_number <= lock->increment_limit); -} - -/**********************************************************************/ -void assign_vdo_pbn_lock_provisional_reference(struct pbn_lock *lock) -{ - ASSERT_LOG_ONLY(!lock->has_provisional_reference, - "lock does not have a provisional reference"); - lock->has_provisional_reference = true; -} - -/**********************************************************************/ -void unassign_vdo_pbn_lock_provisional_reference(struct pbn_lock *lock) -{ - lock->has_provisional_reference = false; -} - -/**********************************************************************/ -void release_vdo_pbn_lock_provisional_reference(struct pbn_lock *lock, - physical_block_number_t locked_pbn, - struct block_allocator *allocator) -{ - if (vdo_pbn_lock_has_provisional_reference(lock)) { - release_vdo_block_reference(allocator, - locked_pbn, - lock->implementation->release_reason); - unassign_vdo_pbn_lock_provisional_reference(lock); - } -} diff --git a/vdo/pbnLock.h b/vdo/pbnLock.h deleted file mode 100644 index 4d91d7b4..00000000 --- a/vdo/pbnLock.h +++ /dev/null @@ -1,153 +0,0 @@ -/* - * Copyright Red Hat - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/sulfur/src/c++/vdo/base/pbnLock.h#10 $ - */ - -#ifndef PBN_LOCK_H -#define PBN_LOCK_H - -#include - -#include "types.h" - -/** - * The type of a PBN lock. - **/ -enum pbn_lock_type { - VIO_READ_LOCK = 0, - VIO_WRITE_LOCK, - VIO_COMPRESSED_WRITE_LOCK, - VIO_BLOCK_MAP_WRITE_LOCK, -}; - -struct pbn_lock_implementation; - -/** - * A PBN lock. - **/ -struct pbn_lock { - /** The implementation of the lock */ - const struct pbn_lock_implementation *implementation; - - /** The number of VIOs holding or sharing this lock */ - vio_count_t holder_count; - /** - * The number of compressed block writers holding a share of this lock - * while they are acquiring a reference to the PBN. - **/ - uint8_t fragment_locks; - - /** - * Whether the locked PBN has been provisionally referenced on behalf of - * the lock holder. - **/ - bool has_provisional_reference; - - /** - * For read locks, the number of references that were known to be - * available on the locked block at the time the lock was acquired. - **/ - uint8_t increment_limit; - - /** - * For read locks, the number of data_vios that have tried to claim one - * of the available increments during the lifetime of the lock. Each - * claim will first increment this counter, so it can exceed the - * increment limit. 
- **/ - atomic_t increments_claimed; -}; - -/** - * Initialize a pbn_lock. - * - * @param lock The lock to initialize - * @param type The type of the lock - **/ -void initialize_vdo_pbn_lock(struct pbn_lock *lock, enum pbn_lock_type type); - -/** - * Check whether a pbn_lock is a read lock. - * - * @param lock The lock to check - * - * @return true if the lock is a read lock - **/ -bool __must_check is_vdo_pbn_read_lock(const struct pbn_lock *lock); - -/** - * Downgrade a PBN write lock to a PBN read lock. The lock holder count is - * cleared and the caller is responsible for setting the new count. - * - * @param lock The PBN write lock to downgrade - **/ -void downgrade_vdo_pbn_write_lock(struct pbn_lock *lock); - -/** - * Try to claim one of the available reference count increments on a read - * lock. Claims may be attempted from any thread. A claim is only valid until - * the PBN lock is released. - * - * @param lock The PBN read lock from which to claim an increment - * - * @return true if the claim succeeded, guaranteeing one - * increment can be made without overflowing the PBN's reference count - **/ -bool __must_check claim_vdo_pbn_lock_increment(struct pbn_lock *lock); - -/** - * Check whether a PBN lock has a provisional reference. - * - * @param lock The PBN lock - **/ -static inline bool -vdo_pbn_lock_has_provisional_reference(struct pbn_lock *lock) -{ - return ((lock != NULL) && lock->has_provisional_reference); -} - -/** - * Inform a PBN lock that it is responsible for a provisional reference. - * - * @param lock The PBN lock - **/ -void assign_vdo_pbn_lock_provisional_reference(struct pbn_lock *lock); - -/** - * Inform a PBN lock that it is no longer responsible for a provisional - * reference. - * - * @param lock The PBN lock - **/ -void unassign_vdo_pbn_lock_provisional_reference(struct pbn_lock *lock); - -/** - * If the lock is responsible for a provisional reference, release that - * reference. This method is called when the lock is released. - * - * @param lock The lock - * @param locked_pbn The PBN covered by the lock - * @param allocator The block allocator from which to release the reference - **/ -void -release_vdo_pbn_lock_provisional_reference(struct pbn_lock *lock, - physical_block_number_t locked_pbn, - struct block_allocator *allocator); - -#endif /* PBN_LOCK_H */ diff --git a/vdo/pbnLockPool.c b/vdo/pbnLockPool.c deleted file mode 100644 index 44ca764d..00000000 --- a/vdo/pbnLockPool.c +++ /dev/null @@ -1,144 +0,0 @@ -/* - * Copyright Red Hat - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/sulfur/src/c++/vdo/base/pbnLockPool.c#13 $ - */ - -#include "pbnLockPool.h" - -#include - -#include "logger.h" -#include "memoryAlloc.h" -#include "permassert.h" - -#include "pbnLock.h" -#include "statusCodes.h" - -/** - * Unused (idle) PBN locks are kept in a list. 
Just like in a malloc - * implementation, the lock structure is unused memory, so we can save a bit - * of space (and not pollute the lock structure proper) by using a union to - * overlay the lock structure with the free list. - **/ -typedef union { - /** Only used while locks are in the pool */ - struct list_head entry; - /** Only used while locks are not in the pool */ - struct pbn_lock lock; -} idle_pbn_lock; - -/** - * The lock pool is little more than the memory allocated for the locks. - **/ -struct pbn_lock_pool { - /** The number of locks allocated for the pool */ - size_t capacity; - /** The number of locks currently borrowed from the pool */ - size_t borrowed; - /** A list containing all idle PBN lock instances */ - struct list_head idle_list; - /** The memory for all the locks allocated by this pool */ - idle_pbn_lock locks[]; -}; - -/**********************************************************************/ -int make_vdo_pbn_lock_pool(size_t capacity, struct pbn_lock_pool **pool_ptr) -{ - size_t i; - struct pbn_lock_pool *pool; - int result = UDS_ALLOCATE_EXTENDED(struct pbn_lock_pool, capacity, - idle_pbn_lock, __func__, &pool); - if (result != VDO_SUCCESS) { - return result; - } - - pool->capacity = capacity; - pool->borrowed = capacity; - INIT_LIST_HEAD(&pool->idle_list); - - for (i = 0; i < capacity; i++) { - return_vdo_pbn_lock_to_pool(pool, &pool->locks[i].lock); - } - - *pool_ptr = pool; - return VDO_SUCCESS; -} - -/**********************************************************************/ -void free_vdo_pbn_lock_pool(struct pbn_lock_pool *pool) -{ - if (pool == NULL) { - return; - } - - ASSERT_LOG_ONLY(pool->borrowed == 0, - "All PBN locks must be returned to the pool before it is freed, but %zu locks are still on loan", - pool->borrowed); - UDS_FREE(pool); -} - -/**********************************************************************/ -int borrow_vdo_pbn_lock_from_pool(struct pbn_lock_pool *pool, - enum pbn_lock_type type, - struct pbn_lock **lock_ptr) -{ - int result; - struct list_head *idle_entry; - idle_pbn_lock *idle; - - if (pool->borrowed >= pool->capacity) { - return uds_log_error_strerror(VDO_LOCK_ERROR, - "no free PBN locks left to borrow"); - } - pool->borrowed += 1; - - result = ASSERT(!list_empty(&pool->idle_list), - "idle list should not be empty if pool not at capacity"); - if (result != VDO_SUCCESS) { - return result; - } - - idle_entry = pool->idle_list.prev; - list_del(idle_entry); - memset(idle_entry, 0, sizeof(*idle_entry)); - - idle = list_entry(idle_entry, idle_pbn_lock, entry); - initialize_vdo_pbn_lock(&idle->lock, type); - - *lock_ptr = &idle->lock; - return VDO_SUCCESS; -} - -/**********************************************************************/ -void return_vdo_pbn_lock_to_pool(struct pbn_lock_pool *pool, - struct pbn_lock *lock) -{ - idle_pbn_lock *idle; - - // A bit expensive, but will promptly catch some use-after-free errors. 
- memset(lock, 0, sizeof(*lock)); - - idle = container_of(lock, idle_pbn_lock, lock); - INIT_LIST_HEAD(&idle->entry); - list_add_tail(&idle->entry, &pool->idle_list); - - ASSERT_LOG_ONLY(pool->borrowed > 0, - "shouldn't return more than borrowed"); - pool->borrowed -= 1; -} diff --git a/vdo/pbnLockPool.h b/vdo/pbnLockPool.h deleted file mode 100644 index fef9b5fd..00000000 --- a/vdo/pbnLockPool.h +++ /dev/null @@ -1,77 +0,0 @@ -/* - * Copyright Red Hat - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/sulfur/src/c++/vdo/base/pbnLockPool.h#9 $ - */ - -#ifndef PBN_LOCK_POOL_H -#define PBN_LOCK_POOL_H - -#include "pbnLock.h" -#include "types.h" - -struct pbn_lock_pool; - -/** - * Create a new PBN lock pool and all the lock instances it can loan out. - * - * @param [in] capacity The number of PBN locks to allocate for the pool - * @param [out] pool_ptr A pointer to receive the new pool - * - * @return a VDO_SUCCESS or an error code - **/ -int __must_check -make_vdo_pbn_lock_pool(size_t capacity, struct pbn_lock_pool **pool_ptr); - -/** - * Free a PBN lock pool. This also frees all all the PBN locks it allocated, - * so the caller must ensure that all locks have been returned to the pool. - * - * @param pool The lock pool to free - **/ -void free_vdo_pbn_lock_pool(struct pbn_lock_pool *pool); - -/** - * Borrow a PBN lock from the pool and initialize it with the provided type. - * Pools do not grow on demand or allocate memory, so this will fail if the - * pool is empty. Borrowed locks are still associated with this pool and must - * be returned to only this pool. - * - * @param [in] pool The pool from which to borrow - * @param [in] type The type with which to initialize the lock - * @param [out] lock_ptr A pointer to receive the borrowed lock - * - * @return VDO_SUCCESS, or VDO_LOCK_ERROR if the pool is empty - **/ -int __must_check -borrow_vdo_pbn_lock_from_pool(struct pbn_lock_pool *pool, - enum pbn_lock_type type, - struct pbn_lock **lock_ptr); - -/** - * Return to the pool a lock that was borrowed from it. It must be the last - * live reference, as if the memory were being freed (the lock memory will - * re-initialized or zeroed). 
- * - * @param pool The pool from which the lock was borrowed - * @param lock The last reference to the lock being returned - **/ -void return_vdo_pbn_lock_to_pool(struct pbn_lock_pool *pool, - struct pbn_lock *lock); - -#endif // PBN_LOCK_POOL_H diff --git a/vdo/permassert.c b/vdo/permassert.c new file mode 100644 index 00000000..36bbc03a --- /dev/null +++ b/vdo/permassert.c @@ -0,0 +1,39 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright Red Hat + */ + +#include "permassert.h" + +#include "errors.h" +#include "logger.h" + + +int uds_assertion_failed(const char *expression_string, + int code, + const char *module_name, + const char *file_name, + int line_number, + const char *format, + ...) +{ + va_list args; + + va_start(args, format); + + uds_log_embedded_message(UDS_LOG_ERR, + module_name, + "assertion \"", + format, + args, + "\" (%s) failed at %s:%d", + expression_string, + file_name, + line_number); + uds_log_backtrace(UDS_LOG_ERR); + + + va_end(args); + + return code; +} diff --git a/uds/permassert.h b/vdo/permassert.h similarity index 82% rename from uds/permassert.h rename to vdo/permassert.h index cf8faa0b..06890ad1 100644 --- a/uds/permassert.h +++ b/vdo/permassert.h @@ -1,22 +1,6 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ /* * Copyright Red Hat - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/uds-releases/krusty/src/uds/permassert.h#10 $ */ #ifndef PERMASSERT_H @@ -113,13 +97,15 @@ static INLINE int __must_check uds_must_use(int value) */ #define ASSERT_FALSE(...) ASSERT(false, __VA_ARGS__) -#define STATIC_ASSERT(expr) \ - do { \ - switch (0) { \ - case 0: \ - case expr:; \ - default:; \ - } \ +#define STATIC_ASSERT(expr) \ + do { \ + switch (0) { \ + case 0:; \ + fallthrough;\ + case expr:; \ + fallthrough;\ + default:; \ + } \ } while (0) #define STATIC_ASSERT_SIZEOF(type, expected_size) \ diff --git a/vdo/physical-zone.c b/vdo/physical-zone.c new file mode 100644 index 00000000..b130be50 --- /dev/null +++ b/vdo/physical-zone.c @@ -0,0 +1,398 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright Red Hat + */ + +#include "physical-zone.h" + +#include "memory-alloc.h" +#include "permassert.h" + +#include "block-allocator.h" +#include "block-map.h" +#include "completion.h" +#include "constants.h" +#include "data-vio.h" +#include "flush.h" +#include "hash-lock.h" +#include "int-map.h" +#include "pbn-lock.h" +#include "pbn-lock-pool.h" +#include "slab-depot.h" +#include "slab-scrubber.h" +#include "vdo.h" + +enum { + /* Each user data_vio needs a PBN read lock and write lock. */ + LOCK_POOL_CAPACITY = 2 * MAXIMUM_VDO_USER_VIOS, +}; + +/** + * initialize_zone() - Initialize a physical zone. + * @vdo: The vdo to which the zone will belong. + * @zones: The physical_zones to which the zone being initialized belongs + * + * Return: VDO_SUCCESS or an error code. 
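One detail worth calling out in the permassert.h hunk above: STATIC_ASSERT() is the classic switch-based compile-time check, in which a false expression collapses to a duplicate 'case 0' label and the build fails. The stripped-down userspace sketch below shows the idea; it omits the fallthrough annotations and extra empty statements the patch adds to keep kernel warnings quiet.

::

  #include <stdio.h>

  /* A false (zero) expression duplicates the 'case 0' label: compile error. */
  #define STATIC_ASSERT(expr)     \
          do {                    \
                  switch (0) {    \
                  case 0:         \
                  case expr:;     \
                  default:;       \
                  }               \
          } while (0)

  int main(void)
  {
          STATIC_ASSERT(sizeof(int) >= 2);       /* builds: expr is nonzero */
          /* STATIC_ASSERT(sizeof(int) == 1); */ /* would fail to compile */
          printf("static assertions passed\n");
          return 0;
  }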
+ */ +static int initialize_zone(struct vdo *vdo, + struct physical_zones *zones) +{ + int result; + zone_count_t zone_number = zones->zone_count; + struct physical_zone *zone = &zones->zones[zone_number]; + + result = make_int_map(VDO_LOCK_MAP_CAPACITY, 0, &zone->pbn_operations); + if (result != VDO_SUCCESS) { + return result; + } + + result = vdo_make_pbn_lock_pool(LOCK_POOL_CAPACITY, &zone->lock_pool); + if (result != VDO_SUCCESS) { + free_int_map(zone->pbn_operations); + return result; + } + + zone->zone_number = zone_number; + zone->thread_id = vdo->thread_config->physical_threads[zone_number]; + zone->allocator = vdo->depot->allocators[zone_number]; + zone->next = &zones->zones[(zone_number + 1) % + vdo->thread_config->physical_zone_count]; + result = vdo_make_default_thread(vdo, zone->thread_id); + if (result != VDO_SUCCESS) { + vdo_free_pbn_lock_pool(UDS_FORGET(zone->lock_pool)); + free_int_map(zone->pbn_operations); + return result; + } + return result; +} + +/** + * vdo_make_physical_zones() - Make the physical zones for a vdo. + * @vdo: The vdo being constructed + * @zones_ptr: A pointer to hold the zones + * + * Return: VDO_SUCCESS or an error code. + **/ +int vdo_make_physical_zones(struct vdo *vdo, struct physical_zones **zones_ptr) +{ + struct physical_zones *zones; + int result; + zone_count_t zone_count = vdo->thread_config->physical_zone_count; + + if (zone_count == 0) { + return VDO_SUCCESS; + } + + result = UDS_ALLOCATE_EXTENDED(struct physical_zones, + zone_count, + struct physical_zone, + __func__, + &zones); + if (result != VDO_SUCCESS) { + return result; + } + + for (zones->zone_count = 0; + zones->zone_count < zone_count; + zones->zone_count++) { + result = initialize_zone(vdo, zones); + if (result != VDO_SUCCESS) { + vdo_free_physical_zones(zones); + return result; + } + } + + *zones_ptr = zones; + return VDO_SUCCESS; +} + +/** + * vdo_free_physical_zones() - Destroy the physical zones. + * @zones: The zones to free. + */ +void vdo_free_physical_zones(struct physical_zones *zones) +{ + zone_count_t index; + + if (zones == NULL) { + return; + } + + for (index = 0; index < zones->zone_count; index++) { + struct physical_zone *zone = &zones->zones[index]; + + vdo_free_pbn_lock_pool(UDS_FORGET(zone->lock_pool)); + free_int_map(UDS_FORGET(zone->pbn_operations)); + } + + UDS_FREE(zones); +} + +/** + * vdo_get_physical_zone_pbn_lock() - Get the lock on a PBN if one exists. + * @zone: The physical zone responsible for the PBN. + * @pbn: The physical block number whose lock is desired. + * + * Return: The lock or NULL if the PBN is not locked. + */ +struct pbn_lock *vdo_get_physical_zone_pbn_lock(struct physical_zone *zone, + physical_block_number_t pbn) +{ + return ((zone == NULL) ? NULL : int_map_get(zone->pbn_operations, pbn)); +} + +/** + * vdo_attempt_physical_zone_pbn_lock() - Attempt to lock a physical block in + * the zone responsible for it. + * @zone: The physical zone responsible for the PBN. + * @pbn: The physical block number to lock. + * @type: The type with which to initialize a new lock. + * @lock_ptr: A pointer to receive the lock, existing or new. + * + * If the PBN is already locked, the existing lock will be returned. + * Otherwise, a new lock instance will be borrowed from the pool, initialized, + * and returned. The lock owner will be NULL for a new lock acquired by the + * caller, who is responsible for setting that field promptly. The lock owner + * will be non-NULL when there is already an existing lock on the PBN. 
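The attempt-lock path described above is worth spelling out: the caller borrows a lock from the pool before consulting the int_map, registers it only if no lock already exists for the PBN, and hands the borrowed lock straight back when it loses that race. The following minimal userspace sketch shows the pattern; the toy linear map stands in for int_map, and attempt_pbn_lock() is an illustrative stand-in for vdo_attempt_physical_zone_pbn_lock(), not the patch's code.

::

  #include <stdio.h>
  #include <stdlib.h>

  struct lock { unsigned int holder_count; };

  /* A toy put-if-absent map; the patch uses int_map for this. */
  #define MAP_SIZE 16
  static struct { unsigned long long pbn; struct lock *lock; } map[MAP_SIZE];
  static int map_entries;

  static struct lock *map_put_if_absent(unsigned long long pbn,
                                        struct lock *new_lock)
  {
          int i;

          for (i = 0; i < map_entries; i++)
                  if (map[i].pbn == pbn)
                          return map[i].lock;
          map[map_entries].pbn = pbn;
          map[map_entries].lock = new_lock;
          map_entries++;
          return NULL;
  }

  static struct lock *attempt_pbn_lock(unsigned long long pbn)
  {
          struct lock *new_lock = calloc(1, sizeof(*new_lock)); /* "borrow" */
          struct lock *existing = map_put_if_absent(pbn, new_lock);

          if (existing != NULL) {
                  free(new_lock);  /* lost the race: hand the spare back */
                  return existing; /* already held; holder_count is nonzero */
          }
          return new_lock; /* new lock: the caller sets holder_count promptly */
  }

  int main(void)
  {
          struct lock *first = attempt_pbn_lock(1234);

          first->holder_count = 1;
          printf("second attempt returns the same lock: %s\n",
                 attempt_pbn_lock(1234) == first ? "yes" : "no");
          return 0;
  }

Borrowing speculatively keeps the common, uncontended case down to a single map operation, which is the rationale the patch itself gives for this ordering.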
+ * + * Return: VDO_SUCCESS or an error. + */ +int vdo_attempt_physical_zone_pbn_lock(struct physical_zone *zone, + physical_block_number_t pbn, + enum pbn_lock_type type, + struct pbn_lock **lock_ptr) +{ + /* + * Borrow and prepare a lock from the pool so we don't have to do two + * int_map accesses in the common case of no lock contention. + */ + struct pbn_lock *lock, *new_lock; + int result = vdo_borrow_pbn_lock_from_pool(zone->lock_pool, type, + &new_lock); + if (result != VDO_SUCCESS) { + ASSERT_LOG_ONLY(false, + "must always be able to borrow a PBN lock"); + return result; + } + + result = int_map_put(zone->pbn_operations, pbn, new_lock, false, + (void **) &lock); + if (result != VDO_SUCCESS) { + vdo_return_pbn_lock_to_pool(zone->lock_pool, new_lock); + return result; + } + + if (lock != NULL) { + /* The lock is already held, so we don't need the borrowed one. */ + vdo_return_pbn_lock_to_pool(zone->lock_pool, + UDS_FORGET(new_lock)); + result = ASSERT(lock->holder_count > 0, + "physical block %llu lock held", + (unsigned long long) pbn); + if (result != VDO_SUCCESS) { + return result; + } + *lock_ptr = lock; + } else { + *lock_ptr = new_lock; + } + return VDO_SUCCESS; +} + +/** + * allocate_and_lock_block() - Attempt to allocate a block from this zone. + * @allocation: The struct allocation of the data_vio attempting to allocate. + * + * If a block is allocated, the recipient will also hold a lock on it. + * + * Return: VDO_SUCESSS if a block was allocated, or an error code. + */ +static int allocate_and_lock_block(struct allocation *allocation) +{ + int result; + struct pbn_lock *lock; + + ASSERT_LOG_ONLY(allocation->lock == NULL, + "must not allocate a block while already holding a lock on one"); + + result = vdo_allocate_block(allocation->zone->allocator, + &allocation->pbn); + if (result != VDO_SUCCESS) { + return result; + } + + result = vdo_attempt_physical_zone_pbn_lock(allocation->zone, + allocation->pbn, + allocation->write_lock_type, + &lock); + if (result != VDO_SUCCESS) { + return result; + } + + if (lock->holder_count > 0) { + /* This block is already locked, which should be impossible. */ + return uds_log_error_strerror(VDO_LOCK_ERROR, + "Newly allocated block %llu was spuriously locked (holder_count=%u)", + (unsigned long long) allocation->pbn, + lock->holder_count); + } + + /* We've successfully acquired a new lock, so mark it as ours. */ + lock->holder_count += 1; + allocation->lock = lock; + vdo_assign_pbn_lock_provisional_reference(lock); + return VDO_SUCCESS; +} + +/** + * retry_allocation() - Retry allocating a block now that we're done waiting + * for scrubbing. + * @waiter: The allocating_vio that was waiting to allocate. + * @context: The context (unused). + */ +static void retry_allocation(struct waiter *waiter, + void *context __always_unused) +{ + struct data_vio *data_vio = waiter_as_data_vio(waiter); + + /* Now that some slab has scrubbed, restart the allocation process. */ + data_vio->allocation.wait_for_clean_slab = false; + data_vio->allocation.first_allocation_zone = + data_vio->allocation.zone->zone_number; + continue_data_vio(data_vio, VDO_SUCCESS); +} + +/** + * continue_allocating() - Continue searching for an allocation by enqueuing + * to wait for scrubbing or switching to the next + * zone. + * @data_vio: The data_vio attempting to get an allocation. + * + * This method should only be called from the error handler set in + * data_vio_allocate_data_block. + * + * Return: true if the allocation process has continued in another zone. 
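+ *
+ * Condensed, this amounts to two passes over the ring of zones (a rough
+ * sketch of the logic below, not additional behavior):
+ *
+ *	pass 1: on VDO_NO_SPACE, hop to zone->next and retry, until the
+ *		starting zone comes around again;
+ *	pass 2: set wait_for_clean_slab and, in each zone in turn, wait on
+ *		the slab scrubber before retrying; the allocation only fails
+ *		once every zone has been waited on and still has no space.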
+ */ +static bool continue_allocating(struct data_vio *data_vio) +{ + struct allocation *allocation = &data_vio->allocation; + struct physical_zone *zone = allocation->zone; + struct vdo_completion *completion = data_vio_as_completion(data_vio); + int result = VDO_SUCCESS; + bool was_waiting = allocation->wait_for_clean_slab; + bool tried_all = + (allocation->first_allocation_zone == zone->next->zone_number); + + vdo_reset_completion(completion); + + if (tried_all && !was_waiting) { + /* + * We've already looked in all the zones, and found nothing. + * So go through the zones again, and wait for each to scrub + * before trying to allocate. + */ + allocation->wait_for_clean_slab = true; + allocation->first_allocation_zone = zone->zone_number; + } + + if (allocation->wait_for_clean_slab) { + struct waiter *waiter = data_vio_as_waiter(data_vio); + struct slab_scrubber *scrubber + = zone->allocator->slab_scrubber; + + waiter->callback = retry_allocation; + result = vdo_enqueue_clean_slab_waiter(scrubber, waiter); + + if (result == VDO_SUCCESS) { + /* We've enqueued to wait for a slab to be scrubbed. */ + return true; + } + + if ((result != VDO_NO_SPACE) || (was_waiting && tried_all)) { + vdo_set_completion_result(completion, result); + return false; + } + } + + allocation->zone = zone->next; + completion->callback_thread_id = allocation->zone->thread_id; + vdo_continue_completion(completion, VDO_SUCCESS); + return true; +} + +/** + * vdo_allocate_block_in_zone() - Attempt to allocate a block in the current + * physical zone, and if that fails try the + * next if possible. + * @data_vio: The data_vio needing an allocation. + * + * Return: true if a block was allocated, if not the data_vio will have been + * dispatched so the caller must not touch it. + */ +bool vdo_allocate_block_in_zone(struct data_vio *data_vio) +{ + int result = allocate_and_lock_block(&data_vio->allocation); + + if (result == VDO_SUCCESS) { + return true; + } + + if ((result != VDO_NO_SPACE) || !continue_allocating(data_vio)) { + continue_data_vio(data_vio, result); + } + + return false; +} + +/** + * vdo_release_physical_zone_pbn_lock() - Release a physical block lock if it + * is held and return it to the lock + * pool. + * @zone: The physical zone in which the lock was obtained. + * @locked_pbn: The physical block number to unlock. + * @lock: The lock being released. + * + * It must be the last live reference, as if the memory were being freed (the + * lock memory will re-initialized or zeroed). + */ +void vdo_release_physical_zone_pbn_lock(struct physical_zone *zone, + physical_block_number_t locked_pbn, + struct pbn_lock *lock) +{ + struct pbn_lock *holder; + + if (lock == NULL) { + return; + } + + ASSERT_LOG_ONLY(lock->holder_count > 0, + "should not be releasing a lock that is not held"); + + lock->holder_count -= 1; + if (lock->holder_count > 0) { + /* + * The lock was shared and is still referenced, so don't + * release it yet. + */ + return; + } + + holder = int_map_remove(zone->pbn_operations, locked_pbn); + ASSERT_LOG_ONLY((lock == holder), + "physical block lock mismatch for block %llu", + (unsigned long long) locked_pbn); + + vdo_release_pbn_lock_provisional_reference(lock, locked_pbn, + zone->allocator); + vdo_return_pbn_lock_to_pool(zone->lock_pool, lock); +} + +/** + * vdo_dump_physical_zone() - Dump information about a physical zone to the + * log for debugging. + * @zone: The zone to dump. 
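+ *
+ * As a usage note for vdo_allocate_block_in_zone() above (an illustrative
+ * sketch, not additional API):
+ *
+ *	if (!vdo_allocate_block_in_zone(data_vio))
+ *		return;
+ *
+ * On a true return the caller may use data_vio->allocation.pbn, which is
+ * protected by data_vio->allocation.lock; on a false return the data_vio
+ * has been re-dispatched or errored out and must not be touched again.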
+ */ +void vdo_dump_physical_zone(const struct physical_zone *zone) +{ + vdo_dump_block_allocator(zone->allocator); +} diff --git a/vdo/physical-zone.h b/vdo/physical-zone.h new file mode 100644 index 00000000..35b4d6ac --- /dev/null +++ b/vdo/physical-zone.h @@ -0,0 +1,59 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright Red Hat + */ + +#ifndef PHYSICAL_ZONE_H +#define PHYSICAL_ZONE_H + +#include "kernel-types.h" +#include "pbn-lock.h" +#include "types.h" + +struct physical_zone { + /* Which physical zone this is */ + zone_count_t zone_number; + /* The thread ID for this zone */ + thread_id_t thread_id; + /* In progress operations keyed by PBN */ + struct int_map *pbn_operations; + /* Pool of unused pbn_lock instances */ + struct pbn_lock_pool *lock_pool; + /* The block allocator for this zone */ + struct block_allocator *allocator; + /* The next zone from which to attempt an allocation */ + struct physical_zone *next; +}; + +struct physical_zones { + /* The number of zones */ + zone_count_t zone_count; + /* The physical zones themselves */ + struct physical_zone zones[]; +}; + +int __must_check +vdo_make_physical_zones(struct vdo *vdo, struct physical_zones **zones_ptr); + +void vdo_free_physical_zones(struct physical_zones *zones); + +struct pbn_lock * __must_check +vdo_get_physical_zone_pbn_lock(struct physical_zone *zone, + physical_block_number_t pbn); + +int __must_check +vdo_attempt_physical_zone_pbn_lock(struct physical_zone *zone, + physical_block_number_t pbn, + enum pbn_lock_type type, + struct pbn_lock **lock_ptr); + +bool __must_check vdo_allocate_block_in_zone(struct data_vio *data_vio); + +void vdo_release_physical_zone_pbn_lock(struct physical_zone *zone, + physical_block_number_t locked_pbn, + struct pbn_lock *lock); + +void vdo_dump_physical_zone(const struct physical_zone *zone); + + +#endif /* PHYSICAL_ZONE_H */ diff --git a/vdo/physicalZone.c b/vdo/physicalZone.c deleted file mode 100644 index 40b6ed24..00000000 --- a/vdo/physicalZone.c +++ /dev/null @@ -1,207 +0,0 @@ -/* - * Copyright Red Hat - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/sulfur/src/c++/vdo/base/physicalZone.c#28 $ - */ - -#include "physicalZone.h" - -#include "memoryAlloc.h" -#include "permassert.h" - -#include "blockAllocator.h" -#include "blockMap.h" -#include "completion.h" -#include "constants.h" -#include "dataVIO.h" -#include "flush.h" -#include "hashLock.h" -#include "intMap.h" -#include "pbnLock.h" -#include "pbnLockPool.h" -#include "slabDepot.h" -#include "vdoInternal.h" - -enum { - // Each user data_vio needs a PBN read lock and write lock, and each - // packer output bin has an allocating_vio that needs a PBN write lock. 
- LOCK_POOL_CAPACITY = 2 * MAXIMUM_VDO_USER_VIOS + DEFAULT_PACKER_OUTPUT_BINS, -}; - -struct physical_zone { - /** Which physical zone this is */ - zone_count_t zone_number; - /** The thread ID for this zone */ - thread_id_t thread_id; - /** In progress operations keyed by PBN */ - struct int_map *pbn_operations; - /** Pool of unused pbn_lock instances */ - struct pbn_lock_pool *lock_pool; - /** The block allocator for this zone */ - struct block_allocator *allocator; -}; - -/**********************************************************************/ -int make_vdo_physical_zone(struct vdo *vdo, - zone_count_t zone_number, - struct physical_zone **zone_ptr) -{ - struct physical_zone *zone; - int result = UDS_ALLOCATE(1, struct physical_zone, __func__, &zone); - if (result != VDO_SUCCESS) { - return result; - } - - result = make_int_map(VDO_LOCK_MAP_CAPACITY, 0, &zone->pbn_operations); - if (result != VDO_SUCCESS) { - free_vdo_physical_zone(zone); - return result; - } - - result = make_vdo_pbn_lock_pool(LOCK_POOL_CAPACITY, &zone->lock_pool); - if (result != VDO_SUCCESS) { - free_vdo_physical_zone(zone); - return result; - } - - zone->zone_number = zone_number; - zone->thread_id = - vdo_get_physical_zone_thread(get_vdo_thread_config(vdo), - zone_number); - zone->allocator = - vdo_get_block_allocator_for_zone(vdo->depot, zone_number); - - *zone_ptr = zone; - return VDO_SUCCESS; -} - -/**********************************************************************/ -void free_vdo_physical_zone(struct physical_zone *zone) -{ - if (zone == NULL) { - return; - } - - free_vdo_pbn_lock_pool(UDS_FORGET(zone->lock_pool)); - free_int_map(UDS_FORGET(zone->pbn_operations)); - UDS_FREE(zone); -} - -/**********************************************************************/ -zone_count_t get_vdo_physical_zone_number(const struct physical_zone *zone) -{ - return zone->zone_number; -} - -/**********************************************************************/ -thread_id_t get_vdo_physical_zone_thread_id(const struct physical_zone *zone) -{ - return zone->thread_id; -} - -/**********************************************************************/ -struct block_allocator * -get_vdo_physical_zone_block_allocator(const struct physical_zone *zone) -{ - return zone->allocator; -} - -/**********************************************************************/ -struct pbn_lock *get_vdo_physical_zone_pbn_lock(struct physical_zone *zone, - physical_block_number_t pbn) -{ - return ((zone == NULL) ? NULL : int_map_get(zone->pbn_operations, pbn)); -} - -/**********************************************************************/ -int attempt_vdo_physical_zone_pbn_lock(struct physical_zone *zone, - physical_block_number_t pbn, - enum pbn_lock_type type, - struct pbn_lock **lock_ptr) -{ - // Borrow and prepare a lock from the pool so we don't have to do two - // int_map accesses in the common case of no lock contention. - struct pbn_lock *lock, *new_lock; - int result = borrow_vdo_pbn_lock_from_pool(zone->lock_pool, type, - &new_lock); - if (result != VDO_SUCCESS) { - ASSERT_LOG_ONLY(false, - "must always be able to borrow a PBN lock"); - return result; - } - - result = int_map_put(zone->pbn_operations, pbn, new_lock, false, - (void **) &lock); - if (result != VDO_SUCCESS) { - return_vdo_pbn_lock_to_pool(zone->lock_pool, new_lock); - return result; - } - - if (lock != NULL) { - // The lock is already held, so we don't need the borrowed one. 
- return_vdo_pbn_lock_to_pool(zone->lock_pool, - UDS_FORGET(new_lock)); - result = ASSERT(lock->holder_count > 0, - "physical block %llu lock held", - (unsigned long long) pbn); - if (result != VDO_SUCCESS) { - return result; - } - *lock_ptr = lock; - } else { - *lock_ptr = new_lock; - } - return VDO_SUCCESS; -} - -/**********************************************************************/ -void release_vdo_physical_zone_pbn_lock(struct physical_zone *zone, - physical_block_number_t locked_pbn, - struct pbn_lock *lock) -{ - struct pbn_lock *holder; - - if (lock == NULL) { - return; - } - - ASSERT_LOG_ONLY(lock->holder_count > 0, - "should not be releasing a lock that is not held"); - - lock->holder_count -= 1; - if (lock->holder_count > 0) { - // The lock was shared and is still referenced, so don't - // release it yet. - return; - } - - holder = int_map_remove(zone->pbn_operations, locked_pbn); - ASSERT_LOG_ONLY((lock == holder), - "physical block lock mismatch for block %llu", - (unsigned long long) locked_pbn); - - release_vdo_pbn_lock_provisional_reference(lock, locked_pbn, - zone->allocator); - return_vdo_pbn_lock_to_pool(zone->lock_pool, lock); -} - -/**********************************************************************/ -void dump_vdo_physical_zone(const struct physical_zone *zone) -{ - dump_vdo_block_allocator(zone->allocator); -} diff --git a/vdo/physicalZone.h b/vdo/physicalZone.h deleted file mode 100644 index a7dca5af..00000000 --- a/vdo/physicalZone.h +++ /dev/null @@ -1,130 +0,0 @@ -/* - * Copyright Red Hat - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/sulfur/src/c++/vdo/base/physicalZone.h#14 $ - */ - -#ifndef PHYSICAL_ZONE_H -#define PHYSICAL_ZONE_H - -#include "pbnLock.h" -#include "types.h" - -/** - * Create a physical zone. - * - * @param [in] vdo The vdo to which the zone will belong - * @param [in] zone_number The number of the zone to create - * @param [out] zone_ptr A pointer to hold the new physical_zone - * - * @return VDO_SUCCESS or an error code - **/ -int __must_check make_vdo_physical_zone(struct vdo *vdo, - zone_count_t zone_number, - struct physical_zone **zone_ptr); - -/** - * Free a physical zone. - * - * @param zone The zone to free - **/ -void free_vdo_physical_zone(struct physical_zone *zone); - -/** - * Get the zone number of a physical zone. - * - * @param zone The zone - * - * @return The number of the zone - **/ -zone_count_t __must_check -get_vdo_physical_zone_number(const struct physical_zone *zone); - -/** - * Get the ID of a physical zone's thread. - * - * @param zone The zone - * - * @return The zone's thread ID - **/ -thread_id_t __must_check -get_vdo_physical_zone_thread_id(const struct physical_zone *zone); - -/** - * Get the block allocator from a physical zone. 
- * - * @param zone The zone - * - * @return The zone's allocator - **/ -struct block_allocator * __must_check -get_vdo_physical_zone_block_allocator(const struct physical_zone *zone); - -/** - * Get the lock on a PBN if one exists. - * - * @param zone The physical zone responsible for the PBN - * @param pbn The physical block number whose lock is desired - * - * @return The lock or NULL if the PBN is not locked - **/ -struct pbn_lock * __must_check -get_vdo_physical_zone_pbn_lock(struct physical_zone *zone, - physical_block_number_t pbn); - -/** - * Attempt to lock a physical block in the zone responsible for it. If the PBN - * is already locked, the existing lock will be returned. Otherwise, a new - * lock instance will be borrowed from the pool, initialized, and returned. - * The lock owner will be NULL for a new lock acquired by the caller, who is - * responsible for setting that field promptly. The lock owner will be - * non-NULL when there is already an existing lock on the PBN. - * - * @param [in] zone The physical zone responsible for the PBN - * @param [in] pbn The physical block number to lock - * @param [in] type The type with which to initialize a new lock - * @param [out] lock_ptr A pointer to receive the lock, existing or new - * - * @return VDO_SUCCESS or an error - **/ -int __must_check attempt_vdo_physical_zone_pbn_lock(struct physical_zone *zone, - physical_block_number_t pbn, - enum pbn_lock_type type, - struct pbn_lock **lock_ptr); - -/** - * Release a physical block lock if it is held and return it to the lock pool. - * It must be the last live reference, as if the memory were being freed (the - * lock memory will re-initialized or zeroed). - * - * @param zone The physical zone in which the lock was obtained - * @param locked_pbn The physical block number to unlock - * @param lock The lock being released - **/ -void release_vdo_physical_zone_pbn_lock(struct physical_zone *zone, - physical_block_number_t locked_pbn, - struct pbn_lock *lock); - -/** - * Dump information about a physical zone to the log for debugging. - * - * @param zone The zone to dump - **/ -void dump_vdo_physical_zone(const struct physical_zone *zone); - -#endif // PHYSICAL_ZONE_H diff --git a/vdo/pointer-map.c b/vdo/pointer-map.c new file mode 100644 index 00000000..5c9d8533 --- /dev/null +++ b/vdo/pointer-map.c @@ -0,0 +1,783 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright Red Hat + */ + +/** + * DOC: + * + * Hash table implementation of a map from integers to pointers, implemented + * using the Hopscotch Hashing algorithm by Herlihy, Shavit, and Tzafrir (see + * http://en.wikipedia.org/wiki/Hopscotch_hashing). This implementation does + * not contain any of the locking/concurrency features of the algorithm, just + * the collision resolution scheme. + * + * Hopscotch Hashing is based on hashing with open addressing and linear + * probing. All the entries are stored in a fixed array of buckets, with no + * dynamic allocation for collisions. Unlike linear probing, all the entries + * that hash to a given bucket are stored within a fixed neighborhood starting + * at that bucket. Chaining is effectively represented as a bit vector + * relative to each bucket instead of as pointers or explicit offsets. + * + * When an empty bucket cannot be found within a given neighborhood, + * subsequent neighborhoods are searched, and one or more entries will "hop" + * into those neighborhoods. 
When this process works, an empty bucket will + * move into the desired neighborhood, allowing the entry to be added. When + * that process fails (typically when the buckets are around 90% full), the + * table must be resized and the all entries rehashed and added to the + * expanded table. + * + * Unlike linear probing, the number of buckets that must be searched in the + * worst case has a fixed upper bound (the size of the neighborhood). Those + * entries occupy a small number of memory cache lines, leading to improved + * use of the cache (fewer misses on both successful and unsuccessful + * searches). Hopscotch hashing outperforms linear probing at much higher load + * factors, so even with the increased memory burden for maintaining the hop + * vectors, less memory is needed to achieve that performance. Hopscotch is + * also immune to "contamination" from deleting entries since entries are + * genuinely removed instead of being replaced by a placeholder. + * + * The published description of the algorithm used a bit vector, but the paper + * alludes to an offset scheme which is used by this implementation. Since the + * entries in the neighborhood are within N entries of the hash bucket at the + * start of the neighborhood, a pair of small offset fields each log2(N) bits + * wide is all that's needed to maintain the hops as a linked list. In order + * to encode "no next hop" (i.e. NULL) as the natural initial value of zero, + * the offsets are biased by one (i.e. 0 => NULL, 1 => offset=0, 2 => + * offset=1, etc.) We can represent neighborhoods of up to 255 entries with + * just 8+8=16 bits per entry. The hop list is sorted by hop offset so the + * first entry in the list is always the bucket closest to the start of the + * neighborhood. + * + * While individual accesses tend to be very fast, the table resize operations + * are very, very expensive. If an upper bound on the latency of adding an + * entry to the table is needed, we either need to ensure the table is + * pre-sized to be large enough so no resize is ever needed, or we'll need to + * develop an approach to incrementally resize the table. + */ + +#include "pointer-map.h" + +#include "errors.h" +#include "logger.h" +#include "memory-alloc.h" +#include "numeric.h" +#include "permassert.h" + +enum { + DEFAULT_CAPACITY = 16, /* the number of neighborhoods in a new table */ + NEIGHBORHOOD = 255, /* the number of buckets in each neighborhood */ + MAX_PROBES = 1024, /* limit on the number of probes for a free bucket */ + NULL_HOP_OFFSET = 0, /* the hop offset value terminating the hop list */ + DEFAULT_LOAD = 75 /* a compromise between memory use and performance */ +}; + +/** + * struct bucket - Hash buckets. + * + * Buckets are packed together to reduce memory usage and improve cache + * efficiency. It would be tempting to encode the hop offsets separately and + * maintain alignment of key/value pairs, but it's crucial to keep the hop + * fields near the buckets that they use them so they'll tend to share cache + * lines. +*/ +struct __packed bucket { + /** + * @first_hop: The biased offset of the first entry in the hop list of + * the neighborhood that hashes to this bucket. + */ + uint8_t first_hop; + /** @next_hop: the biased offset of the next bucket in the hop list. */ + uint8_t next_hop; + /** @key: The key stored in this bucket. */ + const void *key; + /** @value: The value stored in this bucket (NULL if empty). 
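+ *
+ * A worked example of the biased hop encoding used by @first_hop and
+ * @next_hop (hypothetical indices): if a neighborhood starts at bucket B
+ * and its entries live in B+0 and B+3, then B.first_hop is 1 (biased
+ * offset 0), the next_hop of B+0 is 4 (biased offset 3), and the next_hop
+ * of B+3 is 0, i.e. NULL_HOP_OFFSET, ending the hop list.
+ * dereference_hop() below simply returns &neighborhood[hop_offset - 1].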
*/ + void *value; +}; + +/** + * struct pointer_map - The concrete definition of the opaque pointer_map + * type. + * + * To avoid having to wrap the neighborhoods of the last entries back around + * to the start of the bucket array, we allocate a few more buckets at the end + * of the array instead, which is why capacity and bucket_count are different. + */ +struct pointer_map { + /** @size: The number of entries stored in the map. */ + size_t size; + /** @capacity: The number of neighborhoods in the map. */ + size_t capacity; + /** @bucket_count: The number of buckets in the bucket array. */ + size_t bucket_count; + /** @buckets: The array of hash buckets. */ + struct bucket *buckets; + /** @comparator: The function for comparing keys for equality. */ + pointer_key_comparator *comparator; + /** @hasher: The function for getting a hash code from a key. */ + pointer_key_hasher *hasher; +}; + +/** + * allocate_buckets() - Initialize a pointer_map. + * @map: The map to initialize. + * @capacity: The initial capacity of the map. + * + * Return: UDS_SUCCESS or an error code. + */ +static int allocate_buckets(struct pointer_map *map, size_t capacity) +{ + map->size = 0; + map->capacity = capacity; + + /* + * Allocate NEIGHBORHOOD - 1 extra buckets so the last bucket can have a + * full neighborhood without have to wrap back around to element zero. + */ + map->bucket_count = capacity + (NEIGHBORHOOD - 1); + return UDS_ALLOCATE(map->bucket_count, + struct bucket, + "pointer_map buckets", + &map->buckets); +} + +/** + * make_pointer_map() - Allocate and initialize a pointer_map. + * @initial_capacity: The number of entries the map should initially be + * capable of holding (zero tells the map to use its own + * small default). + * @initial_load: The load factor of the map, expressed as an integer + * percentage (typically in the range 50 to 90, with zero + * telling the map to use its own default). + * @comparator: The function to use to compare the referents of two pointer + * keys for equality. + * @hasher: The function to use obtain the hash code associated with each + * pointer key + * @map_ptr: A pointer to hold the new pointer_map. + * + * Return: UDS_SUCCESS or an error code. + */ +int make_pointer_map(size_t initial_capacity, + unsigned int initial_load, + pointer_key_comparator comparator, + pointer_key_hasher hasher, + struct pointer_map **map_ptr) +{ + int result; + struct pointer_map *map; + size_t capacity; + + /* Use the default initial load if the caller did not specify one. */ + if (initial_load == 0) { + initial_load = DEFAULT_LOAD; + } + if (initial_load > 100) { + return UDS_INVALID_ARGUMENT; + } + + result = UDS_ALLOCATE(1, struct pointer_map, "pointer_map", &map); + if (result != UDS_SUCCESS) { + return result; + } + + map->hasher = hasher; + map->comparator = comparator; + + /* Use the default capacity if the caller did not specify one. */ + capacity = + (initial_capacity > 0) ? initial_capacity : DEFAULT_CAPACITY; + + /* + * Scale up the capacity by the specified initial load factor. + * (i.e to hold 1000 entries at 80% load we need a capacity of 1250) + */ + capacity = capacity * 100 / initial_load; + + result = allocate_buckets(map, capacity); + if (result != UDS_SUCCESS) { + free_pointer_map(UDS_FORGET(map)); + return result; + } + + *map_ptr = map; + return UDS_SUCCESS; +} + +/** + * free_pointer_map() - Free a pointer_map. + * @map: The pointer_map to free. 
+ * + * The map does not own the pointer keys and values stored in the map and they + * are not freed by this call. + */ +void free_pointer_map(struct pointer_map *map) +{ + if (map == NULL) { + return; + } + + UDS_FREE(UDS_FORGET(map->buckets)); + UDS_FREE(UDS_FORGET(map)); +} + +/** + * pointer_map_size() - Get the number of entries stored in a pointer_map. + * @map: The pointer_map to query. + * + * Return: The number of entries in the map. + */ +size_t pointer_map_size(const struct pointer_map *map) +{ + return map->size; +} + +/** + * dereference_hop() - Convert a biased hop offset within a neighborhood to a + * pointer to the bucket it references. + * @neighborhood: The first bucket in the neighborhood. + * @hop_offset: The biased hop offset to the desired bucket. + * + * Return: NULL if hop_offset is zero, otherwise a pointer to the bucket in + * the neighborhood at hop_offset - 1. + */ +static struct bucket *dereference_hop(struct bucket *neighborhood, + unsigned int hop_offset) +{ + if (hop_offset == NULL_HOP_OFFSET) { + return NULL; + } + + STATIC_ASSERT(NULL_HOP_OFFSET == 0); + return &neighborhood[hop_offset - 1]; +} + +/** + * insert_in_hop_list() - Add a bucket into the hop list for the neighborhood, + * inserting it into the list so the hop list remains + * sorted by hop offset. + * @neighborhood: The first bucket in the neighborhood. + * @new_bucket: The bucket to add to the hop list. + */ +static void insert_in_hop_list(struct bucket *neighborhood, + struct bucket *new_bucket) +{ + /* Zero indicates a NULL hop offset, so bias the hop offset by one. */ + int hop_offset = 1 + (new_bucket - neighborhood); + + /* Handle the special case of adding a bucket at the start of the list. */ + int next_hop = neighborhood->first_hop; + + if ((next_hop == NULL_HOP_OFFSET) || (next_hop > hop_offset)) { + new_bucket->next_hop = next_hop; + neighborhood->first_hop = hop_offset; + return; + } + + /* + * Search the hop list for the insertion point that maintains the sort + * order. + */ + for (;;) { + struct bucket *bucket = dereference_hop(neighborhood, next_hop); + + next_hop = bucket->next_hop; + + if ((next_hop == NULL_HOP_OFFSET) || (next_hop > hop_offset)) { + new_bucket->next_hop = next_hop; + bucket->next_hop = hop_offset; + return; + } + } +} + +/** + * select_bucket() - Select and return the hash bucket for a given search key. + * @map: The map to search. + * @key: The mapping key. + */ +static struct bucket *select_bucket(const struct pointer_map *map, + const void *key) +{ + /* + * Scale the 32-bit hash to a bucket index by treating it as a binary + * fraction and multiplying that by the capacity. If the hash is + * uniformly distributed over [0 .. 2^32-1], then (hash * capacity / + * 2^32) should be uniformly distributed over [0 .. capacity-1]. The + * multiply and shift is much faster than a divide (modulus) on X86 + * CPUs. + */ + uint64_t hash = map->hasher(key); + + return &map->buckets[(hash * map->capacity) >> 32]; +} + +/** + * search_hop_list() - Search the hop list. + * @map: The map being searched. + * @bucket: The map bucket to search for the key. + * @key: The mapping key. + * @previous_ptr: if not NULL, a pointer in which to store the bucket in the + * list preceding the one that had the matching key. + * + * Searches the hop list associated with given hash bucket for a given search + * key. If the key is found, returns a pointer to the entry (bucket or + * collision), otherwise returns NULL. + * + * Return: an entry that matches the key, or NULL if not found. 
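+ *
+ * As an aside on select_bucket() above, a concrete instance of the
+ * multiply-and-shift scaling (hypothetical numbers): with a capacity of
+ * 1250, a hash of 0x80000000 (one half as a binary fraction) maps to
+ * bucket (0x80000000 * 1250) >> 32 = 625, the middle of the table, and a
+ * hash of 0xFFFFFFFF maps to bucket 1249, the last neighborhood.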
+ */ +static struct bucket *search_hop_list(struct pointer_map *map, + struct bucket *bucket, + const void *key, + struct bucket **previous_ptr) +{ + struct bucket *previous = NULL; + unsigned int next_hop = bucket->first_hop; + + while (next_hop != NULL_HOP_OFFSET) { + /* + * Check the neighboring bucket indexed by the offset for the + * desired key. + */ + struct bucket *entry = dereference_hop(bucket, next_hop); + + if ((entry->value != NULL) && + map->comparator(key, entry->key)) { + if (previous_ptr != NULL) { + *previous_ptr = previous; + } + return entry; + } + next_hop = entry->next_hop; + previous = entry; + } + return NULL; +} + +/** + * pointer_map_get() - Retrieve the value associated with a given key from the + * pointer_map. + * @map: The pointer_map to query. + * @key: The key to look up (may be NULL if the comparator and hasher + * functions support it). + * + * Return: the value associated with the given key, or NULL if the key is not + * mapped to any value. + */ +void *pointer_map_get(struct pointer_map *map, const void *key) +{ + struct bucket *match = + search_hop_list(map, select_bucket(map, key), key, NULL); + return ((match != NULL) ? match->value : NULL); +} + +/** + * resize_buckets() - Increase the number of hash buckets and rehash all the + * existing entries, storing them in the new buckets. + * @map: The map to resize. + */ +static int resize_buckets(struct pointer_map *map) +{ + int result; + size_t i; + + /* Copy the top-level map data to the stack. */ + struct pointer_map old_map = *map; + + /* Re-initialize the map to be empty and 50% larger. */ + size_t new_capacity = map->capacity / 2 * 3; + + uds_log_info("%s: attempting resize from %zu to %zu, current size=%zu", + __func__, + map->capacity, + new_capacity, + map->size); + result = allocate_buckets(map, new_capacity); + if (result != UDS_SUCCESS) { + *map = old_map; + return result; + } + + /* Populate the new hash table from the entries in the old bucket array. */ + for (i = 0; i < old_map.bucket_count; i++) { + struct bucket *entry = &old_map.buckets[i]; + + if (entry->value == NULL) { + continue; + } + + result = pointer_map_put(map, entry->key, entry->value, + true, NULL); + if (result != UDS_SUCCESS) { + /* + * Destroy the new partial map and restore the map from + * the stack. + */ + UDS_FREE(UDS_FORGET(map->buckets)); + *map = old_map; + return result; + } + } + + /* Destroy the old bucket array. */ + UDS_FREE(UDS_FORGET(old_map.buckets)); + return UDS_SUCCESS; +} + +/** + * find_empty_bucket() - Probe the bucket array starting at the given bucket + * for the next empty bucket, returning a pointer to it. + * @map: The map containing the buckets to search. + * @bucket: The bucket at which to start probing. + * @max_probes: The maximum number of buckets to search. + * + * NULL will be returned if the search reaches the end of the bucket array or + * if the number of linear probes exceeds a specified limit. + * + * Return: The next empty bucket, or NULL if the search failed. + */ +static struct bucket *find_empty_bucket(struct pointer_map *map, + struct bucket *bucket, + unsigned int max_probes) +{ + /* + * Limit the search to either the nearer of the end of the bucket array + * or a fixed distance beyond the initial bucket. 
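+ * For instance (hypothetical numbers), with a capacity of 1024 the array
+ * holds 1024 + 254 buckets; probing from bucket 1200 leaves only 78
+ * candidates before the end of the array, well under MAX_PROBES, so the
+ * sentinel is the end of the array itself.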
+ */ + ptrdiff_t remaining = &map->buckets[map->bucket_count] - bucket; + struct bucket *sentinel = + &bucket[min(remaining, (ptrdiff_t) max_probes)]; + + struct bucket *entry; + + for (entry = bucket; entry < sentinel; entry++) { + if (entry->value == NULL) { + return entry; + } + } + return NULL; +} + +/** + * move_empty_bucket() - Move an empty bucket closer to the start of the + * bucket array. + * @map: The map containing the bucket. + * @hole: The empty bucket to fill with an entry that precedes it in one of + * its enclosing neighborhoods. + * + * This searches the neighborhoods that contain the empty bucket for a + * non-empty bucket closer to the start of the array. If such a bucket is + * found, this swaps the two buckets by moving the entry to the empty bucket. + * + * Return: The bucket that was vacated by moving its entry to the provided + * hole, or NULL if no entry could be moved. + */ +static struct bucket *move_empty_bucket(struct pointer_map *map + __attribute__((unused)), + struct bucket *hole) +{ + /* + * Examine every neighborhood that the empty bucket is part of, starting + * with the one in which it is the last bucket. No boundary check is + * needed for the negative array arithmetic since this function is only + * called when hole is at least NEIGHBORHOOD cells deeper into the array + * than a valid bucket. + */ + struct bucket *bucket; + + for (bucket = &hole[1 - NEIGHBORHOOD]; bucket < hole; bucket++) { + /* + * Find the entry that is nearest to the bucket, which means it + * will be nearest to the hash bucket whose neighborhood is + * full. + */ + struct bucket *new_hole = + dereference_hop(bucket, bucket->first_hop); + if (new_hole == NULL) { + /* + * There are no buckets in this neighborhood that are in + * use by this one (they must all be owned by + * overlapping neighborhoods). + */ + continue; + } + + /* + * Skip this bucket if its first entry is actually further away + * than the hole that we're already trying to fill. + */ + if (hole < new_hole) { + continue; + } + + /* + * We've found an entry in this neighborhood that we can "hop" + * further away, moving the hole closer to the hash bucket, if + * not all the way into its neighborhood. + */ + + /* + * The entry that will be the new hole is the first bucket in + * the list, so setting first_hop is all that's needed remove it + * from the list. + */ + bucket->first_hop = new_hole->next_hop; + new_hole->next_hop = NULL_HOP_OFFSET; + + /* Move the entry into the original hole. */ + hole->key = new_hole->key; + hole->value = new_hole->value; + new_hole->value = NULL; + + /* + * Insert the filled hole into the hop list for the + * neighborhood. + */ + insert_in_hop_list(bucket, hole); + return new_hole; + } + + /* We couldn't find an entry to relocate to the hole. */ + return NULL; +} + +/** + * update_mapping() - Find and update any existing mapping for a given key, + * returning the value associated with the key in the + * provided pointer. + * + * @map: The pointer_map to attempt to modify. + * @neighborhood: The first bucket in the neighborhood that would contain the + * search key. + * @key: The key with which to associate the new value. + * @new_value: The value to be associated with the key. + * @update: Whether to overwrite an existing value. + * @old_value_ptr: A pointer in which to store the old value (unmodified if no + * mapping was found). + * + * Return: true if the map contains a mapping for the key, false if it does + * not. 
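+ *
+ * As a recap of move_empty_bucket() above (behavior it already has, not
+ * new): when the nearest free bucket H is too far from the target
+ * neighborhood, some bucket B in the range [H - 254, H - 1] whose first
+ * hop-list entry lies at or before H gives up that closest entry; its key
+ * and value are copied into H, the vacated bucket becomes the new hole,
+ * and the hole has moved closer to the neighborhood that needs it.
+ * find_or_make_vacancy() below repeats this until the hole is within
+ * NEIGHBORHOOD of the target or no further move is possible.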
+ */ +static bool update_mapping(struct pointer_map *map, + struct bucket *neighborhood, + const void *key, + void *new_value, + bool update, + void **old_value_ptr) +{ + struct bucket *bucket = search_hop_list(map, neighborhood, key, NULL); + + if (bucket == NULL) { + /* There is no bucket containing the key in the neighborhood. */ + return false; + } + + /* + * Return the value of the current mapping (if desired) and update the + * mapping with the new value (if desired). + */ + if (old_value_ptr != NULL) { + *old_value_ptr = bucket->value; + } + if (update) { + /* + * We're dropping the old key pointer on the floor here, + * assuming it's a property of the value or that it's otherwise + * safe to just forget. + */ + bucket->key = key; + bucket->value = new_value; + } + return true; +} + +/** + * find_or_make_vacancy() - Find an empty bucket in a specified neighborhood + * for a new mapping or attempt to re-arrange + * mappings so there is such a bucket. + * @map: The pointer_map to search or modify. + * @neighborhood: The first bucket in the neighborhood in which + * an empty bucket is needed for a new mapping. + * + * This operation may fail (returning NULL) if an empty bucket is not + * available or could not be relocated to the neighborhood. + * + * Return: A pointer to an empty bucket in the desired neighborhood, or + * NULL if a vacancy could not be found or arranged. + */ +static struct bucket *find_or_make_vacancy(struct pointer_map *map, + struct bucket *neighborhood) +{ + /* Probe within and beyond the neighborhood for the first empty bucket. */ + struct bucket *hole = find_empty_bucket(map, neighborhood, MAX_PROBES); + + /* + * Keep trying until the empty bucket is in the bucket's neighborhood or + * we are unable to move it any closer by swapping it with a filled + * bucket. + */ + while (hole != NULL) { + int distance = hole - neighborhood; + + if (distance < NEIGHBORHOOD) { + /* + * We've found or relocated an empty bucket close enough + * to the initial hash bucket to be referenced by its + * hop vector. + */ + return hole; + } + + /* + * The nearest empty bucket isn't within the neighborhood that + * must contain the new entry, so try to swap it with bucket + * that is closer. + */ + hole = move_empty_bucket(map, hole); + } + + return NULL; +} + +/** + * pointer_map_put() - Try to associate a value (a pointer) with an integer in + * a pointer_map. + * @map: The pointer_map to attempt to modify. + * @key: The key with which to associate the new value (may be NULL if the + * comparator and hasher functions support it). + * @new_value: The value to be associated with the key. + * @update: Whether to overwrite an existing value. + * @old_value_ptr: A pointer in which to store either the old value (if the + * key was already mapped) or NULL if the map did not contain + * the key; NULL may be provided if the caller does not need + * to know the old value. + * + * If the map already contains a mapping for the provided key, the old value + * is only replaced with the specified value if update is true. In either case + * the old value is returned. If the map does not already contain a value for + * the specified key, the new value is added regardless of the value of + * update. + * + * If the value stored in the map is updated, then the key stored in the map + * will also be updated with the key provided by this call. The old key will + * not be returned due to the memory managment assumptions described in the + * interface header comment. 
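+ *
+ * A minimal usage sketch (map and key prepared elsewhere; new_value must
+ * not be NULL):
+ *
+ *	void *old = NULL;
+ *	int result = pointer_map_put(map, key, new_value, true, &old);
+ *
+ * On UDS_SUCCESS, 'old' holds the value that was previously mapped to the
+ * key, or NULL if the key was not in the map; passing update as false
+ * leaves an existing mapping untouched while still reporting it via 'old'.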
+ * + * Return: UDS_SUCCESS or an error code. + */ +int pointer_map_put(struct pointer_map *map, + const void *key, + void *new_value, + bool update, + void **old_value_ptr) +{ + struct bucket *neighborhood, *bucket; + + if (new_value == NULL) { + return UDS_INVALID_ARGUMENT; + } + + /* + * Select the bucket at the start of the neighborhood that must contain + * any entry for the provided key. + */ + neighborhood = select_bucket(map, key); + + /* + * Check whether the neighborhood already contains an entry for the key, + * in which case we optionally update it, returning the old value. + */ + if (update_mapping(map, neighborhood, key, new_value, update, + old_value_ptr)) { + return UDS_SUCCESS; + } + + /* + * Find an empty bucket in the desired neighborhood for the new entry or + * re-arrange entries in the map so there is such a bucket. This + * operation will usually succeed; the loop body will only be executed + * on the rare occasions that we have to resize the map. + */ + while ((bucket = find_or_make_vacancy(map, neighborhood)) == NULL) { + /* + * There is no empty bucket in which to put the new entry in the + * current map, so we're forced to allocate a new bucket array + * with a larger capacity, re-hash all the entries into those + * buckets, and try again (a very expensive operation for large + * maps). + */ + int result = resize_buckets(map); + + if (result != UDS_SUCCESS) { + return result; + } + + /* + * Resizing the map invalidates all pointers to buckets, so + * recalculate the neighborhood pointer. + */ + neighborhood = select_bucket(map, key); + } + + /* Put the new entry in the empty bucket, adding it to the neighborhood. */ + bucket->key = key; + bucket->value = new_value; + insert_in_hop_list(neighborhood, bucket); + map->size += 1; + + /* + * There was no existing entry, so there was no old value to be + * returned. + */ + if (old_value_ptr != NULL) { + *old_value_ptr = NULL; + } + return UDS_SUCCESS; +} + +/** + * pointer_map_remove() - Remove the mapping for a given key from the + * pointer_map. + * @map: The pointer_map from which to remove the mapping. + * @key: The key whose mapping is to be removed (may be NULL if the comparator + * and hasher functions support it). + * + * Return: the value that was associated with the key, or NULL if it was not + * mapped. + */ +void *pointer_map_remove(struct pointer_map *map, const void *key) +{ + void *value; + + /* Select the bucket to search and search it for an existing entry. */ + struct bucket *bucket = select_bucket(map, key); + struct bucket *previous; + struct bucket *victim = search_hop_list(map, bucket, key, &previous); + + if (victim == NULL) { + /* There is no matching entry to remove. */ + return NULL; + } + + /* + * We found an entry to remove. Save the mapped value to return later + * and empty the bucket. + */ + map->size -= 1; + value = victim->value; + victim->value = NULL; + victim->key = 0; + + /* + * The victim bucket is now empty, but it still needs to be spliced out + * of the hop list. + */ + if (previous == NULL) { + /* The victim is the head of the list, so swing first_hop. 
*/ + bucket->first_hop = victim->next_hop; + } else { + previous->next_hop = victim->next_hop; + } + victim->next_hop = NULL_HOP_OFFSET; + + return value; +} diff --git a/vdo/pointer-map.h b/vdo/pointer-map.h new file mode 100644 index 00000000..623bef15 --- /dev/null +++ b/vdo/pointer-map.h @@ -0,0 +1,86 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright Red Hat + */ + +#ifndef POINTER_MAP_H +#define POINTER_MAP_H + +#include "compiler.h" +#include "type-defs.h" + +/* + * A pointer_map associates pointer values (void *) with the data + * referenced by pointer keys (void *). NULL pointer + * values are not supported. A NULL key value is supported when + * the instance's key comparator and hasher functions support it. + * + * The map is implemented as hash table, which should provide constant-time + * insert, query, and remove operations, although the insert may occasionally + * grow the table, which is linear in the number of entries in the map. The + * table will grow as needed to hold new entries, but will not shrink as + * entries are removed. + * + * The key and value pointers passed to the map are retained and used by the + * map, but are not owned by the map. Freeing the map does not attempt to free + * the pointers. The client is entirely responsible for the memory managment + * of the keys and values. The current interface and implementation assume + * that keys will be properties of the values, or that keys will not be memory + * managed, or that keys will not need to be freed as a result of being + * replaced when a key is re-mapped. + */ + +struct pointer_map; + +/** + * typedef pointer_key_comparator - The prototype of functions that compare + * the referents of two pointer keys for + * equality. + * @this_key: The first element to compare. + * @that_key: The second element to compare. + * + * If two keys are equal, then both keys must have the same the hash code + * associated with them by the hasher function defined below. + * + * Return: true if and only if the referents of the two key pointers are to be + * treated as the same key by the map. + */ +typedef bool pointer_key_comparator(const void *this_key, const void *that_key); + +/** + * typedef pointer_key_hasher - The prototype of functions that get or + * calculate a hash code associated with the + * referent of pointer key. + * @key: The pointer key to hash. + * + * The hash code must be uniformly distributed over all uint32_t values. The + * hash code associated with a given key must not change while the key is in + * the map. If the comparator function says two keys are equal, then this + * function must return the same hash code for both keys. This function may be + * called many times for a key while an entry is stored for it in the map. + * + * Return: The hash code for the key. 
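+ *
+ * A sketch of a matching comparator and hasher pair for NUL-terminated
+ * string keys (the key type and the specific hash function here are only
+ * for illustration):
+ *
+ *	static bool compare_names(const void *this_key, const void *that_key)
+ *	{
+ *		return strcmp(this_key, that_key) == 0;
+ *	}
+ *
+ *	static uint32_t hash_name(const void *key)
+ *	{
+ *		const char *name = key;
+ *		uint32_t hash = 5381;
+ *
+ *		while (*name != '\0')
+ *			hash = (hash * 33) + (uint8_t) *name++;
+ *		return hash;
+ *	}
+ *
+ *	struct pointer_map *map;
+ *	int result = make_pointer_map(0, 0, compare_names, hash_name, &map);
+ *
+ * Equal strings hash identically here, satisfying the requirement above,
+ * and passing 0 for the capacity and load selects the map's defaults.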
+ */ +typedef uint32_t pointer_key_hasher(const void *key); + +int __must_check make_pointer_map(size_t initial_capacity, + unsigned int initial_load, + pointer_key_comparator comparator, + pointer_key_hasher hasher, + struct pointer_map **map_ptr); + +void free_pointer_map(struct pointer_map *map); + +size_t pointer_map_size(const struct pointer_map *map); + +void *pointer_map_get(struct pointer_map *map, const void *key); + +int __must_check pointer_map_put(struct pointer_map *map, + const void *key, + void *new_value, + bool update, + void **old_value_ptr); + +void *pointer_map_remove(struct pointer_map *map, const void *key); + +#endif /* POINTER_MAP_H */ diff --git a/vdo/pointerMap.c b/vdo/pointerMap.c deleted file mode 100644 index 7ba443ce..00000000 --- a/vdo/pointerMap.c +++ /dev/null @@ -1,665 +0,0 @@ -/* - * Copyright Red Hat - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/sulfur/src/c++/vdo/base/pointerMap.c#11 $ - */ - -/** - * Hash table implementation of a map from integers to pointers, implemented - * using the Hopscotch Hashing algorithm by Herlihy, Shavit, and Tzafrir (see - * http://en.wikipedia.org/wiki/Hopscotch_hashing). This implementation does - * not contain any of the locking/concurrency features of the algorithm, just - * the collision resolution scheme. - * - * Hopscotch Hashing is based on hashing with open addressing and linear - * probing. All the entries are stored in a fixed array of buckets, with no - * dynamic allocation for collisions. Unlike linear probing, all the entries - * that hash to a given bucket are stored within a fixed neighborhood starting - * at that bucket. Chaining is effectively represented as a bit vector - * relative to each bucket instead of as pointers or explicit offsets. - * - * When an empty bucket cannot be found within a given neighborhood, - * subsequent neighborhoods are searched, and one or more entries will "hop" - * into those neighborhoods. When this process works, an empty bucket will - * move into the desired neighborhood, allowing the entry to be added. When - * that process fails (typically when the buckets are around 90% full), the - * table must be resized and the all entries rehashed and added to the - * expanded table. - * - * Unlike linear probing, the number of buckets that must be searched in the - * worst case has a fixed upper bound (the size of the neighborhood). Those - * entries occupy a small number of memory cache lines, leading to improved - * use of the cache (fewer misses on both successful and unsuccessful - * searches). Hopscotch hashing outperforms linear probing at much higher load - * factors, so even with the increased memory burden for maintaining the hop - * vectors, less memory is needed to achieve that performance. 
Hopscotch is - * also immune to "contamination" from deleting entries since entries are - * genuinely removed instead of being replaced by a placeholder. - * - * The published description of the algorithm used a bit vector, but the paper - * alludes to an offset scheme which is used by this implementation. Since the - * entries in the neighborhood are within N entries of the hash bucket at the - * start of the neighborhood, a pair of small offset fields each log2(N) bits - * wide is all that's needed to maintain the hops as a linked list. In order - * to encode "no next hop" (i.e. NULL) as the natural initial value of zero, - * the offsets are biased by one (i.e. 0 => NULL, 1 => offset=0, 2 => - * offset=1, etc.) We can represent neighborhoods of up to 255 entries with - * just 8+8=16 bits per entry. The hop list is sorted by hop offset so the - * first entry in the list is always the bucket closest to the start of the - * neighborhood. - * - * While individual accesses tend to be very fast, the table resize operations - * are very very expensive. If an upper bound on the latency of adding an - * entry to the table is needed, we either need to ensure the table is - * pre-sized to be large enough so no resize is ever needed, or we'll need to - * develop an approach to incrementally resize the table. - **/ - -#include "pointerMap.h" - -#include "errors.h" -#include "logger.h" -#include "memoryAlloc.h" -#include "numeric.h" -#include "permassert.h" - -enum { - DEFAULT_CAPACITY = 16, // the number of neighborhoods in a new table - NEIGHBORHOOD = 255, // the number of buckets in each neighborhood - MAX_PROBES = 1024, // limit on the number of probes for a free bucket - NULL_HOP_OFFSET = 0, // the hop offset value terminating the hop list - DEFAULT_LOAD = 75 // a compromise between memory use and performance -}; - -/** - * Buckets are packed together to reduce memory usage and improve cache - * efficiency. It would be tempting to encode the hop offsets separately and - * maintain alignment of key/value pairs, but it's crucial to keep the hop - * fields near the buckets that they use them so they'll tend to share cache - * lines. - **/ -struct __packed bucket { - uint8_t first_hop; // the biased offset of the first entry in the hop - // list of the neighborhood that hashes to this - // bucket - uint8_t next_hop; // the biased offset of the next bucket in the hop - // list - const void *key; // the key stored in this bucket - void *value; // the value stored in this bucket (NULL if empty) -}; - -/** - * The concrete definition of the opaque pointer_map type. To avoid having to - * wrap the neighborhoods of the last entries back around to the start of the - * bucket array, we allocate a few more buckets at the end of the array - * instead, which is why capacity and bucket_count are different. - **/ -struct pointer_map { - /** the number of entries stored in the map */ - size_t size; - /** the number of neighborhoods in the map */ - size_t capacity; - /** the number of buckets in the bucket array */ - size_t bucket_count; - /** the array of hash buckets */ - struct bucket *buckets; - /** the function for comparing keys for equality */ - pointer_key_comparator *comparator; - /** the function for getting a hash code from a key */ - pointer_key_hasher *hasher; -}; - -/** - * Initialize a pointer_map. 
- * - * @param map the map to initialize - * @param capacity the initial capacity of the map - * - * @return UDS_SUCCESS or an error code - **/ -static int allocate_buckets(struct pointer_map *map, size_t capacity) -{ - map->size = 0; - map->capacity = capacity; - - // Allocate NEIGHBORHOOD - 1 extra buckets so the last bucket can have a - // full neighborhood without have to wrap back around to element zero. - map->bucket_count = capacity + (NEIGHBORHOOD - 1); - return UDS_ALLOCATE(map->bucket_count, - struct bucket, - "pointer_map buckets", - &map->buckets); -} - -/**********************************************************************/ -int make_pointer_map(size_t initial_capacity, - unsigned int initial_load, - pointer_key_comparator comparator, - pointer_key_hasher hasher, - struct pointer_map **map_ptr) -{ - int result; - struct pointer_map *map; - size_t capacity; - - // Use the default initial load if the caller did not specify one. - if (initial_load == 0) { - initial_load = DEFAULT_LOAD; - } - if (initial_load > 100) { - return UDS_INVALID_ARGUMENT; - } - - result = UDS_ALLOCATE(1, struct pointer_map, "pointer_map", &map); - if (result != UDS_SUCCESS) { - return result; - } - - map->hasher = hasher; - map->comparator = comparator; - - // Use the default capacity if the caller did not specify one. - capacity = - (initial_capacity > 0) ? initial_capacity : DEFAULT_CAPACITY; - - // Scale up the capacity by the specified initial load factor. - // (i.e to hold 1000 entries at 80% load we need a capacity of 1250) - capacity = capacity * 100 / initial_load; - - result = allocate_buckets(map, capacity); - if (result != UDS_SUCCESS) { - free_pointer_map(UDS_FORGET(map)); - return result; - } - - *map_ptr = map; - return UDS_SUCCESS; -} - -/**********************************************************************/ -void free_pointer_map(struct pointer_map *map) -{ - if (map == NULL) { - return; - } - - UDS_FREE(UDS_FORGET(map->buckets)); - UDS_FREE(UDS_FORGET(map)); -} - -/**********************************************************************/ -size_t pointer_map_size(const struct pointer_map *map) -{ - return map->size; -} - -/** - * Convert a biased hop offset within a neighborhood to a pointer to the - * bucket it references. - * - * @param neighborhood the first bucket in the neighborhood - * @param hop_offset the biased hop offset to the desired bucket - * - * @return NULL if hop_offset is zero, otherwise a pointer to - * the bucket in the neighborhood at hop_offset - 1 - **/ -static struct bucket *dereference_hop(struct bucket *neighborhood, - unsigned int hop_offset) -{ - if (hop_offset == NULL_HOP_OFFSET) { - return NULL; - } - - STATIC_ASSERT(NULL_HOP_OFFSET == 0); - return &neighborhood[hop_offset - 1]; -} - -/** - * Add a bucket into the hop list for the neighborhood, inserting it into the - * list so the hop list remains sorted by hop offset. - * - * @param neighborhood the first bucket in the neighborhood - * @param new_bucket the bucket to add to the hop list - **/ -static void insert_in_hop_list(struct bucket *neighborhood, - struct bucket *new_bucket) -{ - // Zero indicates a NULL hop offset, so bias the hop offset by one. - int hop_offset = 1 + (new_bucket - neighborhood); - - // Handle the special case of adding a bucket at the start of the list. 
- int next_hop = neighborhood->first_hop; - if ((next_hop == NULL_HOP_OFFSET) || (next_hop > hop_offset)) { - new_bucket->next_hop = next_hop; - neighborhood->first_hop = hop_offset; - return; - } - - // Search the hop list for the insertion point that maintains the sort - // order. - for (;;) { - struct bucket *bucket = dereference_hop(neighborhood, next_hop); - next_hop = bucket->next_hop; - - if ((next_hop == NULL_HOP_OFFSET) || (next_hop > hop_offset)) { - new_bucket->next_hop = next_hop; - bucket->next_hop = hop_offset; - return; - } - } -} - -/** - * Select and return the hash bucket for a given search key. - * - * @param map the map to search - * @param key the mapping key - **/ -static struct bucket *select_bucket(const struct pointer_map *map, - const void *key) -{ - /* - * Scale the 32-bit hash to a bucket index by treating it as a binary - * fraction and multiplying that by the capacity. If the hash is - * uniformly distributed over [0 .. 2^32-1], then (hash * capacity / - * 2^32) should be uniformly distributed over [0 .. capacity-1]. The - * multiply and shift is much faster than a divide (modulus) on X86 - * CPUs. - */ - uint64_t hash = map->hasher(key); - return &map->buckets[(hash * map->capacity) >> 32]; -} - -/** - * Search the hop list associated with given hash bucket for a given search - * key. If the key is found, returns a pointer to the entry (bucket or - * collision), otherwise returns NULL. - * - * @param [in] map the map being searched - * @param [in] bucket the map bucket to search for the key - * @param [in] key the mapping key - * @param [out] previous_ptr if not NULL, a pointer in which to - * store the bucket in the list preceding the one - * that had the matching key - * - * @return an entry that matches the key, or NULL if not found - **/ -static struct bucket *search_hop_list(struct pointer_map *map, - struct bucket *bucket, - const void *key, - struct bucket **previous_ptr) -{ - struct bucket *previous = NULL; - unsigned int next_hop = bucket->first_hop; - while (next_hop != NULL_HOP_OFFSET) { - // Check the neighboring bucket indexed by the offset for the - // desired key. - struct bucket *entry = dereference_hop(bucket, next_hop); - if ((entry->value != NULL) && - map->comparator(key, entry->key)) { - if (previous_ptr != NULL) { - *previous_ptr = previous; - } - return entry; - } - next_hop = entry->next_hop; - previous = entry; - } - return NULL; -} - -/**********************************************************************/ -void *pointer_map_get(struct pointer_map *map, const void *key) -{ - struct bucket *match = - search_hop_list(map, select_bucket(map, key), key, NULL); - return ((match != NULL) ? match->value : NULL); -} - -/** - * Increase the number of hash buckets and rehash all the existing entries, - * storing them in the new buckets. - * - * @param map the map to resize - **/ -static int resize_buckets(struct pointer_map *map) -{ - int result; - size_t i; - - // Copy the top-level map data to the stack. - struct pointer_map old_map = *map; - - // Re-initialize the map to be empty and 50% larger. - size_t new_capacity = map->capacity / 2 * 3; - uds_log_info("%s: attempting resize from %zu to %zu, current size=%zu", - __func__, - map->capacity, - new_capacity, - map->size); - result = allocate_buckets(map, new_capacity); - if (result != UDS_SUCCESS) { - *map = old_map; - return result; - } - - // Populate the new hash table from the entries in the old bucket array. 
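The multiply-and-shift scaling in select_bucket() above can be checked with a concrete value; this is just a worked instance of (hash * capacity) >> 32, with the example hash and capacity chosen arbitrarily:

    uint32_t hash = 0xC0000000;   /* roughly 0.75 when read as a binary fraction */
    size_t capacity = 1250;

    size_t index = ((uint64_t) hash * capacity) >> 32;
    /* index == 937, i.e. about 75% of the way through the 1250 neighborhoods */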
- for (i = 0; i < old_map.bucket_count; i++) { - struct bucket *entry = &old_map.buckets[i]; - if (entry->value == NULL) { - continue; - } - - result = pointer_map_put(map, entry->key, entry->value, - true, NULL); - if (result != UDS_SUCCESS) { - // Destroy the new partial map and restore the map from - // the stack. - UDS_FREE(UDS_FORGET(map->buckets)); - *map = old_map; - return result; - } - } - - // Destroy the old bucket array. - UDS_FREE(UDS_FORGET(old_map.buckets)); - return UDS_SUCCESS; -} - -/** - * Probe the bucket array starting at the given bucket for the next empty - * bucket, returning a pointer to it. NULL will be returned if - * the search reaches the end of the bucket array or if the number of linear - * probes exceeds a specified limit. - * - * @param map the map containing the buckets to search - * @param bucket the bucket at which to start probing - * @param max_probes the maximum number of buckets to search - * - * @return the next empty bucket, or NULL if the search failed - **/ -static struct bucket *find_empty_bucket(struct pointer_map *map, - struct bucket *bucket, - unsigned int max_probes) -{ - // Limit the search to either the nearer of the end of the bucket array - // or a fixed distance beyond the initial bucket. - ptrdiff_t remaining = &map->buckets[map->bucket_count] - bucket; - struct bucket *sentinel = - &bucket[min(remaining, (ptrdiff_t) max_probes)]; - - struct bucket *entry; - for (entry = bucket; entry < sentinel; entry++) { - if (entry->value == NULL) { - return entry; - } - } - return NULL; -} - -/** - * Move an empty bucket closer to the start of the bucket array. This searches - * the neighborhoods that contain the empty bucket for a non-empty bucket - * closer to the start of the array. If such a bucket is found, this swaps the - * two buckets by moving the entry to the empty bucket. - * - * @param map the map containing the bucket - * @param hole the empty bucket to fill with an entry that precedes it in one - * of its enclosing neighborhoods - * - * @return the bucket that was vacated by moving its entry to the provided - * hole, or NULL if no entry could be moved - **/ -static struct bucket *move_empty_bucket(struct pointer_map *map - __attribute__((unused)), - struct bucket *hole) -{ - /* - * Examine every neighborhood that the empty bucket is part of, starting - * with the one in which it is the last bucket. No boundary check is - * needed for the negative array arithmetic since this function is only - * called when hole is at least NEIGHBORHOOD cells deeper into the array - * than a valid bucket. - */ - struct bucket *bucket; - for (bucket = &hole[1 - NEIGHBORHOOD]; bucket < hole; bucket++) { - // Find the entry that is nearest to the bucket, which means it - // will be nearest to the hash bucket whose neighborhood is - // full. - struct bucket *new_hole = - dereference_hop(bucket, bucket->first_hop); - if (new_hole == NULL) { - // There are no buckets in this neighborhood that are in - // use by this one (they must all be owned by - // overlapping neighborhoods). - continue; - } - - // Skip this bucket if its first entry is actually further away - // than the hole that we're already trying to fill. - if (hole < new_hole) { - continue; - } - - /* - * We've found an entry in this neighborhood that we can "hop" - * further away, moving the hole closer to the hash bucket, if - * not all the way into its neighborhood. 
- */ - - // The entry that will be the new hole is the first bucket in - // the list, so setting first_hop is all that's needed remove it - // from the list. - bucket->first_hop = new_hole->next_hop; - new_hole->next_hop = NULL_HOP_OFFSET; - - // Move the entry into the original hole. - hole->key = new_hole->key; - hole->value = new_hole->value; - new_hole->value = NULL; - - // Insert the filled hole into the hop list for the - // neighborhood. - insert_in_hop_list(bucket, hole); - return new_hole; - } - - // We couldn't find an entry to relocate to the hole. - return NULL; -} - -/** - * Find and update any existing mapping for a given key, returning the value - * associated with the key in the provided pointer. - * - * @param [in] map the pointer_map to attempt to modify - * @param [in] neighborhood the first bucket in the neighborhood that - * would contain the search key - * @param [in] key the key with which to associate the new value - * @param [in] new_value the value to be associated with the key - * @param [in] update whether to overwrite an existing value - * @param [out] old_value_ptr a pointer in which to store the old value - * (unmodified if no mapping was found) - * - * @return true if the map contains a mapping for the key - * false if it does not - **/ -static bool update_mapping(struct pointer_map *map, - struct bucket *neighborhood, - const void *key, - void *new_value, - bool update, - void **old_value_ptr) -{ - struct bucket *bucket = search_hop_list(map, neighborhood, key, NULL); - if (bucket == NULL) { - // There is no bucket containing the key in the neighborhood. - return false; - } - - // Return the value of the current mapping (if desired) and update the - // mapping with the new value (if desired). - if (old_value_ptr != NULL) { - *old_value_ptr = bucket->value; - } - if (update) { - // We're dropping the old key pointer on the floor here, - // assuming it's a property of the value or that it's otherwise - // safe to just forget. - bucket->key = key; - bucket->value = new_value; - } - return true; -} - -/** - * Find an empty bucket in a specified neighborhood for a new mapping or - * attempt to re-arrange mappings so there is such a bucket. This operation - * may fail (returning NULL) if an empty bucket is not available or could not - * be relocated to the neighborhood. - * - * @param map the pointer_map to search or modify - * @param neighborhood the first bucket in the neighborhood in which - * an empty bucket is needed for a new mapping - * - * @return a pointer to an empty bucket in the desired neighborhood, or - * NULL if a vacancy could not be found or arranged - **/ -static struct bucket *find_or_make_vacancy(struct pointer_map *map, - struct bucket *neighborhood) -{ - // Probe within and beyond the neighborhood for the first empty bucket. - struct bucket *hole = find_empty_bucket(map, neighborhood, MAX_PROBES); - - // Keep trying until the empty bucket is in the bucket's neighborhood or - // we are unable to move it any closer by swapping it with a filled - // bucket. - while (hole != NULL) { - int distance = hole - neighborhood; - if (distance < NEIGHBORHOOD) { - // We've found or relocated an empty bucket close enough - // to the initial hash bucket to be referenced by its - // hop vector. - return hole; - } - - // The nearest empty bucket isn't within the neighborhood that - // must contain the new entry, so try to swap it with bucket - // that is closer. 
- hole = move_empty_bucket(map, hole); - } - - return NULL; -} - -/**********************************************************************/ -int pointer_map_put(struct pointer_map *map, - const void *key, - void *new_value, - bool update, - void **old_value_ptr) -{ - struct bucket *neighborhood, *bucket; - - if (new_value == NULL) { - return UDS_INVALID_ARGUMENT; - } - - // Select the bucket at the start of the neighborhood that must contain - // any entry for the provided key. - neighborhood = select_bucket(map, key); - - // Check whether the neighborhood already contains an entry for the key, - // in which case we optionally update it, returning the old value. - if (update_mapping(map, neighborhood, key, new_value, update, - old_value_ptr)) { - return UDS_SUCCESS; - } - - /* - * Find an empty bucket in the desired neighborhood for the new entry or - * re-arrange entries in the map so there is such a bucket. This - * operation will usually succeed; the loop body will only be executed - * on the rare occasions that we have to resize the map. - */ - while ((bucket = find_or_make_vacancy(map, neighborhood)) == NULL) { - /* - * There is no empty bucket in which to put the new entry in the - * current map, so we're forced to allocate a new bucket array - * with a larger capacity, re-hash all the entries into those - * buckets, and try again (a very expensive operation for large - * maps). - */ - int result = resize_buckets(map); - if (result != UDS_SUCCESS) { - return result; - } - - // Resizing the map invalidates all pointers to buckets, so - // recalculate the neighborhood pointer. - neighborhood = select_bucket(map, key); - } - - // Put the new entry in the empty bucket, adding it to the neighborhood. - bucket->key = key; - bucket->value = new_value; - insert_in_hop_list(neighborhood, bucket); - map->size += 1; - - // There was no existing entry, so there was no old value to be - // returned. - if (old_value_ptr != NULL) { - *old_value_ptr = NULL; - } - return UDS_SUCCESS; -} - -/**********************************************************************/ -void *pointer_map_remove(struct pointer_map *map, const void *key) -{ - void *value; - - // Select the bucket to search and search it for an existing entry. - struct bucket *bucket = select_bucket(map, key); - struct bucket *previous; - struct bucket *victim = search_hop_list(map, bucket, key, &previous); - - if (victim == NULL) { - // There is no matching entry to remove. - return NULL; - } - - // We found an entry to remove. Save the mapped value to return later - // and empty the bucket. - map->size -= 1; - value = victim->value; - victim->value = NULL; - victim->key = 0; - - // The victim bucket is now empty, but it still needs to be spliced out - // of the hop list. - if (previous == NULL) { - // The victim is the head of the list, so swing first_hop. - bucket->first_hop = victim->next_hop; - } else { - previous->next_hop = victim->next_hop; - } - victim->next_hop = NULL_HOP_OFFSET; - - return value; -} diff --git a/vdo/pointerMap.h b/vdo/pointerMap.h deleted file mode 100644 index f64282c7..00000000 --- a/vdo/pointerMap.h +++ /dev/null @@ -1,175 +0,0 @@ -/* - * Copyright Red Hat - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. 
- * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/sulfur/src/c++/vdo/base/pointerMap.h#7 $ - */ - -#ifndef POINTER_MAP_H -#define POINTER_MAP_H - -#include "common.h" - -/** - * A pointer_map associates pointer values (void *) with the data - * referenced by pointer keys (void *). NULL pointer - * values are not supported. A NULL key value is supported when - * the instance's key comparator and hasher functions support it. - * - * The map is implemented as hash table, which should provide constant-time - * insert, query, and remove operations, although the insert may occasionally - * grow the table, which is linear in the number of entries in the map. The - * table will grow as needed to hold new entries, but will not shrink as - * entries are removed. - * - * The key and value pointers passed to the map are retained and used by the - * map, but are not owned by the map. Freeing the map does not attempt to free - * the pointers. The client is entirely responsible for the memory managment - * of the keys and values. The current interface and implementation assume - * that keys will be properties of the values, or that keys will not be memory - * managed, or that keys will not need to be freed as a result of being - * replaced when a key is re-mapped. - **/ - -struct pointer_map; - -/** - * The prototype of functions that compare the referents of two pointer keys - * for equality. If two keys are equal, then both keys must have the same the - * hash code associated with them by the hasher function defined below. - - * @param this_key The first element to compare - * @param that_key The second element to compare - * - * @return true if and only if the referents of the two - * key pointers are to be treated as the same key by the map - **/ -typedef bool pointer_key_comparator(const void *this_key, const void *that_key); - -/** - * The prototype of functions that get or calculate a hash code associated - * with the referent of pointer key. The hash code must be uniformly - * distributed over all uint32_t values. The hash code associated with a given - * key must not change while the key is in the map. If the comparator function - * says two keys are equal, then this function must return the same hash code - * for both keys. This function may be called many times for a key while an - * entry is stored for it in the map. - * - * @param key The pointer key to hash - * - * @return the hash code for the key - **/ -typedef uint32_t pointer_key_hasher(const void *key); - -/** - * Allocate and initialize a pointer_map. 
- * - * @param [in] initial_capacity The number of entries the map should - * initially be capable of holding (zero tells - * the map to use its own small default) - * @param [in] initial_load The load factor of the map, expressed as an - * integer percentage (typically in the range - * 50 to 90, with zero telling the map to use - * its own default) - * @param [in] comparator The function to use to compare the referents - * of two pointer keys for equality - * @param [in] hasher The function to use obtain the hash code - * associated with each pointer key - * @param [out] map_ptr A pointer to hold the new pointer_map - * - * @return UDS_SUCCESS or an error code - **/ -int __must_check make_pointer_map(size_t initial_capacity, - unsigned int initial_load, - pointer_key_comparator comparator, - pointer_key_hasher hasher, - struct pointer_map **map_ptr); - -/** - * Free a pointer_map. NOTE: The map does not own the pointer keys and values - * stored in the map and they are not freed by this call. - * - * @param map The pointer_map to free - **/ -void free_pointer_map(struct pointer_map *map); - -/** - * Get the number of entries stored in a pointer_map. - * - * @param map The pointer_map to query - * - * @return the number of entries in the map - **/ -size_t pointer_map_size(const struct pointer_map *map); - -/** - * Retrieve the value associated with a given key from the pointer_map. - * - * @param map The pointer_map to query - * @param key The key to look up (may be NULL if the - * comparator and hasher functions support it) - * - * @return the value associated with the given key, or NULL - * if the key is not mapped to any value - **/ -void *pointer_map_get(struct pointer_map *map, const void *key); - -/** - * Try to associate a value (a pointer) with an integer in a pointer_map. - * If the map already contains a mapping for the provided key, the old value is - * only replaced with the specified value if update is true. In either case - * the old value is returned. If the map does not already contain a value for - * the specified key, the new value is added regardless of the value of update. - * - * If the value stored in the map is updated, then the key stored in the map - * will also be updated with the key provided by this call. The old key will - * not be returned due to the memory managment assumptions described in the - * interface header comment. - * - * @param [in] map The pointer_map to attempt to modify - * @param [in] key The key with which to associate the new value - * (may be NULL if the comparator and - * hasher functions support it) - * @param [in] new_value The value to be associated with the key - * @param [in] update Whether to overwrite an existing value - * @param [out] old_value_ptr A pointer in which to store either the old value - * (if the key was already mapped) or - * NULL if the map did not contain the - * key; NULL may be provided if the - * caller does not need to know the old value - * - * @return UDS_SUCCESS or an error code - **/ -int __must_check pointer_map_put(struct pointer_map *map, - const void *key, - void *new_value, - bool update, - void **old_value_ptr); - -/** - * Remove the mapping for a given key from the pointer_map. 
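A short usage sketch of the interface documented above; the comparator and hasher shown here (compare_string_keys and hash_string_key, using FNV-1a) are illustrative assumptions rather than functions provided by this code, and a string header providing strcmp is assumed to be available:

    static bool compare_string_keys(const void *this_key, const void *that_key)
    {
            return strcmp(this_key, that_key) == 0;
    }

    static uint32_t hash_string_key(const void *key)
    {
            const char *s = key;
            uint32_t hash = 2166136261u;   /* FNV-1a offset basis, for illustration */

            while (*s != '\0') {
                    hash ^= (uint8_t) *s++;
                    hash *= 16777619u;
            }
            return hash;
    }

    static int pointer_map_example(void)
    {
            static int forty_two = 42;
            struct pointer_map *map;
            void *old_value = NULL;
            int result;

            /* 0, 0 selects the default capacity and default load factor. */
            result = make_pointer_map(0, 0, compare_string_keys,
                                      hash_string_key, &map);
            if (result != UDS_SUCCESS)
                    return result;

            /* Keys and values are referenced, not copied; values must be non-NULL. */
            result = pointer_map_put(map, "answer", &forty_two, true, &old_value);
            if (result == UDS_SUCCESS) {
                    int *value = pointer_map_get(map, "answer");   /* => &forty_two */
                    (void) value;
                    pointer_map_remove(map, "answer");
            }

            free_pointer_map(map);
            return result;
    }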
- * - * @param map The pointer_map from which to remove the mapping - * @param key The key whose mapping is to be removed (may be NULL - * if the comparator and hasher functions support it) - * - * @return the value that was associated with the key, or - * NULL if it was not mapped - **/ -void *pointer_map_remove(struct pointer_map *map, const void *key); - -#endif /* POINTER_MAP_H */ diff --git a/vdo/poolSysfsStats.c b/vdo/pool-sysfs-stats.c similarity index 77% rename from vdo/poolSysfsStats.c rename to vdo/pool-sysfs-stats.c index f927f184..9d1a5fd5 100644 --- a/vdo/poolSysfsStats.c +++ b/vdo/pool-sysfs-stats.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * Copyright Red Hat * @@ -17,11 +18,13 @@ * 02110-1301, USA. */ -#include "dedupeIndex.h" +#include + #include "logger.h" -#include "poolSysfs.h" + +#include "dedupe-index.h" +#include "pool-sysfs.h" #include "statistics.h" -#include "threadDevice.h" #include "vdo.h" struct pool_stats_attribute { @@ -43,7 +46,7 @@ static ssize_t pool_stats_attr_show(struct kobject *directory, } mutex_lock(&vdo->stats_mutex); - fetch_vdo_statistics(vdo, &vdo->stats_buffer); + vdo_fetch_statistics(vdo, &vdo->stats_buffer); size = pool_stats_attr->print(&vdo->stats_buffer, buf); mutex_unlock(&vdo->stats_mutex); @@ -55,8 +58,7 @@ struct sysfs_ops vdo_pool_stats_sysfs_ops = { .store = NULL, }; -/**********************************************************************/ -/** Number of blocks used for data */ +/* Number of blocks used for data */ static ssize_t pool_stats_print_data_blocks_used(struct vdo_statistics *stats, char *buf) { @@ -68,8 +70,7 @@ static struct pool_stats_attribute pool_stats_attr_data_blocks_used = { .print = pool_stats_print_data_blocks_used, }; -/**********************************************************************/ -/** Number of blocks used for VDO metadata */ +/* Number of blocks used for VDO metadata */ static ssize_t pool_stats_print_overhead_blocks_used(struct vdo_statistics *stats, char *buf) { @@ -81,8 +82,7 @@ static struct pool_stats_attribute pool_stats_attr_overhead_blocks_used = { .print = pool_stats_print_overhead_blocks_used, }; -/**********************************************************************/ -/** Number of logical blocks that are currently mapped to physical blocks */ +/* Number of logical blocks that are currently mapped to physical blocks */ static ssize_t pool_stats_print_logical_blocks_used(struct vdo_statistics *stats, char *buf) { @@ -94,8 +94,7 @@ static struct pool_stats_attribute pool_stats_attr_logical_blocks_used = { .print = pool_stats_print_logical_blocks_used, }; -/**********************************************************************/ -/** number of physical blocks */ +/* number of physical blocks */ static ssize_t pool_stats_print_physical_blocks(struct vdo_statistics *stats, char *buf) { @@ -107,8 +106,7 @@ static struct pool_stats_attribute pool_stats_attr_physical_blocks = { .print = pool_stats_print_physical_blocks, }; -/**********************************************************************/ -/** number of logical blocks */ +/* number of logical blocks */ static ssize_t pool_stats_print_logical_blocks(struct vdo_statistics *stats, char *buf) { @@ -120,8 +118,7 @@ static struct pool_stats_attribute pool_stats_attr_logical_blocks = { .print = pool_stats_print_logical_blocks, }; -/**********************************************************************/ -/** Size of the block map page cache, in bytes */ +/* Size of the block map page cache, in bytes */ static ssize_t 
pool_stats_print_block_map_cache_size(struct vdo_statistics *stats, char *buf) { @@ -133,8 +130,7 @@ static struct pool_stats_attribute pool_stats_attr_block_map_cache_size = { .print = pool_stats_print_block_map_cache_size, }; -/**********************************************************************/ -/** The physical block size */ +/* The physical block size */ static ssize_t pool_stats_print_block_size(struct vdo_statistics *stats, char *buf) { @@ -146,8 +142,7 @@ static struct pool_stats_attribute pool_stats_attr_block_size = { .print = pool_stats_print_block_size, }; -/**********************************************************************/ -/** Number of times the VDO has successfully recovered */ +/* Number of times the VDO has successfully recovered */ static ssize_t pool_stats_print_complete_recoveries(struct vdo_statistics *stats, char *buf) { @@ -159,8 +154,7 @@ static struct pool_stats_attribute pool_stats_attr_complete_recoveries = { .print = pool_stats_print_complete_recoveries, }; -/**********************************************************************/ -/** Number of times the VDO has recovered from read-only mode */ +/* Number of times the VDO has recovered from read-only mode */ static ssize_t pool_stats_print_read_only_recoveries(struct vdo_statistics *stats, char *buf) { @@ -172,8 +166,7 @@ static struct pool_stats_attribute pool_stats_attr_read_only_recoveries = { .print = pool_stats_print_read_only_recoveries, }; -/**********************************************************************/ -/** String describing the operating mode of the VDO */ +/* String describing the operating mode of the VDO */ static ssize_t pool_stats_print_mode(struct vdo_statistics *stats, char *buf) { @@ -185,8 +178,7 @@ static struct pool_stats_attribute pool_stats_attr_mode = { .print = pool_stats_print_mode, }; -/**********************************************************************/ -/** Whether the VDO is in recovery mode */ +/* Whether the VDO is in recovery mode */ static ssize_t pool_stats_print_in_recovery_mode(struct vdo_statistics *stats, char *buf) { @@ -198,8 +190,7 @@ static struct pool_stats_attribute pool_stats_attr_in_recovery_mode = { .print = pool_stats_print_in_recovery_mode, }; -/**********************************************************************/ -/** What percentage of recovery mode work has been completed */ +/* What percentage of recovery mode work has been completed */ static ssize_t pool_stats_print_recovery_percentage(struct vdo_statistics *stats, char *buf) { @@ -211,8 +202,7 @@ static struct pool_stats_attribute pool_stats_attr_recovery_percentage = { .print = pool_stats_print_recovery_percentage, }; -/**********************************************************************/ -/** Number of compressed data items written since startup */ +/* Number of compressed data items written since startup */ static ssize_t pool_stats_print_packer_compressed_fragments_written(struct vdo_statistics *stats, char *buf) { @@ -224,8 +214,7 @@ static struct pool_stats_attribute pool_stats_attr_packer_compressed_fragments_w .print = pool_stats_print_packer_compressed_fragments_written, }; -/**********************************************************************/ -/** Number of blocks containing compressed items written since startup */ +/* Number of blocks containing compressed items written since startup */ static ssize_t pool_stats_print_packer_compressed_blocks_written(struct vdo_statistics *stats, char *buf) { @@ -237,8 +226,7 @@ static struct pool_stats_attribute 
pool_stats_attr_packer_compressed_blocks_writ .print = pool_stats_print_packer_compressed_blocks_written, }; -/**********************************************************************/ -/** Number of VIOs that are pending in the packer */ +/* Number of VIOs that are pending in the packer */ static ssize_t pool_stats_print_packer_compressed_fragments_in_packer(struct vdo_statistics *stats, char *buf) { @@ -250,8 +238,7 @@ static struct pool_stats_attribute pool_stats_attr_packer_compressed_fragments_i .print = pool_stats_print_packer_compressed_fragments_in_packer, }; -/**********************************************************************/ -/** The total number of slabs from which blocks may be allocated */ +/* The total number of slabs from which blocks may be allocated */ static ssize_t pool_stats_print_allocator_slab_count(struct vdo_statistics *stats, char *buf) { @@ -263,8 +250,7 @@ static struct pool_stats_attribute pool_stats_attr_allocator_slab_count = { .print = pool_stats_print_allocator_slab_count, }; -/**********************************************************************/ -/** The total number of slabs from which blocks have ever been allocated */ +/* The total number of slabs from which blocks have ever been allocated */ static ssize_t pool_stats_print_allocator_slabs_opened(struct vdo_statistics *stats, char *buf) { @@ -276,8 +262,7 @@ static struct pool_stats_attribute pool_stats_attr_allocator_slabs_opened = { .print = pool_stats_print_allocator_slabs_opened, }; -/**********************************************************************/ -/** The number of times since loading that a slab has been re-opened */ +/* The number of times since loading that a slab has been re-opened */ static ssize_t pool_stats_print_allocator_slabs_reopened(struct vdo_statistics *stats, char *buf) { @@ -289,8 +274,7 @@ static struct pool_stats_attribute pool_stats_attr_allocator_slabs_reopened = { .print = pool_stats_print_allocator_slabs_reopened, }; -/**********************************************************************/ -/** Number of times the on-disk journal was full */ +/* Number of times the on-disk journal was full */ static ssize_t pool_stats_print_journal_disk_full(struct vdo_statistics *stats, char *buf) { @@ -302,8 +286,7 @@ static struct pool_stats_attribute pool_stats_attr_journal_disk_full = { .print = pool_stats_print_journal_disk_full, }; -/**********************************************************************/ -/** Number of times the recovery journal requested slab journal commits. */ +/* Number of times the recovery journal requested slab journal commits. 
*/ static ssize_t pool_stats_print_journal_slab_journal_commits_requested(struct vdo_statistics *stats, char *buf) { @@ -315,8 +298,7 @@ static struct pool_stats_attribute pool_stats_attr_journal_slab_journal_commits_ .print = pool_stats_print_journal_slab_journal_commits_requested, }; -/**********************************************************************/ -/** The total number of items on which processing has started */ +/* The total number of items on which processing has started */ static ssize_t pool_stats_print_journal_entries_started(struct vdo_statistics *stats, char *buf) { @@ -328,8 +310,7 @@ static struct pool_stats_attribute pool_stats_attr_journal_entries_started = { .print = pool_stats_print_journal_entries_started, }; -/**********************************************************************/ -/** The total number of items for which a write operation has been issued */ +/* The total number of items for which a write operation has been issued */ static ssize_t pool_stats_print_journal_entries_written(struct vdo_statistics *stats, char *buf) { @@ -341,8 +322,7 @@ static struct pool_stats_attribute pool_stats_attr_journal_entries_written = { .print = pool_stats_print_journal_entries_written, }; -/**********************************************************************/ -/** The total number of items for which a write operation has completed */ +/* The total number of items for which a write operation has completed */ static ssize_t pool_stats_print_journal_entries_committed(struct vdo_statistics *stats, char *buf) { @@ -354,8 +334,7 @@ static struct pool_stats_attribute pool_stats_attr_journal_entries_committed = { .print = pool_stats_print_journal_entries_committed, }; -/**********************************************************************/ -/** The total number of items on which processing has started */ +/* The total number of items on which processing has started */ static ssize_t pool_stats_print_journal_blocks_started(struct vdo_statistics *stats, char *buf) { @@ -367,8 +346,7 @@ static struct pool_stats_attribute pool_stats_attr_journal_blocks_started = { .print = pool_stats_print_journal_blocks_started, }; -/**********************************************************************/ -/** The total number of items for which a write operation has been issued */ +/* The total number of items for which a write operation has been issued */ static ssize_t pool_stats_print_journal_blocks_written(struct vdo_statistics *stats, char *buf) { @@ -380,8 +358,7 @@ static struct pool_stats_attribute pool_stats_attr_journal_blocks_written = { .print = pool_stats_print_journal_blocks_written, }; -/**********************************************************************/ -/** The total number of items for which a write operation has completed */ +/* The total number of items for which a write operation has completed */ static ssize_t pool_stats_print_journal_blocks_committed(struct vdo_statistics *stats, char *buf) { @@ -393,8 +370,7 @@ static struct pool_stats_attribute pool_stats_attr_journal_blocks_committed = { .print = pool_stats_print_journal_blocks_committed, }; -/**********************************************************************/ -/** Number of times the on-disk journal was full */ +/* Number of times the on-disk journal was full */ static ssize_t pool_stats_print_slab_journal_disk_full_count(struct vdo_statistics *stats, char *buf) { @@ -406,8 +382,7 @@ static struct pool_stats_attribute pool_stats_attr_slab_journal_disk_full_count .print = 
pool_stats_print_slab_journal_disk_full_count, }; -/**********************************************************************/ -/** Number of times an entry was added over the flush threshold */ +/* Number of times an entry was added over the flush threshold */ static ssize_t pool_stats_print_slab_journal_flush_count(struct vdo_statistics *stats, char *buf) { @@ -419,8 +394,7 @@ static struct pool_stats_attribute pool_stats_attr_slab_journal_flush_count = { .print = pool_stats_print_slab_journal_flush_count, }; -/**********************************************************************/ -/** Number of times an entry was added over the block threshold */ +/* Number of times an entry was added over the block threshold */ static ssize_t pool_stats_print_slab_journal_blocked_count(struct vdo_statistics *stats, char *buf) { @@ -432,8 +406,7 @@ static struct pool_stats_attribute pool_stats_attr_slab_journal_blocked_count = .print = pool_stats_print_slab_journal_blocked_count, }; -/**********************************************************************/ -/** Number of times a tail block was written */ +/* Number of times a tail block was written */ static ssize_t pool_stats_print_slab_journal_blocks_written(struct vdo_statistics *stats, char *buf) { @@ -445,8 +418,7 @@ static struct pool_stats_attribute pool_stats_attr_slab_journal_blocks_written = .print = pool_stats_print_slab_journal_blocks_written, }; -/**********************************************************************/ -/** Number of times we had to wait for the tail to write */ +/* Number of times we had to wait for the tail to write */ static ssize_t pool_stats_print_slab_journal_tail_busy_count(struct vdo_statistics *stats, char *buf) { @@ -458,8 +430,7 @@ static struct pool_stats_attribute pool_stats_attr_slab_journal_tail_busy_count .print = pool_stats_print_slab_journal_tail_busy_count, }; -/**********************************************************************/ -/** Number of blocks written */ +/* Number of blocks written */ static ssize_t pool_stats_print_slab_summary_blocks_written(struct vdo_statistics *stats, char *buf) { @@ -471,8 +442,7 @@ static struct pool_stats_attribute pool_stats_attr_slab_summary_blocks_written = .print = pool_stats_print_slab_summary_blocks_written, }; -/**********************************************************************/ -/** Number of reference blocks written */ +/* Number of reference blocks written */ static ssize_t pool_stats_print_ref_counts_blocks_written(struct vdo_statistics *stats, char *buf) { @@ -484,8 +454,7 @@ static struct pool_stats_attribute pool_stats_attr_ref_counts_blocks_written = { .print = pool_stats_print_ref_counts_blocks_written, }; -/**********************************************************************/ -/** number of dirty (resident) pages */ +/* number of dirty (resident) pages */ static ssize_t pool_stats_print_block_map_dirty_pages(struct vdo_statistics *stats, char *buf) { @@ -497,8 +466,7 @@ static struct pool_stats_attribute pool_stats_attr_block_map_dirty_pages = { .print = pool_stats_print_block_map_dirty_pages, }; -/**********************************************************************/ -/** number of clean (resident) pages */ +/* number of clean (resident) pages */ static ssize_t pool_stats_print_block_map_clean_pages(struct vdo_statistics *stats, char *buf) { @@ -510,8 +478,7 @@ static struct pool_stats_attribute pool_stats_attr_block_map_clean_pages = { .print = pool_stats_print_block_map_clean_pages, }; 
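Each remaining hunk in this file repeats one pattern: a print helper formats a single field of struct vdo_statistics, and a pool_stats_attribute wires it to sysfs so that pool_stats_attr_show() (earlier in this file) can take the stats mutex, call vdo_fetch_statistics(), and delegate to the helper. A minimal sketch of one more counter in that style; the example_count field, the helper names, and the sprintf formatting are assumptions, not part of the patch:

    /* example_count is a hypothetical 64-bit field of struct vdo_statistics. */
    static ssize_t pool_stats_print_example_count(struct vdo_statistics *stats,
                                                  char *buf)
    {
            return sprintf(buf, "%llu\n",
                           (unsigned long long) stats->example_count);
    }

    static struct pool_stats_attribute pool_stats_attr_example_count = {
            .print = pool_stats_print_example_count,
            /* any other members of the real struct are omitted here, since the
             * surrounding context lines only show the .print initializer */
    };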
-/**********************************************************************/ -/** number of free pages */ +/* number of free pages */ static ssize_t pool_stats_print_block_map_free_pages(struct vdo_statistics *stats, char *buf) { @@ -523,8 +490,7 @@ static struct pool_stats_attribute pool_stats_attr_block_map_free_pages = { .print = pool_stats_print_block_map_free_pages, }; -/**********************************************************************/ -/** number of pages in failed state */ +/* number of pages in failed state */ static ssize_t pool_stats_print_block_map_failed_pages(struct vdo_statistics *stats, char *buf) { @@ -536,8 +502,7 @@ static struct pool_stats_attribute pool_stats_attr_block_map_failed_pages = { .print = pool_stats_print_block_map_failed_pages, }; -/**********************************************************************/ -/** number of pages incoming */ +/* number of pages incoming */ static ssize_t pool_stats_print_block_map_incoming_pages(struct vdo_statistics *stats, char *buf) { @@ -549,8 +514,7 @@ static struct pool_stats_attribute pool_stats_attr_block_map_incoming_pages = { .print = pool_stats_print_block_map_incoming_pages, }; -/**********************************************************************/ -/** number of pages outgoing */ +/* number of pages outgoing */ static ssize_t pool_stats_print_block_map_outgoing_pages(struct vdo_statistics *stats, char *buf) { @@ -562,8 +526,7 @@ static struct pool_stats_attribute pool_stats_attr_block_map_outgoing_pages = { .print = pool_stats_print_block_map_outgoing_pages, }; -/**********************************************************************/ -/** how many times free page not avail */ +/* how many times free page not avail */ static ssize_t pool_stats_print_block_map_cache_pressure(struct vdo_statistics *stats, char *buf) { @@ -575,8 +538,7 @@ static struct pool_stats_attribute pool_stats_attr_block_map_cache_pressure = { .print = pool_stats_print_block_map_cache_pressure, }; -/**********************************************************************/ -/** number of get_vdo_page() calls for read */ +/* number of get_vdo_page() calls for read */ static ssize_t pool_stats_print_block_map_read_count(struct vdo_statistics *stats, char *buf) { @@ -588,8 +550,7 @@ static struct pool_stats_attribute pool_stats_attr_block_map_read_count = { .print = pool_stats_print_block_map_read_count, }; -/**********************************************************************/ -/** number of get_vdo_page() calls for write */ +/* number of get_vdo_page() calls for write */ static ssize_t pool_stats_print_block_map_write_count(struct vdo_statistics *stats, char *buf) { @@ -601,8 +562,7 @@ static struct pool_stats_attribute pool_stats_attr_block_map_write_count = { .print = pool_stats_print_block_map_write_count, }; -/**********************************************************************/ -/** number of times pages failed to read */ +/* number of times pages failed to read */ static ssize_t pool_stats_print_block_map_failed_reads(struct vdo_statistics *stats, char *buf) { @@ -614,8 +574,7 @@ static struct pool_stats_attribute pool_stats_attr_block_map_failed_reads = { .print = pool_stats_print_block_map_failed_reads, }; -/**********************************************************************/ -/** number of times pages failed to write */ +/* number of times pages failed to write */ static ssize_t pool_stats_print_block_map_failed_writes(struct vdo_statistics *stats, char *buf) { @@ -627,8 +586,7 @@ static struct pool_stats_attribute 
pool_stats_attr_block_map_failed_writes = { .print = pool_stats_print_block_map_failed_writes, }; -/**********************************************************************/ -/** number of gets that are reclaimed */ +/* number of gets that are reclaimed */ static ssize_t pool_stats_print_block_map_reclaimed(struct vdo_statistics *stats, char *buf) { @@ -640,8 +598,7 @@ static struct pool_stats_attribute pool_stats_attr_block_map_reclaimed = { .print = pool_stats_print_block_map_reclaimed, }; -/**********************************************************************/ -/** number of gets for outgoing pages */ +/* number of gets for outgoing pages */ static ssize_t pool_stats_print_block_map_read_outgoing(struct vdo_statistics *stats, char *buf) { @@ -653,8 +610,7 @@ static struct pool_stats_attribute pool_stats_attr_block_map_read_outgoing = { .print = pool_stats_print_block_map_read_outgoing, }; -/**********************************************************************/ -/** number of gets that were already there */ +/* number of gets that were already there */ static ssize_t pool_stats_print_block_map_found_in_cache(struct vdo_statistics *stats, char *buf) { @@ -666,8 +622,7 @@ static struct pool_stats_attribute pool_stats_attr_block_map_found_in_cache = { .print = pool_stats_print_block_map_found_in_cache, }; -/**********************************************************************/ -/** number of gets requiring discard */ +/* number of gets requiring discard */ static ssize_t pool_stats_print_block_map_discard_required(struct vdo_statistics *stats, char *buf) { @@ -679,8 +634,7 @@ static struct pool_stats_attribute pool_stats_attr_block_map_discard_required = .print = pool_stats_print_block_map_discard_required, }; -/**********************************************************************/ -/** number of gets enqueued for their page */ +/* number of gets enqueued for their page */ static ssize_t pool_stats_print_block_map_wait_for_page(struct vdo_statistics *stats, char *buf) { @@ -692,8 +646,7 @@ static struct pool_stats_attribute pool_stats_attr_block_map_wait_for_page = { .print = pool_stats_print_block_map_wait_for_page, }; -/**********************************************************************/ -/** number of gets that have to fetch */ +/* number of gets that have to fetch */ static ssize_t pool_stats_print_block_map_fetch_required(struct vdo_statistics *stats, char *buf) { @@ -705,8 +658,7 @@ static struct pool_stats_attribute pool_stats_attr_block_map_fetch_required = { .print = pool_stats_print_block_map_fetch_required, }; -/**********************************************************************/ -/** number of page fetches */ +/* number of page fetches */ static ssize_t pool_stats_print_block_map_pages_loaded(struct vdo_statistics *stats, char *buf) { @@ -718,8 +670,7 @@ static struct pool_stats_attribute pool_stats_attr_block_map_pages_loaded = { .print = pool_stats_print_block_map_pages_loaded, }; -/**********************************************************************/ -/** number of page saves */ +/* number of page saves */ static ssize_t pool_stats_print_block_map_pages_saved(struct vdo_statistics *stats, char *buf) { @@ -731,8 +682,7 @@ static struct pool_stats_attribute pool_stats_attr_block_map_pages_saved = { .print = pool_stats_print_block_map_pages_saved, }; -/**********************************************************************/ -/** the number of flushes issued */ +/* the number of flushes issued */ static ssize_t pool_stats_print_block_map_flush_count(struct vdo_statistics 
*stats, char *buf) { @@ -744,8 +694,7 @@ static struct pool_stats_attribute pool_stats_attr_block_map_flush_count = { .print = pool_stats_print_block_map_flush_count, }; -/**********************************************************************/ -/** Number of times the UDS advice proved correct */ +/* Number of times the UDS advice proved correct */ static ssize_t pool_stats_print_hash_lock_dedupe_advice_valid(struct vdo_statistics *stats, char *buf) { @@ -757,8 +706,7 @@ static struct pool_stats_attribute pool_stats_attr_hash_lock_dedupe_advice_valid .print = pool_stats_print_hash_lock_dedupe_advice_valid, }; -/**********************************************************************/ -/** Number of times the UDS advice proved incorrect */ +/* Number of times the UDS advice proved incorrect */ static ssize_t pool_stats_print_hash_lock_dedupe_advice_stale(struct vdo_statistics *stats, char *buf) { @@ -770,8 +718,7 @@ static struct pool_stats_attribute pool_stats_attr_hash_lock_dedupe_advice_stale .print = pool_stats_print_hash_lock_dedupe_advice_stale, }; -/**********************************************************************/ -/** Number of writes with the same data as another in-flight write */ +/* Number of writes with the same data as another in-flight write */ static ssize_t pool_stats_print_hash_lock_concurrent_data_matches(struct vdo_statistics *stats, char *buf) { @@ -783,8 +730,7 @@ static struct pool_stats_attribute pool_stats_attr_hash_lock_concurrent_data_mat .print = pool_stats_print_hash_lock_concurrent_data_matches, }; -/**********************************************************************/ -/** Number of writes whose hash collided with an in-flight write */ +/* Number of writes whose hash collided with an in-flight write */ static ssize_t pool_stats_print_hash_lock_concurrent_hash_collisions(struct vdo_statistics *stats, char *buf) { @@ -796,8 +742,7 @@ static struct pool_stats_attribute pool_stats_attr_hash_lock_concurrent_hash_col .print = pool_stats_print_hash_lock_concurrent_hash_collisions, }; -/**********************************************************************/ -/** number of times VDO got an invalid dedupe advice PBN from UDS */ +/* number of times VDO got an invalid dedupe advice PBN from UDS */ static ssize_t pool_stats_print_errors_invalid_advice_pbn_count(struct vdo_statistics *stats, char *buf) { @@ -809,8 +754,7 @@ static struct pool_stats_attribute pool_stats_attr_errors_invalid_advice_pbn_cou .print = pool_stats_print_errors_invalid_advice_pbn_count, }; -/**********************************************************************/ -/** number of times a VIO completed with a VDO_NO_SPACE error */ +/* number of times a VIO completed with a VDO_NO_SPACE error */ static ssize_t pool_stats_print_errors_no_space_error_count(struct vdo_statistics *stats, char *buf) { @@ -822,8 +766,7 @@ static struct pool_stats_attribute pool_stats_attr_errors_no_space_error_count = .print = pool_stats_print_errors_no_space_error_count, }; -/**********************************************************************/ -/** number of times a VIO completed with a VDO_READ_ONLY error */ +/* number of times a VIO completed with a VDO_READ_ONLY error */ static ssize_t pool_stats_print_errors_read_only_error_count(struct vdo_statistics *stats, char *buf) { @@ -835,8 +778,7 @@ static struct pool_stats_attribute pool_stats_attr_errors_read_only_error_count .print = pool_stats_print_errors_read_only_error_count, }; -/**********************************************************************/ -/** The VDO 
instance */ +/* The VDO instance */ static ssize_t pool_stats_print_instance(struct vdo_statistics *stats, char *buf) { @@ -848,8 +790,7 @@ static struct pool_stats_attribute pool_stats_attr_instance = { .print = pool_stats_print_instance, }; -/**********************************************************************/ -/** Current number of active VIOs */ +/* Current number of active VIOs */ static ssize_t pool_stats_print_current_vios_in_progress(struct vdo_statistics *stats, char *buf) { @@ -861,8 +802,7 @@ static struct pool_stats_attribute pool_stats_attr_current_vios_in_progress = { .print = pool_stats_print_current_vios_in_progress, }; -/**********************************************************************/ -/** Maximum number of active VIOs */ +/* Maximum number of active VIOs */ static ssize_t pool_stats_print_max_vios(struct vdo_statistics *stats, char *buf) { @@ -874,8 +814,7 @@ static struct pool_stats_attribute pool_stats_attr_max_vios = { .print = pool_stats_print_max_vios, }; -/**********************************************************************/ -/** Number of times the UDS index was too slow in responding */ +/* Number of times the UDS index was too slow in responding */ static ssize_t pool_stats_print_dedupe_advice_timeouts(struct vdo_statistics *stats, char *buf) { @@ -887,8 +826,7 @@ static struct pool_stats_attribute pool_stats_attr_dedupe_advice_timeouts = { .print = pool_stats_print_dedupe_advice_timeouts, }; -/**********************************************************************/ -/** Number of flush requests submitted to the storage device */ +/* Number of flush requests submitted to the storage device */ static ssize_t pool_stats_print_flush_out(struct vdo_statistics *stats, char *buf) { @@ -900,8 +838,7 @@ static struct pool_stats_attribute pool_stats_attr_flush_out = { .print = pool_stats_print_flush_out, }; -/**********************************************************************/ -/** Logical block size */ +/* Logical block size */ static ssize_t pool_stats_print_logical_block_size(struct vdo_statistics *stats, char *buf) { @@ -913,8 +850,7 @@ static struct pool_stats_attribute pool_stats_attr_logical_block_size = { .print = pool_stats_print_logical_block_size, }; -/**********************************************************************/ -/** Number of REQ_OP_READ bios */ +/* Number of REQ_OP_READ bios */ static ssize_t pool_stats_print_bios_in_read(struct vdo_statistics *stats, char *buf) { @@ -926,8 +862,7 @@ static struct pool_stats_attribute pool_stats_attr_bios_in_read = { .print = pool_stats_print_bios_in_read, }; -/**********************************************************************/ -/** Number of REQ_OP_WRITE bios with data */ +/* Number of REQ_OP_WRITE bios with data */ static ssize_t pool_stats_print_bios_in_write(struct vdo_statistics *stats, char *buf) { @@ -939,8 +874,7 @@ static struct pool_stats_attribute pool_stats_attr_bios_in_write = { .print = pool_stats_print_bios_in_write, }; -/**********************************************************************/ -/** Number of bios tagged with REQ_PREFLUSH and containing no data */ +/* Number of bios tagged with REQ_PREFLUSH and containing no data */ static ssize_t pool_stats_print_bios_in_empty_flush(struct vdo_statistics *stats, char *buf) { @@ -952,8 +886,7 @@ static struct pool_stats_attribute pool_stats_attr_bios_in_empty_flush = { .print = pool_stats_print_bios_in_empty_flush, }; -/**********************************************************************/ -/** Number of REQ_OP_DISCARD bios */ +/* 
Number of REQ_OP_DISCARD bios */ static ssize_t pool_stats_print_bios_in_discard(struct vdo_statistics *stats, char *buf) { @@ -965,8 +898,7 @@ static struct pool_stats_attribute pool_stats_attr_bios_in_discard = { .print = pool_stats_print_bios_in_discard, }; -/**********************************************************************/ -/** Number of bios tagged with REQ_PREFLUSH */ +/* Number of bios tagged with REQ_PREFLUSH */ static ssize_t pool_stats_print_bios_in_flush(struct vdo_statistics *stats, char *buf) { @@ -978,8 +910,7 @@ static struct pool_stats_attribute pool_stats_attr_bios_in_flush = { .print = pool_stats_print_bios_in_flush, }; -/**********************************************************************/ -/** Number of bios tagged with REQ_FUA */ +/* Number of bios tagged with REQ_FUA */ static ssize_t pool_stats_print_bios_in_fua(struct vdo_statistics *stats, char *buf) { @@ -991,8 +922,7 @@ static struct pool_stats_attribute pool_stats_attr_bios_in_fua = { .print = pool_stats_print_bios_in_fua, }; -/**********************************************************************/ -/** Number of REQ_OP_READ bios */ +/* Number of REQ_OP_READ bios */ static ssize_t pool_stats_print_bios_in_partial_read(struct vdo_statistics *stats, char *buf) { @@ -1004,8 +934,7 @@ static struct pool_stats_attribute pool_stats_attr_bios_in_partial_read = { .print = pool_stats_print_bios_in_partial_read, }; -/**********************************************************************/ -/** Number of REQ_OP_WRITE bios with data */ +/* Number of REQ_OP_WRITE bios with data */ static ssize_t pool_stats_print_bios_in_partial_write(struct vdo_statistics *stats, char *buf) { @@ -1017,8 +946,7 @@ static struct pool_stats_attribute pool_stats_attr_bios_in_partial_write = { .print = pool_stats_print_bios_in_partial_write, }; -/**********************************************************************/ -/** Number of bios tagged with REQ_PREFLUSH and containing no data */ +/* Number of bios tagged with REQ_PREFLUSH and containing no data */ static ssize_t pool_stats_print_bios_in_partial_empty_flush(struct vdo_statistics *stats, char *buf) { @@ -1030,8 +958,7 @@ static struct pool_stats_attribute pool_stats_attr_bios_in_partial_empty_flush = .print = pool_stats_print_bios_in_partial_empty_flush, }; -/**********************************************************************/ -/** Number of REQ_OP_DISCARD bios */ +/* Number of REQ_OP_DISCARD bios */ static ssize_t pool_stats_print_bios_in_partial_discard(struct vdo_statistics *stats, char *buf) { @@ -1043,8 +970,7 @@ static struct pool_stats_attribute pool_stats_attr_bios_in_partial_discard = { .print = pool_stats_print_bios_in_partial_discard, }; -/**********************************************************************/ -/** Number of bios tagged with REQ_PREFLUSH */ +/* Number of bios tagged with REQ_PREFLUSH */ static ssize_t pool_stats_print_bios_in_partial_flush(struct vdo_statistics *stats, char *buf) { @@ -1056,8 +982,7 @@ static struct pool_stats_attribute pool_stats_attr_bios_in_partial_flush = { .print = pool_stats_print_bios_in_partial_flush, }; -/**********************************************************************/ -/** Number of bios tagged with REQ_FUA */ +/* Number of bios tagged with REQ_FUA */ static ssize_t pool_stats_print_bios_in_partial_fua(struct vdo_statistics *stats, char *buf) { @@ -1069,8 +994,7 @@ static struct pool_stats_attribute pool_stats_attr_bios_in_partial_fua = { .print = pool_stats_print_bios_in_partial_fua, }; 
-/**********************************************************************/ -/** Number of REQ_OP_READ bios */ +/* Number of REQ_OP_READ bios */ static ssize_t pool_stats_print_bios_out_read(struct vdo_statistics *stats, char *buf) { @@ -1082,8 +1006,7 @@ static struct pool_stats_attribute pool_stats_attr_bios_out_read = { .print = pool_stats_print_bios_out_read, }; -/**********************************************************************/ -/** Number of REQ_OP_WRITE bios with data */ +/* Number of REQ_OP_WRITE bios with data */ static ssize_t pool_stats_print_bios_out_write(struct vdo_statistics *stats, char *buf) { @@ -1095,8 +1018,7 @@ static struct pool_stats_attribute pool_stats_attr_bios_out_write = { .print = pool_stats_print_bios_out_write, }; -/**********************************************************************/ -/** Number of bios tagged with REQ_PREFLUSH and containing no data */ +/* Number of bios tagged with REQ_PREFLUSH and containing no data */ static ssize_t pool_stats_print_bios_out_empty_flush(struct vdo_statistics *stats, char *buf) { @@ -1108,8 +1030,7 @@ static struct pool_stats_attribute pool_stats_attr_bios_out_empty_flush = { .print = pool_stats_print_bios_out_empty_flush, }; -/**********************************************************************/ -/** Number of REQ_OP_DISCARD bios */ +/* Number of REQ_OP_DISCARD bios */ static ssize_t pool_stats_print_bios_out_discard(struct vdo_statistics *stats, char *buf) { @@ -1121,8 +1042,7 @@ static struct pool_stats_attribute pool_stats_attr_bios_out_discard = { .print = pool_stats_print_bios_out_discard, }; -/**********************************************************************/ -/** Number of bios tagged with REQ_PREFLUSH */ +/* Number of bios tagged with REQ_PREFLUSH */ static ssize_t pool_stats_print_bios_out_flush(struct vdo_statistics *stats, char *buf) { @@ -1134,8 +1054,7 @@ static struct pool_stats_attribute pool_stats_attr_bios_out_flush = { .print = pool_stats_print_bios_out_flush, }; -/**********************************************************************/ -/** Number of bios tagged with REQ_FUA */ +/* Number of bios tagged with REQ_FUA */ static ssize_t pool_stats_print_bios_out_fua(struct vdo_statistics *stats, char *buf) { @@ -1147,8 +1066,7 @@ static struct pool_stats_attribute pool_stats_attr_bios_out_fua = { .print = pool_stats_print_bios_out_fua, }; -/**********************************************************************/ -/** Number of REQ_OP_READ bios */ +/* Number of REQ_OP_READ bios */ static ssize_t pool_stats_print_bios_meta_read(struct vdo_statistics *stats, char *buf) { @@ -1160,8 +1078,7 @@ static struct pool_stats_attribute pool_stats_attr_bios_meta_read = { .print = pool_stats_print_bios_meta_read, }; -/**********************************************************************/ -/** Number of REQ_OP_WRITE bios with data */ +/* Number of REQ_OP_WRITE bios with data */ static ssize_t pool_stats_print_bios_meta_write(struct vdo_statistics *stats, char *buf) { @@ -1173,8 +1090,7 @@ static struct pool_stats_attribute pool_stats_attr_bios_meta_write = { .print = pool_stats_print_bios_meta_write, }; -/**********************************************************************/ -/** Number of bios tagged with REQ_PREFLUSH and containing no data */ +/* Number of bios tagged with REQ_PREFLUSH and containing no data */ static ssize_t pool_stats_print_bios_meta_empty_flush(struct vdo_statistics *stats, char *buf) { @@ -1186,8 +1102,7 @@ static struct pool_stats_attribute pool_stats_attr_bios_meta_empty_flush = { 
.print = pool_stats_print_bios_meta_empty_flush, }; -/**********************************************************************/ -/** Number of REQ_OP_DISCARD bios */ +/* Number of REQ_OP_DISCARD bios */ static ssize_t pool_stats_print_bios_meta_discard(struct vdo_statistics *stats, char *buf) { @@ -1199,8 +1114,7 @@ static struct pool_stats_attribute pool_stats_attr_bios_meta_discard = { .print = pool_stats_print_bios_meta_discard, }; -/**********************************************************************/ -/** Number of bios tagged with REQ_PREFLUSH */ +/* Number of bios tagged with REQ_PREFLUSH */ static ssize_t pool_stats_print_bios_meta_flush(struct vdo_statistics *stats, char *buf) { @@ -1212,8 +1126,7 @@ static struct pool_stats_attribute pool_stats_attr_bios_meta_flush = { .print = pool_stats_print_bios_meta_flush, }; -/**********************************************************************/ -/** Number of bios tagged with REQ_FUA */ +/* Number of bios tagged with REQ_FUA */ static ssize_t pool_stats_print_bios_meta_fua(struct vdo_statistics *stats, char *buf) { @@ -1225,8 +1138,7 @@ static struct pool_stats_attribute pool_stats_attr_bios_meta_fua = { .print = pool_stats_print_bios_meta_fua, }; -/**********************************************************************/ -/** Number of REQ_OP_READ bios */ +/* Number of REQ_OP_READ bios */ static ssize_t pool_stats_print_bios_journal_read(struct vdo_statistics *stats, char *buf) { @@ -1238,8 +1150,7 @@ static struct pool_stats_attribute pool_stats_attr_bios_journal_read = { .print = pool_stats_print_bios_journal_read, }; -/**********************************************************************/ -/** Number of REQ_OP_WRITE bios with data */ +/* Number of REQ_OP_WRITE bios with data */ static ssize_t pool_stats_print_bios_journal_write(struct vdo_statistics *stats, char *buf) { @@ -1251,8 +1162,7 @@ static struct pool_stats_attribute pool_stats_attr_bios_journal_write = { .print = pool_stats_print_bios_journal_write, }; -/**********************************************************************/ -/** Number of bios tagged with REQ_PREFLUSH and containing no data */ +/* Number of bios tagged with REQ_PREFLUSH and containing no data */ static ssize_t pool_stats_print_bios_journal_empty_flush(struct vdo_statistics *stats, char *buf) { @@ -1264,8 +1174,7 @@ static struct pool_stats_attribute pool_stats_attr_bios_journal_empty_flush = { .print = pool_stats_print_bios_journal_empty_flush, }; -/**********************************************************************/ -/** Number of REQ_OP_DISCARD bios */ +/* Number of REQ_OP_DISCARD bios */ static ssize_t pool_stats_print_bios_journal_discard(struct vdo_statistics *stats, char *buf) { @@ -1277,8 +1186,7 @@ static struct pool_stats_attribute pool_stats_attr_bios_journal_discard = { .print = pool_stats_print_bios_journal_discard, }; -/**********************************************************************/ -/** Number of bios tagged with REQ_PREFLUSH */ +/* Number of bios tagged with REQ_PREFLUSH */ static ssize_t pool_stats_print_bios_journal_flush(struct vdo_statistics *stats, char *buf) { @@ -1290,8 +1198,7 @@ static struct pool_stats_attribute pool_stats_attr_bios_journal_flush = { .print = pool_stats_print_bios_journal_flush, }; -/**********************************************************************/ -/** Number of bios tagged with REQ_FUA */ +/* Number of bios tagged with REQ_FUA */ static ssize_t pool_stats_print_bios_journal_fua(struct vdo_statistics *stats, char *buf) { @@ -1303,8 +1210,7 @@ 
static struct pool_stats_attribute pool_stats_attr_bios_journal_fua = { .print = pool_stats_print_bios_journal_fua, }; -/**********************************************************************/ -/** Number of REQ_OP_READ bios */ +/* Number of REQ_OP_READ bios */ static ssize_t pool_stats_print_bios_page_cache_read(struct vdo_statistics *stats, char *buf) { @@ -1316,8 +1222,7 @@ static struct pool_stats_attribute pool_stats_attr_bios_page_cache_read = { .print = pool_stats_print_bios_page_cache_read, }; -/**********************************************************************/ -/** Number of REQ_OP_WRITE bios with data */ +/* Number of REQ_OP_WRITE bios with data */ static ssize_t pool_stats_print_bios_page_cache_write(struct vdo_statistics *stats, char *buf) { @@ -1329,8 +1234,7 @@ static struct pool_stats_attribute pool_stats_attr_bios_page_cache_write = { .print = pool_stats_print_bios_page_cache_write, }; -/**********************************************************************/ -/** Number of bios tagged with REQ_PREFLUSH and containing no data */ +/* Number of bios tagged with REQ_PREFLUSH and containing no data */ static ssize_t pool_stats_print_bios_page_cache_empty_flush(struct vdo_statistics *stats, char *buf) { @@ -1342,8 +1246,7 @@ static struct pool_stats_attribute pool_stats_attr_bios_page_cache_empty_flush = .print = pool_stats_print_bios_page_cache_empty_flush, }; -/**********************************************************************/ -/** Number of REQ_OP_DISCARD bios */ +/* Number of REQ_OP_DISCARD bios */ static ssize_t pool_stats_print_bios_page_cache_discard(struct vdo_statistics *stats, char *buf) { @@ -1355,8 +1258,7 @@ static struct pool_stats_attribute pool_stats_attr_bios_page_cache_discard = { .print = pool_stats_print_bios_page_cache_discard, }; -/**********************************************************************/ -/** Number of bios tagged with REQ_PREFLUSH */ +/* Number of bios tagged with REQ_PREFLUSH */ static ssize_t pool_stats_print_bios_page_cache_flush(struct vdo_statistics *stats, char *buf) { @@ -1368,8 +1270,7 @@ static struct pool_stats_attribute pool_stats_attr_bios_page_cache_flush = { .print = pool_stats_print_bios_page_cache_flush, }; -/**********************************************************************/ -/** Number of bios tagged with REQ_FUA */ +/* Number of bios tagged with REQ_FUA */ static ssize_t pool_stats_print_bios_page_cache_fua(struct vdo_statistics *stats, char *buf) { @@ -1381,8 +1282,7 @@ static struct pool_stats_attribute pool_stats_attr_bios_page_cache_fua = { .print = pool_stats_print_bios_page_cache_fua, }; -/**********************************************************************/ -/** Number of REQ_OP_READ bios */ +/* Number of REQ_OP_READ bios */ static ssize_t pool_stats_print_bios_out_completed_read(struct vdo_statistics *stats, char *buf) { @@ -1394,8 +1294,7 @@ static struct pool_stats_attribute pool_stats_attr_bios_out_completed_read = { .print = pool_stats_print_bios_out_completed_read, }; -/**********************************************************************/ -/** Number of REQ_OP_WRITE bios with data */ +/* Number of REQ_OP_WRITE bios with data */ static ssize_t pool_stats_print_bios_out_completed_write(struct vdo_statistics *stats, char *buf) { @@ -1407,8 +1306,7 @@ static struct pool_stats_attribute pool_stats_attr_bios_out_completed_write = { .print = pool_stats_print_bios_out_completed_write, }; -/**********************************************************************/ -/** Number of bios tagged with 
REQ_PREFLUSH and containing no data */ +/* Number of bios tagged with REQ_PREFLUSH and containing no data */ static ssize_t pool_stats_print_bios_out_completed_empty_flush(struct vdo_statistics *stats, char *buf) { @@ -1420,8 +1318,7 @@ static struct pool_stats_attribute pool_stats_attr_bios_out_completed_empty_flus .print = pool_stats_print_bios_out_completed_empty_flush, }; -/**********************************************************************/ -/** Number of REQ_OP_DISCARD bios */ +/* Number of REQ_OP_DISCARD bios */ static ssize_t pool_stats_print_bios_out_completed_discard(struct vdo_statistics *stats, char *buf) { @@ -1433,8 +1330,7 @@ static struct pool_stats_attribute pool_stats_attr_bios_out_completed_discard = .print = pool_stats_print_bios_out_completed_discard, }; -/**********************************************************************/ -/** Number of bios tagged with REQ_PREFLUSH */ +/* Number of bios tagged with REQ_PREFLUSH */ static ssize_t pool_stats_print_bios_out_completed_flush(struct vdo_statistics *stats, char *buf) { @@ -1446,8 +1342,7 @@ static struct pool_stats_attribute pool_stats_attr_bios_out_completed_flush = { .print = pool_stats_print_bios_out_completed_flush, }; -/**********************************************************************/ -/** Number of bios tagged with REQ_FUA */ +/* Number of bios tagged with REQ_FUA */ static ssize_t pool_stats_print_bios_out_completed_fua(struct vdo_statistics *stats, char *buf) { @@ -1459,8 +1354,7 @@ static struct pool_stats_attribute pool_stats_attr_bios_out_completed_fua = { .print = pool_stats_print_bios_out_completed_fua, }; -/**********************************************************************/ -/** Number of REQ_OP_READ bios */ +/* Number of REQ_OP_READ bios */ static ssize_t pool_stats_print_bios_meta_completed_read(struct vdo_statistics *stats, char *buf) { @@ -1472,8 +1366,7 @@ static struct pool_stats_attribute pool_stats_attr_bios_meta_completed_read = { .print = pool_stats_print_bios_meta_completed_read, }; -/**********************************************************************/ -/** Number of REQ_OP_WRITE bios with data */ +/* Number of REQ_OP_WRITE bios with data */ static ssize_t pool_stats_print_bios_meta_completed_write(struct vdo_statistics *stats, char *buf) { @@ -1485,8 +1378,7 @@ static struct pool_stats_attribute pool_stats_attr_bios_meta_completed_write = { .print = pool_stats_print_bios_meta_completed_write, }; -/**********************************************************************/ -/** Number of bios tagged with REQ_PREFLUSH and containing no data */ +/* Number of bios tagged with REQ_PREFLUSH and containing no data */ static ssize_t pool_stats_print_bios_meta_completed_empty_flush(struct vdo_statistics *stats, char *buf) { @@ -1498,8 +1390,7 @@ static struct pool_stats_attribute pool_stats_attr_bios_meta_completed_empty_flu .print = pool_stats_print_bios_meta_completed_empty_flush, }; -/**********************************************************************/ -/** Number of REQ_OP_DISCARD bios */ +/* Number of REQ_OP_DISCARD bios */ static ssize_t pool_stats_print_bios_meta_completed_discard(struct vdo_statistics *stats, char *buf) { @@ -1511,8 +1402,7 @@ static struct pool_stats_attribute pool_stats_attr_bios_meta_completed_discard = .print = pool_stats_print_bios_meta_completed_discard, }; -/**********************************************************************/ -/** Number of bios tagged with REQ_PREFLUSH */ +/* Number of bios tagged with REQ_PREFLUSH */ static ssize_t 
pool_stats_print_bios_meta_completed_flush(struct vdo_statistics *stats, char *buf) { @@ -1524,8 +1414,7 @@ static struct pool_stats_attribute pool_stats_attr_bios_meta_completed_flush = { .print = pool_stats_print_bios_meta_completed_flush, }; -/**********************************************************************/ -/** Number of bios tagged with REQ_FUA */ +/* Number of bios tagged with REQ_FUA */ static ssize_t pool_stats_print_bios_meta_completed_fua(struct vdo_statistics *stats, char *buf) { @@ -1537,8 +1426,7 @@ static struct pool_stats_attribute pool_stats_attr_bios_meta_completed_fua = { .print = pool_stats_print_bios_meta_completed_fua, }; -/**********************************************************************/ -/** Number of REQ_OP_READ bios */ +/* Number of REQ_OP_READ bios */ static ssize_t pool_stats_print_bios_journal_completed_read(struct vdo_statistics *stats, char *buf) { @@ -1550,8 +1438,7 @@ static struct pool_stats_attribute pool_stats_attr_bios_journal_completed_read = .print = pool_stats_print_bios_journal_completed_read, }; -/**********************************************************************/ -/** Number of REQ_OP_WRITE bios with data */ +/* Number of REQ_OP_WRITE bios with data */ static ssize_t pool_stats_print_bios_journal_completed_write(struct vdo_statistics *stats, char *buf) { @@ -1563,8 +1450,7 @@ static struct pool_stats_attribute pool_stats_attr_bios_journal_completed_write .print = pool_stats_print_bios_journal_completed_write, }; -/**********************************************************************/ -/** Number of bios tagged with REQ_PREFLUSH and containing no data */ +/* Number of bios tagged with REQ_PREFLUSH and containing no data */ static ssize_t pool_stats_print_bios_journal_completed_empty_flush(struct vdo_statistics *stats, char *buf) { @@ -1576,8 +1462,7 @@ static struct pool_stats_attribute pool_stats_attr_bios_journal_completed_empty_ .print = pool_stats_print_bios_journal_completed_empty_flush, }; -/**********************************************************************/ -/** Number of REQ_OP_DISCARD bios */ +/* Number of REQ_OP_DISCARD bios */ static ssize_t pool_stats_print_bios_journal_completed_discard(struct vdo_statistics *stats, char *buf) { @@ -1589,8 +1474,7 @@ static struct pool_stats_attribute pool_stats_attr_bios_journal_completed_discar .print = pool_stats_print_bios_journal_completed_discard, }; -/**********************************************************************/ -/** Number of bios tagged with REQ_PREFLUSH */ +/* Number of bios tagged with REQ_PREFLUSH */ static ssize_t pool_stats_print_bios_journal_completed_flush(struct vdo_statistics *stats, char *buf) { @@ -1602,8 +1486,7 @@ static struct pool_stats_attribute pool_stats_attr_bios_journal_completed_flush .print = pool_stats_print_bios_journal_completed_flush, }; -/**********************************************************************/ -/** Number of bios tagged with REQ_FUA */ +/* Number of bios tagged with REQ_FUA */ static ssize_t pool_stats_print_bios_journal_completed_fua(struct vdo_statistics *stats, char *buf) { @@ -1615,8 +1498,7 @@ static struct pool_stats_attribute pool_stats_attr_bios_journal_completed_fua = .print = pool_stats_print_bios_journal_completed_fua, }; -/**********************************************************************/ -/** Number of REQ_OP_READ bios */ +/* Number of REQ_OP_READ bios */ static ssize_t pool_stats_print_bios_page_cache_completed_read(struct vdo_statistics *stats, char *buf) { @@ -1628,8 +1510,7 @@ static struct 
pool_stats_attribute pool_stats_attr_bios_page_cache_completed_rea .print = pool_stats_print_bios_page_cache_completed_read, }; -/**********************************************************************/ -/** Number of REQ_OP_WRITE bios with data */ +/* Number of REQ_OP_WRITE bios with data */ static ssize_t pool_stats_print_bios_page_cache_completed_write(struct vdo_statistics *stats, char *buf) { @@ -1641,8 +1522,7 @@ static struct pool_stats_attribute pool_stats_attr_bios_page_cache_completed_wri .print = pool_stats_print_bios_page_cache_completed_write, }; -/**********************************************************************/ -/** Number of bios tagged with REQ_PREFLUSH and containing no data */ +/* Number of bios tagged with REQ_PREFLUSH and containing no data */ static ssize_t pool_stats_print_bios_page_cache_completed_empty_flush(struct vdo_statistics *stats, char *buf) { @@ -1654,8 +1534,7 @@ static struct pool_stats_attribute pool_stats_attr_bios_page_cache_completed_emp .print = pool_stats_print_bios_page_cache_completed_empty_flush, }; -/**********************************************************************/ -/** Number of REQ_OP_DISCARD bios */ +/* Number of REQ_OP_DISCARD bios */ static ssize_t pool_stats_print_bios_page_cache_completed_discard(struct vdo_statistics *stats, char *buf) { @@ -1667,8 +1546,7 @@ static struct pool_stats_attribute pool_stats_attr_bios_page_cache_completed_dis .print = pool_stats_print_bios_page_cache_completed_discard, }; -/**********************************************************************/ -/** Number of bios tagged with REQ_PREFLUSH */ +/* Number of bios tagged with REQ_PREFLUSH */ static ssize_t pool_stats_print_bios_page_cache_completed_flush(struct vdo_statistics *stats, char *buf) { @@ -1680,8 +1558,7 @@ static struct pool_stats_attribute pool_stats_attr_bios_page_cache_completed_flu .print = pool_stats_print_bios_page_cache_completed_flush, }; -/**********************************************************************/ -/** Number of bios tagged with REQ_FUA */ +/* Number of bios tagged with REQ_FUA */ static ssize_t pool_stats_print_bios_page_cache_completed_fua(struct vdo_statistics *stats, char *buf) { @@ -1693,8 +1570,7 @@ static struct pool_stats_attribute pool_stats_attr_bios_page_cache_completed_fua .print = pool_stats_print_bios_page_cache_completed_fua, }; -/**********************************************************************/ -/** Number of REQ_OP_READ bios */ +/* Number of REQ_OP_READ bios */ static ssize_t pool_stats_print_bios_acknowledged_read(struct vdo_statistics *stats, char *buf) { @@ -1706,8 +1582,7 @@ static struct pool_stats_attribute pool_stats_attr_bios_acknowledged_read = { .print = pool_stats_print_bios_acknowledged_read, }; -/**********************************************************************/ -/** Number of REQ_OP_WRITE bios with data */ +/* Number of REQ_OP_WRITE bios with data */ static ssize_t pool_stats_print_bios_acknowledged_write(struct vdo_statistics *stats, char *buf) { @@ -1719,8 +1594,7 @@ static struct pool_stats_attribute pool_stats_attr_bios_acknowledged_write = { .print = pool_stats_print_bios_acknowledged_write, }; -/**********************************************************************/ -/** Number of bios tagged with REQ_PREFLUSH and containing no data */ +/* Number of bios tagged with REQ_PREFLUSH and containing no data */ static ssize_t pool_stats_print_bios_acknowledged_empty_flush(struct vdo_statistics *stats, char *buf) { @@ -1732,8 +1606,7 @@ static struct pool_stats_attribute 
pool_stats_attr_bios_acknowledged_empty_flush .print = pool_stats_print_bios_acknowledged_empty_flush, }; -/**********************************************************************/ -/** Number of REQ_OP_DISCARD bios */ +/* Number of REQ_OP_DISCARD bios */ static ssize_t pool_stats_print_bios_acknowledged_discard(struct vdo_statistics *stats, char *buf) { @@ -1745,8 +1618,7 @@ static struct pool_stats_attribute pool_stats_attr_bios_acknowledged_discard = { .print = pool_stats_print_bios_acknowledged_discard, }; -/**********************************************************************/ -/** Number of bios tagged with REQ_PREFLUSH */ +/* Number of bios tagged with REQ_PREFLUSH */ static ssize_t pool_stats_print_bios_acknowledged_flush(struct vdo_statistics *stats, char *buf) { @@ -1758,8 +1630,7 @@ static struct pool_stats_attribute pool_stats_attr_bios_acknowledged_flush = { .print = pool_stats_print_bios_acknowledged_flush, }; -/**********************************************************************/ -/** Number of bios tagged with REQ_FUA */ +/* Number of bios tagged with REQ_FUA */ static ssize_t pool_stats_print_bios_acknowledged_fua(struct vdo_statistics *stats, char *buf) { @@ -1771,8 +1642,7 @@ static struct pool_stats_attribute pool_stats_attr_bios_acknowledged_fua = { .print = pool_stats_print_bios_acknowledged_fua, }; -/**********************************************************************/ -/** Number of REQ_OP_READ bios */ +/* Number of REQ_OP_READ bios */ static ssize_t pool_stats_print_bios_acknowledged_partial_read(struct vdo_statistics *stats, char *buf) { @@ -1784,8 +1654,7 @@ static struct pool_stats_attribute pool_stats_attr_bios_acknowledged_partial_rea .print = pool_stats_print_bios_acknowledged_partial_read, }; -/**********************************************************************/ -/** Number of REQ_OP_WRITE bios with data */ +/* Number of REQ_OP_WRITE bios with data */ static ssize_t pool_stats_print_bios_acknowledged_partial_write(struct vdo_statistics *stats, char *buf) { @@ -1797,8 +1666,7 @@ static struct pool_stats_attribute pool_stats_attr_bios_acknowledged_partial_wri .print = pool_stats_print_bios_acknowledged_partial_write, }; -/**********************************************************************/ -/** Number of bios tagged with REQ_PREFLUSH and containing no data */ +/* Number of bios tagged with REQ_PREFLUSH and containing no data */ static ssize_t pool_stats_print_bios_acknowledged_partial_empty_flush(struct vdo_statistics *stats, char *buf) { @@ -1810,8 +1678,7 @@ static struct pool_stats_attribute pool_stats_attr_bios_acknowledged_partial_emp .print = pool_stats_print_bios_acknowledged_partial_empty_flush, }; -/**********************************************************************/ -/** Number of REQ_OP_DISCARD bios */ +/* Number of REQ_OP_DISCARD bios */ static ssize_t pool_stats_print_bios_acknowledged_partial_discard(struct vdo_statistics *stats, char *buf) { @@ -1823,8 +1690,7 @@ static struct pool_stats_attribute pool_stats_attr_bios_acknowledged_partial_dis .print = pool_stats_print_bios_acknowledged_partial_discard, }; -/**********************************************************************/ -/** Number of bios tagged with REQ_PREFLUSH */ +/* Number of bios tagged with REQ_PREFLUSH */ static ssize_t pool_stats_print_bios_acknowledged_partial_flush(struct vdo_statistics *stats, char *buf) { @@ -1836,8 +1702,7 @@ static struct pool_stats_attribute pool_stats_attr_bios_acknowledged_partial_flu .print = 
pool_stats_print_bios_acknowledged_partial_flush, }; -/**********************************************************************/ -/** Number of bios tagged with REQ_FUA */ +/* Number of bios tagged with REQ_FUA */ static ssize_t pool_stats_print_bios_acknowledged_partial_fua(struct vdo_statistics *stats, char *buf) { @@ -1849,8 +1714,7 @@ static struct pool_stats_attribute pool_stats_attr_bios_acknowledged_partial_fua .print = pool_stats_print_bios_acknowledged_partial_fua, }; -/**********************************************************************/ -/** Number of REQ_OP_READ bios */ +/* Number of REQ_OP_READ bios */ static ssize_t pool_stats_print_bios_in_progress_read(struct vdo_statistics *stats, char *buf) { @@ -1862,8 +1726,7 @@ static struct pool_stats_attribute pool_stats_attr_bios_in_progress_read = { .print = pool_stats_print_bios_in_progress_read, }; -/**********************************************************************/ -/** Number of REQ_OP_WRITE bios with data */ +/* Number of REQ_OP_WRITE bios with data */ static ssize_t pool_stats_print_bios_in_progress_write(struct vdo_statistics *stats, char *buf) { @@ -1875,8 +1738,7 @@ static struct pool_stats_attribute pool_stats_attr_bios_in_progress_write = { .print = pool_stats_print_bios_in_progress_write, }; -/**********************************************************************/ -/** Number of bios tagged with REQ_PREFLUSH and containing no data */ +/* Number of bios tagged with REQ_PREFLUSH and containing no data */ static ssize_t pool_stats_print_bios_in_progress_empty_flush(struct vdo_statistics *stats, char *buf) { @@ -1888,8 +1750,7 @@ static struct pool_stats_attribute pool_stats_attr_bios_in_progress_empty_flush .print = pool_stats_print_bios_in_progress_empty_flush, }; -/**********************************************************************/ -/** Number of REQ_OP_DISCARD bios */ +/* Number of REQ_OP_DISCARD bios */ static ssize_t pool_stats_print_bios_in_progress_discard(struct vdo_statistics *stats, char *buf) { @@ -1901,8 +1762,7 @@ static struct pool_stats_attribute pool_stats_attr_bios_in_progress_discard = { .print = pool_stats_print_bios_in_progress_discard, }; -/**********************************************************************/ -/** Number of bios tagged with REQ_PREFLUSH */ +/* Number of bios tagged with REQ_PREFLUSH */ static ssize_t pool_stats_print_bios_in_progress_flush(struct vdo_statistics *stats, char *buf) { @@ -1914,8 +1774,7 @@ static struct pool_stats_attribute pool_stats_attr_bios_in_progress_flush = { .print = pool_stats_print_bios_in_progress_flush, }; -/**********************************************************************/ -/** Number of bios tagged with REQ_FUA */ +/* Number of bios tagged with REQ_FUA */ static ssize_t pool_stats_print_bios_in_progress_fua(struct vdo_statistics *stats, char *buf) { @@ -1927,8 +1786,7 @@ static struct pool_stats_attribute pool_stats_attr_bios_in_progress_fua = { .print = pool_stats_print_bios_in_progress_fua, }; -/**********************************************************************/ -/** Tracked bytes currently allocated. */ +/* Tracked bytes currently allocated. */ static ssize_t pool_stats_print_memory_usage_bytes_used(struct vdo_statistics *stats, char *buf) { @@ -1940,8 +1798,7 @@ static struct pool_stats_attribute pool_stats_attr_memory_usage_bytes_used = { .print = pool_stats_print_memory_usage_bytes_used, }; -/**********************************************************************/ -/** Maximum tracked bytes allocated. 
*/ +/* Maximum tracked bytes allocated. */ static ssize_t pool_stats_print_memory_usage_peak_bytes_used(struct vdo_statistics *stats, char *buf) { @@ -1953,8 +1810,7 @@ static struct pool_stats_attribute pool_stats_attr_memory_usage_peak_bytes_used .print = pool_stats_print_memory_usage_peak_bytes_used, }; -/**********************************************************************/ -/** Number of chunk names stored in the index */ +/* Number of chunk names stored in the index */ static ssize_t pool_stats_print_index_entries_indexed(struct vdo_statistics *stats, char *buf) { @@ -1966,8 +1822,7 @@ static struct pool_stats_attribute pool_stats_attr_index_entries_indexed = { .print = pool_stats_print_index_entries_indexed, }; -/**********************************************************************/ -/** Number of post calls that found an existing entry */ +/* Number of post calls that found an existing entry */ static ssize_t pool_stats_print_index_posts_found(struct vdo_statistics *stats, char *buf) { @@ -1979,8 +1834,7 @@ static struct pool_stats_attribute pool_stats_attr_index_posts_found = { .print = pool_stats_print_index_posts_found, }; -/**********************************************************************/ -/** Number of post calls that added a new entry */ +/* Number of post calls that added a new entry */ static ssize_t pool_stats_print_index_posts_not_found(struct vdo_statistics *stats, char *buf) { @@ -1992,8 +1846,7 @@ static struct pool_stats_attribute pool_stats_attr_index_posts_not_found = { .print = pool_stats_print_index_posts_not_found, }; -/**********************************************************************/ -/** Number of query calls that found an existing entry */ +/* Number of query calls that found an existing entry */ static ssize_t pool_stats_print_index_queries_found(struct vdo_statistics *stats, char *buf) { @@ -2005,8 +1858,7 @@ static struct pool_stats_attribute pool_stats_attr_index_queries_found = { .print = pool_stats_print_index_queries_found, }; -/**********************************************************************/ -/** Number of query calls that added a new entry */ +/* Number of query calls that added a new entry */ static ssize_t pool_stats_print_index_queries_not_found(struct vdo_statistics *stats, char *buf) { @@ -2018,8 +1870,7 @@ static struct pool_stats_attribute pool_stats_attr_index_queries_not_found = { .print = pool_stats_print_index_queries_not_found, }; -/**********************************************************************/ -/** Number of update calls that found an existing entry */ +/* Number of update calls that found an existing entry */ static ssize_t pool_stats_print_index_updates_found(struct vdo_statistics *stats, char *buf) { @@ -2031,8 +1882,7 @@ static struct pool_stats_attribute pool_stats_attr_index_updates_found = { .print = pool_stats_print_index_updates_found, }; -/**********************************************************************/ -/** Number of update calls that added a new entry */ +/* Number of update calls that added a new entry */ static ssize_t pool_stats_print_index_updates_not_found(struct vdo_statistics *stats, char *buf) { @@ -2044,8 +1894,7 @@ static struct pool_stats_attribute pool_stats_attr_index_updates_not_found = { .print = pool_stats_print_index_updates_not_found, }; -/**********************************************************************/ -/** Current number of dedupe queries that are in flight */ +/* Current number of dedupe queries that are in flight */ static ssize_t 
pool_stats_print_index_curr_dedupe_queries(struct vdo_statistics *stats, char *buf) { @@ -2057,8 +1906,7 @@ static struct pool_stats_attribute pool_stats_attr_index_curr_dedupe_queries = { .print = pool_stats_print_index_curr_dedupe_queries, }; -/**********************************************************************/ -/** Maximum number of dedupe queries that have been in flight */ +/* Maximum number of dedupe queries that have been in flight */ static ssize_t pool_stats_print_index_max_dedupe_queries(struct vdo_statistics *stats, char *buf) { diff --git a/vdo/poolSysfs.c b/vdo/pool-sysfs.c similarity index 52% rename from vdo/poolSysfs.c rename to vdo/pool-sysfs.c index 00113be4..2d03e095 100644 --- a/vdo/poolSysfs.c +++ b/vdo/pool-sysfs.c @@ -1,39 +1,22 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * Copyright Red Hat - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/sulfur/src/c++/vdo/kernel/poolSysfs.c#13 $ */ -#include "poolSysfs.h" +#include "pool-sysfs.h" -#include "memoryAlloc.h" +#include "memory-alloc.h" +#include "data-vio-pool.h" +#include "dedupe-index.h" #include "vdo.h" -#include "dedupeIndex.h" - struct pool_attribute { struct attribute attr; ssize_t (*show)(struct vdo *vdo, char *buf); ssize_t (*store)(struct vdo *vdo, const char *value, size_t count); }; -/**********************************************************************/ static ssize_t vdo_pool_attr_show(struct kobject *directory, struct attribute *attr, char *buf) @@ -42,13 +25,13 @@ static ssize_t vdo_pool_attr_show(struct kobject *directory, struct pool_attribute, attr); struct vdo *vdo = container_of(directory, struct vdo, vdo_directory); + if (pool_attr->show == NULL) { return -EINVAL; } return pool_attr->show(vdo, buf); } -/**********************************************************************/ static ssize_t vdo_pool_attr_store(struct kobject *directory, struct attribute *attr, const char *buf, @@ -58,6 +41,7 @@ static ssize_t vdo_pool_attr_store(struct kobject *directory, struct pool_attribute, attr); struct vdo *vdo = container_of(directory, struct vdo, vdo_directory); + if (pool_attr->store == NULL) { return -EINVAL; } @@ -69,75 +53,81 @@ static struct sysfs_ops vdo_pool_sysfs_ops = { .store = vdo_pool_attr_store, }; -/**********************************************************************/ static ssize_t pool_compressing_show(struct vdo *vdo, char *buf) { return sprintf(buf, "%s\n", - (get_vdo_compressing(vdo) ? "1" : "0")); + (vdo_get_compressing(vdo) ? 
"1" : "0")); } -/**********************************************************************/ static ssize_t pool_discards_active_show(struct vdo *vdo, char *buf) { - return sprintf(buf, "%u\n", vdo->discard_limiter.active); + return sprintf(buf, + "%u\n", + get_data_vio_pool_active_discards(vdo->data_vio_pool)); } -/**********************************************************************/ static ssize_t pool_discards_limit_show(struct vdo *vdo, char *buf) { - return sprintf(buf, "%u\n", vdo->discard_limiter.limit); + return sprintf(buf, + "%u\n", + get_data_vio_pool_discard_limit(vdo->data_vio_pool)); } -/**********************************************************************/ static ssize_t pool_discards_limit_store(struct vdo *vdo, const char *buf, size_t length) { unsigned int value; + int result; if ((length > 12) || (sscanf(buf, "%u", &value) != 1) || (value < 1)) { return -EINVAL; } - vdo->discard_limiter.limit = value; + + result = set_data_vio_pool_discard_limit(vdo->data_vio_pool, value); + if (result != VDO_SUCCESS) { + return -EINVAL; + } + return length; } -/**********************************************************************/ static ssize_t pool_discards_maximum_show(struct vdo *vdo, char *buf) { - return sprintf(buf, "%u\n", vdo->discard_limiter.maximum); + return sprintf(buf, + "%u\n", + get_data_vio_pool_maximum_discards(vdo->data_vio_pool)); } -/**********************************************************************/ static ssize_t pool_instance_show(struct vdo *vdo, char *buf) { return sprintf(buf, "%u\n", vdo->instance); } -/**********************************************************************/ static ssize_t pool_requests_active_show(struct vdo *vdo, char *buf) { - return sprintf(buf, "%u\n", vdo->request_limiter.active); + return sprintf(buf, + "%u\n", + get_data_vio_pool_active_requests(vdo->data_vio_pool)); } -/**********************************************************************/ static ssize_t pool_requests_limit_show(struct vdo *vdo, char *buf) { - return sprintf(buf, "%u\n", vdo->request_limiter.limit); + return sprintf(buf, + "%u\n", + get_data_vio_pool_request_limit(vdo->data_vio_pool)); } -/**********************************************************************/ static ssize_t pool_requests_maximum_show(struct vdo *vdo, char *buf) { - return sprintf(buf, "%u\n", vdo->request_limiter.maximum); + return sprintf(buf, + "%u\n", + get_data_vio_pool_maximum_requests(vdo->data_vio_pool)); } -/**********************************************************************/ static void vdo_pool_release(struct kobject *directory) { - struct vdo *vdo = container_of(directory, struct vdo, vdo_directory); - struct kernel_layer *layer = vdo_as_kernel_layer(vdo); - UDS_FREE(layer); + UDS_FREE(container_of(directory, struct vdo, vdo_directory)); } static struct pool_attribute vdo_pool_compressing_attr = { @@ -216,44 +206,10 @@ static struct attribute *pool_attrs[] = { &vdo_pool_requests_maximum_attr.attr, NULL, }; +ATTRIBUTE_GROUPS(pool); struct kobj_type vdo_directory_type = { .release = vdo_pool_release, .sysfs_ops = &vdo_pool_sysfs_ops, - .default_attrs = pool_attrs, -}; - -/**********************************************************************/ -static void work_queue_directory_release(struct kobject *kobj) -{ - /* - * The work_queue_directory holds an implicit reference to its parent, - * the VDO object (->kobj), so even if there are some external - * references held to the work_queue_directory when work queue - * shutdown calls kobject_put on the VDO object, the VDO object won't - * 
actually be released and won't free the VDO storage until the - * work_queue_directory object is released first. - * - * So, we don't need to do any additional explicit management here. - * - * (But we aren't allowed to use a NULL function pointer to indicate - * a no-op.) - */ -} - -/**********************************************************************/ -static struct attribute *no_attrs[] = { - NULL, -}; - -static struct sysfs_ops no_sysfs_ops = { - // These should never be reachable since there are no attributes. - .show = NULL, - .store = NULL, -}; - -struct kobj_type vdo_work_queue_directory_type = { - .release = work_queue_directory_release, - .sysfs_ops = &no_sysfs_ops, - .default_attrs = no_attrs, + .default_groups = pool_groups, }; diff --git a/vdo/pool-sysfs.h b/vdo/pool-sysfs.h new file mode 100644 index 00000000..775d9ccc --- /dev/null +++ b/vdo/pool-sysfs.h @@ -0,0 +1,19 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright Red Hat + */ + +#ifndef POOL_SYSFS_H +#define POOL_SYSFS_H + +#include + +/* The kobj_type used for setting up the kernel layer kobject. */ +extern struct kobj_type vdo_directory_type; + +/* The sysfs_ops used for the "statistics" subdirectory. */ +extern struct sysfs_ops vdo_pool_stats_sysfs_ops; +/* The attribute used for the "statistics" subdirectory. */ +extern struct attribute *vdo_pool_stats_attrs[]; + +#endif /* POOL_SYSFS_H */ diff --git a/vdo/poolSysfs.h b/vdo/poolSysfs.h deleted file mode 100644 index faf5bd1f..00000000 --- a/vdo/poolSysfs.h +++ /dev/null @@ -1,37 +0,0 @@ -/* - * Copyright Red Hat - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/sulfur/src/c++/vdo/kernel/poolSysfs.h#7 $ - */ - -#ifndef POOL_SYSFS_H -#define POOL_SYSFS_H - -#include - -// The kobj_type used for setting up the kernel layer kobject. -extern struct kobj_type vdo_directory_type; -// The kobj_type used for the "work_queues" subdirectory. -extern struct kobj_type vdo_work_queue_directory_type; - -// The sysfs_ops used for the "statistics" subdirectory. -extern struct sysfs_ops vdo_pool_stats_sysfs_ops; -// The attribute used for the "statistics" subdirectory. -extern struct attribute *vdo_pool_stats_attrs[]; - -#endif /* POOL_SYSFS_H */ diff --git a/vdo/priority-table.c b/vdo/priority-table.c new file mode 100644 index 00000000..75103129 --- /dev/null +++ b/vdo/priority-table.c @@ -0,0 +1,240 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright Red Hat + */ + +#include "priority-table.h" + +#include "errors.h" +#include "memory-alloc.h" +#include "num-utils.h" +#include "permassert.h" + +#include "status-codes.h" + +/* We use a single 64-bit search vector, so the maximum priority is 63 */ +enum { + MAX_PRIORITY = 63 +}; + +/* + * All the entries with the same priority are queued in a circular list in a + * bucket for that priority. 
The table is essentially an array of buckets. + */ +struct bucket { + /* + * The head of a queue of table entries, all having the same priority + */ + struct list_head queue; + /* The priority of all the entries in this bucket */ + unsigned int priority; +}; + +/* + * A priority table is an array of buckets, indexed by priority. New entries + * are added to the end of the queue in the appropriate bucket. The dequeue + * operation finds the highest-priority non-empty bucket by searching a bit + * vector represented as a single 8-byte word, which is very fast with + * compiler and CPU support. + */ +struct priority_table { + /* The maximum priority of entries that may be stored in this table */ + unsigned int max_priority; + /* A bit vector flagging all buckets that are currently non-empty */ + uint64_t search_vector; + /* The array of all buckets, indexed by priority */ + struct bucket buckets[]; +}; + +/** + * make_priority_table() - Allocate and initialize a new priority_table. + * @max_priority: The maximum priority value for table entries. + * @table_ptr: A pointer to hold the new table. + * + * Return: VDO_SUCCESS or an error code. + */ +int make_priority_table(unsigned int max_priority, + struct priority_table **table_ptr) +{ + struct priority_table *table; + int result; + unsigned int priority; + + if (max_priority > MAX_PRIORITY) { + return UDS_INVALID_ARGUMENT; + } + + result = UDS_ALLOCATE_EXTENDED(struct priority_table, max_priority + 1, + struct bucket, __func__, &table); + if (result != VDO_SUCCESS) { + return result; + } + + for (priority = 0; priority <= max_priority; priority++) { + struct bucket *bucket = &table->buckets[priority]; + + bucket->priority = priority; + INIT_LIST_HEAD(&bucket->queue); + } + + table->max_priority = max_priority; + table->search_vector = 0; + + *table_ptr = table; + return VDO_SUCCESS; +} + +/** + * free_priority_table() - Free a priority_table. + * @table: The table to free. + * + * The table does not own the entries stored in it and they are not freed by + * this call. + */ +void free_priority_table(struct priority_table *table) +{ + if (table == NULL) { + return; + } + + /* + * Unlink the buckets from any entries still in the table so the entries + * won't be left with dangling pointers to freed memory. + */ + reset_priority_table(table); + + UDS_FREE(table); +} + +/** + * reset_priority_table() - Reset a priority table, leaving it in the same + * empty state as when newly constructed. + * @table: The table to reset. + * + * The table does not own the entries stored in it and they are not freed (or + * even unlinked from each other) by this call. + */ +void reset_priority_table(struct priority_table *table) +{ + unsigned int priority; + + table->search_vector = 0; + for (priority = 0; priority <= table->max_priority; priority++) { + list_del_init(&table->buckets[priority].queue); + } +} + +/** + * priority_table_enqueue() - Add a new entry to the priority table, appending + * it to the queue for entries with the specified + * priority. + * @table: The table in which to store the entry. + * @priority: The priority of the entry. + * @entry: The list_head embedded in the entry to store in the table + * (the caller must have initialized it). + */ +void priority_table_enqueue(struct priority_table *table, unsigned int priority, + struct list_head *entry) +{ + ASSERT_LOG_ONLY((priority <= table->max_priority), + "entry priority must be valid for the table"); + + /* Append the entry to the queue in the specified bucket. 
*/ + list_move_tail(entry, &table->buckets[priority].queue); + + /* Flag the bucket in the search vector since it must be non-empty. */ + table->search_vector |= (1ULL << priority); +} + +static inline void mark_bucket_empty(struct priority_table *table, + struct bucket *bucket) +{ + table->search_vector &= ~(1ULL << bucket->priority); +} + +/** + * priority_table_dequeue() - Find the highest-priority entry in the table, + * remove it from the table, and return it. + * @table: The priority table from which to remove an entry. + * + * If there are multiple entries with the same priority, the one that has been + * in the table with that priority the longest will be returned. + * + * Return: The dequeued entry, or NULL if the table is currently empty. + */ +struct list_head *priority_table_dequeue(struct priority_table *table) +{ + struct bucket *bucket; + struct list_head *entry; + int top_priority; + + if (table->search_vector == 0) { + /* All buckets are empty. */ + return NULL; + } + + /* + * Find the highest priority non-empty bucket by finding the + * highest-order non-zero bit in the search vector. + */ + top_priority = ilog2(table->search_vector); + + /* Dequeue the first entry in the bucket. */ + bucket = &table->buckets[top_priority]; + entry = (bucket->queue.next); + list_del_init(entry); + + /* Clear the bit in the search vector if the bucket has been emptied. */ + if (list_empty(&bucket->queue)) { + mark_bucket_empty(table, bucket); + } + + return entry; +} + +/** + * priority_table_remove() - Remove a specified entry from its priority table. + * @table: The table from which to remove the entry. + * @entry: The entry to remove from the table. + */ +void priority_table_remove(struct priority_table *table, + struct list_head *entry) +{ + struct list_head *next_entry; + + /* + * We can't guard against calls where the entry is on a list for a + * different table, but it's easy to deal with an entry not in any table + * or list. + */ + if (list_empty(entry)) { + return; + } + + /* + * Remove the entry from the bucket list, remembering a pointer to + * another entry in the ring. + */ + next_entry = entry->next; + list_del_init(entry); + + /* + * If the rest of the list is now empty, the next node must be the list + * head in the bucket and we can use it to update the search vector. + */ + if (list_empty(next_entry)) { + mark_bucket_empty(table, list_entry(next_entry, + struct bucket, queue)); + } +} + +/** + * is_priority_table_empty() - Return whether the priority table is empty. + * @table: The table to check. + * + * Return: true if the table is empty. + */ +bool is_priority_table_empty(struct priority_table *table) +{ + return (table->search_vector == 0); +} diff --git a/vdo/priority-table.h b/vdo/priority-table.h new file mode 100644 index 00000000..9d1b7cd8 --- /dev/null +++ b/vdo/priority-table.h @@ -0,0 +1,53 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright Red Hat + */ + +#ifndef PRIORITY_TABLE_H +#define PRIORITY_TABLE_H + +#include + +/** + * A priority_table is a simple implementation of a priority queue for entries + * with priorities that are small non-negative integer values. It implements + * the obvious priority queue operations of enqueuing an entry and dequeuing + * an entry with the maximum priority. It also supports removing an arbitrary + * entry. The priority of an entry already in the table can be changed by + * removing it and re-enqueuing it with a different priority. All operations + * have O(1) complexity. 
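/*
 * A minimal usage sketch of the priority_table interface described above,
 * assuming only the functions declared in this header. The item type and
 * the calling function are hypothetical illustrations, not part of the
 * VDO sources.
 */
#include <linux/list.h>

struct example_item {
	struct list_head entry;	/* link embedded in the entry itself */
	int payload;
};

static int example_priority_table_usage(void)
{
	struct priority_table *table;
	struct example_item low = { .payload = 1 };
	struct example_item high = { .payload = 2 };
	struct list_head *head;
	int result;

	result = make_priority_table(63, &table);
	if (result != VDO_SUCCESS)
		return result;

	/* The caller must initialize each embedded list_head before use. */
	INIT_LIST_HEAD(&low.entry);
	INIT_LIST_HEAD(&high.entry);

	priority_table_enqueue(table, 5, &low.entry);
	priority_table_enqueue(table, 10, &high.entry);

	/* Dequeues 'high' first: the highest-priority entry is returned. */
	head = priority_table_dequeue(table);
	if (head != NULL) {
		struct example_item *item =
			list_entry(head, struct example_item, entry);
		(void) item->payload;	/* 2 */
	}

	/* Entries are not owned by the table, so freeing it is always safe. */
	free_priority_table(table);
	return VDO_SUCCESS;
}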
+ * + * The links for the table entries must be embedded in the entries themselves. + * Lists are used to link entries in the table and no wrapper type is + * declared, so an existing list entry in an object can also be used to + * queue it in a priority_table, assuming the field is not used for anything + * else while so queued. + * + * The table is implemented as an array of queues (circular lists) indexed by + * priority, along with a hint for which queues are non-empty. Steven Skiena + * calls a very similar structure a "bounded height priority queue", but given + * the resemblance to a hash table, "priority table" seems both shorter and + * more apt, if somewhat novel. + **/ + +struct priority_table; + +int __must_check make_priority_table(unsigned int max_priority, + struct priority_table **table_ptr); + +void free_priority_table(struct priority_table *table); + +void priority_table_enqueue(struct priority_table *table, unsigned int priority, + struct list_head *entry); + +void reset_priority_table(struct priority_table *table); + +struct list_head * __must_check +priority_table_dequeue(struct priority_table *table); + +void priority_table_remove(struct priority_table *table, + struct list_head *entry); + +bool __must_check is_priority_table_empty(struct priority_table *table); + +#endif /* PRIORITY_TABLE_H */ diff --git a/vdo/priorityTable.c b/vdo/priorityTable.c deleted file mode 100644 index b26cbacc..00000000 --- a/vdo/priorityTable.c +++ /dev/null @@ -1,199 +0,0 @@ -/* - * Copyright Red Hat - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/sulfur/src/c++/vdo/base/priorityTable.c#13 $ - */ - -#include "priorityTable.h" - -#include "errors.h" -#include "memoryAlloc.h" -#include "numUtils.h" -#include "permassert.h" - -#include "statusCodes.h" - -/** We use a single 64-bit search vector, so the maximum priority is 63 */ -enum { - MAX_PRIORITY = 63 -}; - -/** - * All the entries with the same priority are queued in a circular list in a - * bucket for that priority. The table is essentially an array of buckets. - **/ -struct bucket { - /** - * The head of a queue of table entries, all having the same priority - */ - struct list_head queue; - /** The priority of all the entries in this bucket */ - unsigned int priority; -}; - -/** - * A priority table is an array of buckets, indexed by priority. New entries - * are added to the end of the queue in the appropriate bucket. The dequeue - * operation finds the highest-priority non-empty bucket by searching a bit - * vector represented as a single 8-byte word, which is very fast with - * compiler and CPU support. 
- **/ -struct priority_table { - /** The maximum priority of entries that may be stored in this table */ - unsigned int max_priority; - /** A bit vector flagging all buckets that are currently non-empty */ - uint64_t search_vector; - /** The array of all buckets, indexed by priority */ - struct bucket buckets[]; -}; - -/**********************************************************************/ -int make_priority_table(unsigned int max_priority, - struct priority_table **table_ptr) -{ - struct priority_table *table; - int result; - unsigned int priority; - - if (max_priority > MAX_PRIORITY) { - return UDS_INVALID_ARGUMENT; - } - - result = UDS_ALLOCATE_EXTENDED(struct priority_table, max_priority + 1, - struct bucket, __func__, &table); - if (result != VDO_SUCCESS) { - return result; - } - - for (priority = 0; priority <= max_priority; priority++) { - struct bucket *bucket = &table->buckets[priority]; - bucket->priority = priority; - INIT_LIST_HEAD(&bucket->queue); - } - - table->max_priority = max_priority; - table->search_vector = 0; - - *table_ptr = table; - return VDO_SUCCESS; -} - -/**********************************************************************/ -void free_priority_table(struct priority_table *table) -{ - if (table == NULL) { - return; - } - - // Unlink the buckets from any entries still in the table so the entries - // won't be left with dangling pointers to freed memory. - reset_priority_table(table); - - UDS_FREE(table); -} - -/**********************************************************************/ -void reset_priority_table(struct priority_table *table) -{ - unsigned int priority; - table->search_vector = 0; - for (priority = 0; priority <= table->max_priority; priority++) { - list_del_init(&table->buckets[priority].queue); - } -} - -/**********************************************************************/ -void priority_table_enqueue(struct priority_table *table, unsigned int priority, - struct list_head *entry) -{ - ASSERT_LOG_ONLY((priority <= table->max_priority), - "entry priority must be valid for the table"); - - // Append the entry to the queue in the specified bucket. - list_move_tail(entry, &table->buckets[priority].queue); - - // Flag the bucket in the search vector since it must be non-empty. - table->search_vector |= (1ULL << priority); -} - -/**********************************************************************/ -static inline void mark_bucket_empty(struct priority_table *table, - struct bucket *bucket) -{ - table->search_vector &= ~(1ULL << bucket->priority); -} - -/**********************************************************************/ -struct list_head *priority_table_dequeue(struct priority_table *table) -{ - struct bucket *bucket; - struct list_head *entry; - - // Find the highest priority non-empty bucket by finding the - // highest-order non-zero bit in the search vector. - int top_priority = log_base_two(table->search_vector); - - if (top_priority < 0) { - // All buckets are empty. - return NULL; - } - - // Dequeue the first entry in the bucket. - bucket = &table->buckets[top_priority]; - entry = (bucket->queue.next); - list_del_init(entry); - - // Clear the bit in the search vector if the bucket has been emptied. 
- if (list_empty(&bucket->queue)) { - mark_bucket_empty(table, bucket); - } - - return entry; -} - -/**********************************************************************/ -void priority_table_remove(struct priority_table *table, - struct list_head *entry) -{ - struct list_head *next_entry; - - // We can't guard against calls where the entry is on a list for a - // different table, but it's easy to deal with an entry not in any table - // or list. - if (list_empty(entry)) { - return; - } - - // Remove the entry from the bucket list, remembering a pointer to - // another entry in the ring. - next_entry = entry->next; - list_del_init(entry); - - // If the rest of the list is now empty, the next node must be the list - // head in the bucket and we can use it to update the search vector. - if (list_empty(next_entry)) { - mark_bucket_empty(table, list_entry(next_entry, - struct bucket, queue)); - } -} - -/**********************************************************************/ -bool is_priority_table_empty(struct priority_table *table) -{ - return (table->search_vector == 0); -} diff --git a/vdo/priorityTable.h b/vdo/priorityTable.h deleted file mode 100644 index 370d83d6..00000000 --- a/vdo/priorityTable.h +++ /dev/null @@ -1,121 +0,0 @@ -/* - * Copyright Red Hat - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/sulfur/src/c++/vdo/base/priorityTable.h#6 $ - */ - -#ifndef PRIORITY_TABLE_H -#define PRIORITY_TABLE_H - -#include - -/** - * A priority_table is a simple implementation of a priority queue for entries - * with priorities that are small non-negative integer values. It implements - * the obvious priority queue operations of enqueuing an entry and dequeuing - * an entry with the maximum priority. It also supports removing an arbitrary - * entry. The priority of an entry already in the table can be changed by - * removing it and re-enqueuing it with a different priority. All operations - * have O(1) complexity. - * - * The links for the table entries must be embedded in the entries themselves. - * Lists are used to link entries in the table and no wrapper type is - * declared, so an existing list entry in an object can also be used to - * queue it in a priority_table, assuming the field is not used for anything - * else while so queued. - * - * The table is implemented as an array of queues (circular lists) indexed by - * priority, along with a hint for which queues are non-empty. Steven Skiena - * calls a very similar structure a "bounded height priority queue", but given - * the resemblance to a hash table, "priority table" seems both shorter and - * more apt, if somewhat novel. - **/ - -struct priority_table; - -/** - * Allocate and initialize a new priority_table. 
- * - * @param [in] max_priority The maximum priority value for table entries - * @param [out] table_ptr A pointer to hold the new table - * - * @return VDO_SUCCESS or an error code - **/ -int __must_check make_priority_table(unsigned int max_priority, - struct priority_table **table_ptr); - -/** - * Free a priority_table. NOTE: The table does not own the entries stored in - * it and they are not freed by this call. - * - * @param table The table to free - **/ -void free_priority_table(struct priority_table *table); - -/** - * Add a new entry to the priority table, appending it to the queue for - * entries with the specified priority. - * - * @param table The table in which to store the entry - * @param priority The priority of the entry - * @param entry The list_head embedded in the entry to store in the table - * (the caller must have initialized it) - **/ -void priority_table_enqueue(struct priority_table *table, unsigned int priority, - struct list_head *entry); - -/** - * Reset a priority table, leaving it in the same empty state as when newly - * constructed. NOTE: The table does not own the entries stored in it and they - * are not freed (or even unlinked from each other) by this call. - * - * @param table The table to reset - **/ -void reset_priority_table(struct priority_table *table); - -/** - * Find the highest-priority entry in the table, remove it from the table, and - * return it. If there are multiple entries with the same priority, the one - * that has been in the table with that priority the longest will be returned. - * - * @param table The priority table from which to remove an entry - * - * @return the dequeued entry, or NULL if the table is currently empty - **/ -struct list_head * __must_check -priority_table_dequeue(struct priority_table *table); - -/** - * Remove a specified entry from its priority table. - * - * @param table The table from which to remove the entry - * @param entry The entry to remove from the table - **/ -void priority_table_remove(struct priority_table *table, - struct list_head *entry); - -/** - * Return whether the priority table is empty. - * - * @param table The table to check - * - * @return true if the table is empty - **/ -bool __must_check is_priority_table_empty(struct priority_table *table); - -#endif /* PRIORITY_TABLE_H */ diff --git a/uds/util/radixSort.c b/vdo/radix-sort.c similarity index 68% rename from uds/util/radixSort.c rename to vdo/radix-sort.c index 5174e5dd..affa5884 100644 --- a/uds/util/radixSort.c +++ b/vdo/radix-sort.c @@ -1,22 +1,6 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * Copyright Red Hat - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. 
- * - * $Id: //eng/uds-releases/krusty/src/uds/util/radixSort.c#10 $ */ /* @@ -28,20 +12,20 @@ * http://www.usenix.org/publications/compsystems/1993/win_mcilroy.pdf */ -#include "radixSort.h" +#include "radix-sort.h" #include "compiler.h" -#include "memoryAlloc.h" -#include "stringUtils.h" -#include "typeDefs.h" +#include "memory-alloc.h" +#include "string-utils.h" +#include "type-defs.h" #include "uds.h" enum { - // Piles smaller than this are handled with a simple insertion sort. + /* Piles smaller than this are handled with a simple insertion sort. */ INSERTION_SORT_THRESHOLD = 12 }; -// Sort keys are pointers to immutable fixed-length arrays of bytes. +/* Sort keys are pointers to immutable fixed-length arrays of bytes. */ typedef const uint8_t * sort_key_t; /** @@ -50,10 +34,10 @@ typedef const uint8_t * sort_key_t; * byte must be counted. **/ struct histogram { - uint16_t used; // number of non-empty bins - uint16_t first; // index (key byte) of the first non-empty bin - uint16_t last; // index (key byte) of the last non-empty bin - uint32_t size[256]; // size[byte] == # of occurrences of byte + uint16_t used; /* number of non-empty bins */ + uint16_t first; /* index (key byte) of the first non-empty bin */ + uint16_t last; /* index (key byte) of the last non-empty bin */ + uint32_t size[256]; /* size[byte] == # of occurrences of byte */ }; /** @@ -61,12 +45,12 @@ struct histogram { * and to put a logarithmic bound on the stack space needed. **/ struct task { - sort_key_t *first_key; // Pointers to first and last keys to sort, - // inclusive. + sort_key_t *first_key; /* Pointers to first and last keys to sort, */ + /* inclusive. */ sort_key_t *last_key; - uint16_t offset; // The offset into the key at which to - // continue sorting. - uint16_t length; // The number of bytes remaining in the sort keys. + uint16_t offset; /* The offset into the key at which to */ + /* continue sorting. */ + uint16_t length; /* The number of bytes remaining in the sort keys. */ }; struct radix_sorter { @@ -101,15 +85,17 @@ compare(sort_key_t key1, sort_key_t key2, uint16_t offset, uint16_t length) **/ static INLINE void insert_key(const struct task task, sort_key_t *next) { - // Pull the unsorted key out, freeing up the array slot. + /* Pull the unsorted key out, freeing up the array slot. */ sort_key_t unsorted = *next; - // Compare the key to the preceding sorted entries, shifting - // down the ones that are larger. + /* + * Compare the key to the preceding sorted entries, shifting + * down the ones that are larger. + */ while ((--next >= task.first_key) && (compare(unsorted, next[0], task.offset, task.length) < 0)) { next[1] = next[0]; } - // Insert the key into the last slot that was cleared, sorting it. + /* Insert the key into the last slot that was cleared, sorting it. */ next[1] = unsorted; } @@ -122,10 +108,13 @@ static INLINE void insert_key(const struct task task, sort_key_t *next) **/ static INLINE void insertion_sort(const struct task task) { - // (first_key .. first_key) is trivially sorted. Repeatedly - // insert the next key into the sorted list of keys preceding - // it, and voila! + /* + * (first_key .. first_key) is trivially sorted. Repeatedly + * insert the next key into the sorted list of keys preceding + * it, and voila! 
+ */ sort_key_t *next; + for (next = task.first_key + 1; next <= task.last_key; next++) { insert_key(task, next); } @@ -141,13 +130,13 @@ static INLINE void push_task(struct task **stack_pointer, uint16_t length) { struct task *task = (*stack_pointer)++; + task->first_key = first_key; task->last_key = &first_key[count - 1]; task->offset = offset; task->length = length; } -/**********************************************************************/ static INLINE void swap_keys(sort_key_t *a, sort_key_t *b) { sort_key_t c = *a; @@ -166,24 +155,32 @@ static INLINE void swap_keys(sort_key_t *a, sort_key_t *b) static INLINE void measure_bins(const struct task task, struct histogram *bins) { sort_key_t *key_ptr; - // Set bogus values that will will be replaced by min and max, - // respectively. + /* + * Set bogus values that will will be replaced by min and max, + * respectively. + */ bins->first = UINT8_MAX; bins->last = 0; - // Subtle invariant: bins->used and bins->size[] are zero - // because the sorting code clears it all out as it goes. Even - // though this structure is re-used, we don't need to pay to - // zero it before starting a new tally. + /* + * Subtle invariant: bins->used and bins->size[] are zero + * because the sorting code clears it all out as it goes. Even + * though this structure is re-used, we don't need to pay to + * zero it before starting a new tally. + */ for (key_ptr = task.first_key; key_ptr <= task.last_key; key_ptr++) { - // Increment the count for the byte in the key at the - // current offset. + /* + * Increment the count for the byte in the key at the + * current offset. + */ uint8_t bin = (*key_ptr)[task.offset]; uint32_t size = ++bins->size[bin]; - // Track non-empty bins when the count transitions - // from zero to one. + /* + * Track non-empty bins when the count transitions + * from zero to one. + */ if (size == 1) { bins->used += 1; if (bin < bins->first) { @@ -229,13 +226,14 @@ static INLINE int push_bins(struct task **stack, { sort_key_t *pile_start = first_key; int bin; + for (bin = bins->first;; bin++) { uint32_t size = bins->size[bin]; - // Skip empty piles. + /* Skip empty piles. */ if (size == 0) { continue; } - // There's no need to sort empty keys. + /* There's no need to sort empty keys. */ if (length > 0) { if (size > INSERTION_SORT_THRESHOLD) { if (*stack >= end_of_stack) { @@ -263,7 +261,6 @@ static INLINE int push_bins(struct task **stack, return UDS_SUCCESS; } -/**********************************************************************/ int make_radix_sorter(unsigned int count, struct radix_sorter **sorter) { unsigned int stack_size = count / INSERTION_SORT_THRESHOLD; @@ -282,13 +279,11 @@ int make_radix_sorter(unsigned int count, struct radix_sorter **sorter) return UDS_SUCCESS; } -/**********************************************************************/ void free_radix_sorter(struct radix_sorter *sorter) { UDS_FREE(sorter); } -/**********************************************************************/ int radix_sort(struct radix_sorter *sorter, const unsigned char *keys[], unsigned int count, @@ -299,12 +294,12 @@ int radix_sort(struct radix_sorter *sorter, sort_key_t **pile = sorter->pile; struct task *sp = sorter->stack; - // All zero-length keys are identical and therefore already sorted. + /* All zero-length keys are identical and therefore already sorted. */ if ((count == 0) || (length == 0)) { return UDS_SUCCESS; } - // The initial task is to sort the entire length of all the keys. 
+ /* The initial task is to sort the entire length of all the keys. */ start = (struct task) { .first_key = keys, .last_key = &keys[count - 1], @@ -335,9 +330,11 @@ int radix_sort(struct radix_sorter *sorter, measure_bins(task, bins); - // Now that we know how large each bin is, generate pointers for - // each of the piles and push a new task to sort each pile by - // the next radix byte. + /* + * Now that we know how large each bin is, generate pointers for + * each of the piles and push a new task to sort each pile by + * the next radix byte. + */ lp = sorter->is_list; result = push_bins(&sp, sorter->end_of_stack, @@ -351,34 +348,42 @@ int radix_sort(struct radix_sorter *sorter, memset(bins, 0, sizeof(*bins)); return result; } - // Now bins->used is zero again. + /* Now bins->used is zero again. */ - // Don't bother processing the last pile--when piles 0..N-1 are - // all in place, then pile N must also be in place. + /* + * Don't bother processing the last pile--when piles 0..N-1 are + * all in place, then pile N must also be in place. + */ end = task.last_key - bins->size[bins->last]; bins->size[bins->last] = 0; for (fence = task.first_key; fence <= end;) { uint8_t bin; sort_key_t key = *fence; - // The radix byte of the key tells us which pile it - // belongs in. Swap it for an unprocessed item just - // below that pile, and repeat. + /* + * The radix byte of the key tells us which pile it + * belongs in. Swap it for an unprocessed item just + * below that pile, and repeat. + */ while (--pile[bin = key[task.offset]] > fence) { swap_keys(pile[bin], &key); } - // The pile reached the fence. Put the key at the bottom - // of that pile. completing it, and advance the fence to - // the next pile. + /* + * The pile reached the fence. Put the key at the bottom + * of that pile. completing it, and advance the fence to + * the next pile. + */ *fence = key; fence += bins->size[bin]; bins->size[bin] = 0; } - // Now bins->size[] is all zero again. + /* Now bins->size[] is all zero again. */ - // When the number of keys in a task gets small enough, its - // faster to use an insertion sort than to keep subdividing into - // tiny piles. + /* + * When the number of keys in a task gets small enough, its + * faster to use an insertion sort than to keep subdividing into + * tiny piles. + */ while (--lp >= sorter->is_list) { insertion_sort(*lp); } diff --git a/uds/util/radixSort.h b/vdo/radix-sort.h similarity index 66% rename from uds/util/radixSort.h rename to vdo/radix-sort.h index a209dc74..f594ce31 100644 --- a/uds/util/radixSort.h +++ b/vdo/radix-sort.h @@ -1,22 +1,6 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ /* * Copyright Red Hat - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. 
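For illustration only, not part of this patch: radix-sort.c above is an in-place MSD ("American flag") radix sort that histograms the byte at the current offset, swaps keys into their piles, and falls back to insertion sort for small piles. The standalone sketch below keeps only the core idea, a byte histogram, prefix sums to locate each pile, and recursion into the piles, but distributes out of place for clarity; it is not the driver's implementation and the names are illustrative.

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Sort n pointers to fixed-length byte keys by byte d, then recurse. */
static void msd_sort(const uint8_t **keys, const uint8_t **aux, size_t n,
		     size_t d, size_t len)
{
	size_t count[257] = { 0 };	/* count[b + 1] counts byte value b */
	size_t i;

	if (n < 2 || d == len)
		return;
	/* Histogram of the byte at offset d (the "measure_bins" step). */
	for (i = 0; i < n; i++)
		count[keys[i][d] + 1]++;
	/* Prefix sums turn counts into each pile's starting slot. */
	for (i = 1; i <= 256; i++)
		count[i] += count[i - 1];
	/* Distribute the pointers into their piles (stable, out of place). */
	for (i = 0; i < n; i++)
		aux[count[keys[i][d]]++] = keys[i];
	memcpy(keys, aux, n * sizeof(*keys));
	/* After distribution, count[b] is the end of pile b; recurse into
	 * each pile on the next byte. */
	for (i = 0; i < 256; i++) {
		size_t start = (i == 0) ? 0 : count[i - 1];
		size_t size = count[i] - start;

		if (size > 1)
			msd_sort(keys + start, aux + start, size, d + 1, len);
	}
}

int main(void)
{
	const uint8_t k1[4] = { 2, 0, 0, 1 };
	const uint8_t k2[4] = { 1, 9, 9, 9 };
	const uint8_t k3[4] = { 2, 0, 0, 0 };
	const uint8_t *keys[3] = { k1, k2, k3 };
	const uint8_t *aux[3];
	size_t i;

	msd_sort(keys, aux, 3, 0, 4);
	/* Prints 1.9.9.9, 2.0.0.0, 2.0.0.1 */
	for (i = 0; i < 3; i++)
		printf("%u.%u.%u.%u\n", keys[i][0], keys[i][1],
		       keys[i][2], keys[i][3]);
	return 0;
}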
- * - * $Id: //eng/uds-releases/krusty/src/uds/util/radixSort.h#7 $ */ #ifndef RADIX_SORT_H diff --git a/vdo/random.c b/vdo/random.c new file mode 100644 index 00000000..f45e23fd --- /dev/null +++ b/vdo/random.c @@ -0,0 +1,19 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright Red Hat + */ + +#include "random.h" + +#include "permassert.h" + +unsigned int random_in_range(unsigned int lo, unsigned int hi) +{ + return lo + random() % (hi - lo + 1); +} + +void random_compile_time_assertions(void) +{ + STATIC_ASSERT((((uint64_t) RAND_MAX + 1) & RAND_MAX) == 0); +} + diff --git a/uds/random.h b/vdo/random.h similarity index 56% rename from uds/random.h rename to vdo/random.h index 6b5df868..5eaed5e5 100644 --- a/uds/random.h +++ b/vdo/random.h @@ -1,22 +1,6 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ /* * Copyright Red Hat - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/uds-releases/krusty/src/uds/random.h#4 $ */ #ifndef RANDOM_H @@ -25,7 +9,7 @@ #include #include "compiler.h" -#include "typeDefs.h" +#include "type-defs.h" /** * Get random unsigned integer in a given range diff --git a/vdo/readOnlyNotifier.c b/vdo/read-only-notifier.c similarity index 52% rename from vdo/readOnlyNotifier.c rename to vdo/read-only-notifier.c index bc2868ca..dc3c48ce 100644 --- a/vdo/readOnlyNotifier.c +++ b/vdo/read-only-notifier.c @@ -1,37 +1,23 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * Copyright Red Hat - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/sulfur/src/c++/vdo/base/readOnlyNotifier.c#25 $ */ -#include "readOnlyNotifier.h" +#include "read-only-notifier.h" #include #include "logger.h" -#include "memoryAlloc.h" +#include "memory-alloc.h" #include "permassert.h" #include "completion.h" -#include "threadConfig.h" +#include "thread-config.h" #include "vdo.h" /** + * DOC: + * * A read_only_notifier has a single completion which is used to perform * read-only notifications, however, vdo_enter_read_only_mode() may be called * from any thread. A pair of atomic fields are used to control the read-only @@ -50,10 +36,10 @@ * read_only_error will not be VDO_SUCCESS. 
This configuration will indicate to * vdo_allow_read_only_mode_entry() that there is a pending notification to * perform. - **/ + */ enum { /** Notifications are allowed but not in progress */ - MAY_NOTIFY = 0, + MAY_NOTIFY, /** A notification is in progress */ NOTIFYING, /** Notifications are not allowed */ @@ -62,23 +48,23 @@ enum { NOTIFIED, }; -/** +/* * An object to be notified when the VDO enters read-only mode - **/ + */ struct read_only_listener { - /** The listener */ + /* The listener */ void *listener; - /** The method to call to notify the listener */ + /* The method to call to notify the listener */ vdo_read_only_notification *notify; - /** A pointer to the next listener */ + /* A pointer to the next listener */ struct read_only_listener *next; }; -/** +/* * Data associated with each base code thread. - **/ + */ struct thread_data { - /** + /* * Each thread maintains its own notion of whether the VDO is read-only * so that the read-only state can be checked from any base thread * without worrying about synchronization or thread safety. This does @@ -87,45 +73,52 @@ struct thread_data { * cause any problems. */ bool is_read_only; - /** + /* * A list of objects waiting to be notified on this thread that the VDO * has entered read-only mode. - **/ + */ struct read_only_listener *listeners; }; struct read_only_notifier { - /** The completion for entering read-only mode */ + /* The completion for entering read-only mode */ struct vdo_completion completion; - /** A completion waiting for notifications to be drained or enabled */ + /* A completion waiting for notifications to be drained or enabled */ struct vdo_completion *waiter; - /** The code of the error which put the VDO into read-only mode */ + /* The code of the error which put the VDO into read-only mode */ atomic_t read_only_error; - /** The current state of the notifier (values described above) */ + /* The current state of the notifier (values described above) */ atomic_t state; - /** The thread config of the VDO */ + /* The thread config of the VDO */ const struct thread_config *thread_config; - /** The array of per-thread data */ + /* The array of per-thread data */ struct thread_data thread_data[]; }; /** - * Convert a generic vdo_completion to a read_only_notifier. + * as_notifier() - Convert a generic vdo_completion to a read_only_notifier. + * @completion: The completion to convert. * - * @param completion The completion to convert - * - * @return The completion as a read_only_notifier - **/ + * Return: The completion as a read_only_notifier. + */ static inline struct read_only_notifier * as_notifier(struct vdo_completion *completion) { - assert_vdo_completion_type(completion->type, + vdo_assert_completion_type(completion->type, VDO_READ_ONLY_MODE_COMPLETION); return container_of(completion, struct read_only_notifier, completion); } -/**********************************************************************/ -int make_vdo_read_only_notifier(bool is_read_only, +/** + * vdo_make_read_only_notifier() - Create a read-only notifer. + * @is_read_only: Whether the VDO is already read-only. + * @thread_config: The thread configuration of the VDO. + * @vdo: The VDO. + * @notifier_ptr: A pointer to receive the new notifier. + * + * Return: VDO_SUCCESS or an error. 
+ */ +int vdo_make_read_only_notifier(bool is_read_only, const struct thread_config *thread_config, struct vdo *vdo, struct read_only_notifier **notifier_ptr) @@ -133,7 +126,7 @@ int make_vdo_read_only_notifier(bool is_read_only, struct read_only_notifier *notifier; thread_count_t id; int result = UDS_ALLOCATE_EXTENDED(struct read_only_notifier, - thread_config->base_thread_count, + thread_config->thread_count, struct thread_data, __func__, ¬ifier); @@ -149,10 +142,10 @@ int make_vdo_read_only_notifier(bool is_read_only, atomic_set(¬ifier->state, MAY_NOT_NOTIFY); } - initialize_vdo_completion(¬ifier->completion, vdo, + vdo_initialize_completion(¬ifier->completion, vdo, VDO_READ_ONLY_MODE_COMPLETION); - for (id = 0; id < thread_config->base_thread_count; id++) { + for (id = 0; id < thread_config->thread_count; id++) { notifier->thread_data[id].is_read_only = is_read_only; } @@ -161,10 +154,10 @@ int make_vdo_read_only_notifier(bool is_read_only, } /** - * Free the list of read-only listeners associated with a thread. - * - * @param thread_data The thread holding the list to free - **/ + * free_listeners() - Free the list of read-only listeners associated + * with a thread. + * @thread_data: The thread holding the list to free. + */ static void free_listeners(struct thread_data *thread_data) { struct read_only_listener *listener, *next; @@ -177,8 +170,11 @@ static void free_listeners(struct thread_data *thread_data) } } -/**********************************************************************/ -void free_vdo_read_only_notifier(struct read_only_notifier *notifier) +/** + * vdo_free_read_only_notifier() - Free a read_only_notifier. + * @notifier: The notifier to free. + */ +void vdo_free_read_only_notifier(struct read_only_notifier *notifier) { thread_count_t id; @@ -186,7 +182,7 @@ void free_vdo_read_only_notifier(struct read_only_notifier *notifier) return; } - for (id = 0; id < notifier->thread_config->base_thread_count; id++) { + for (id = 0; id < notifier->thread_config->thread_count; id++) { free_listeners(¬ifier->thread_data[id]); } @@ -194,53 +190,71 @@ void free_vdo_read_only_notifier(struct read_only_notifier *notifier) } /** - * Check that a function was called on the admin thread. - * - * @param notifier The notifier - * @param caller The name of the function (for logging) - **/ -static void assert_on_admin_thread(struct read_only_notifier *notifier, - const char *caller) + * assert_notifier_on_admin_thread() - Check that a function was + * called on the admin thread. + * @notifier: The notifier. + * @caller: The name of the function (for logging). + */ +static void +assert_notifier_on_admin_thread(struct read_only_notifier *notifier, + const char *caller) { thread_id_t thread_id = vdo_get_callback_thread_id(); + ASSERT_LOG_ONLY((notifier->thread_config->admin_thread == thread_id), "%s called on admin thread", caller); } -/**********************************************************************/ +/** + * vdo_wait_until_not_entering_read_only_mode() - Wait until no read-only + * notifications are in + * progress and prevent any + * subsequent notifications. + * @notifier: The read-only notifier on which to wait. + * @parent: The completion to notify when no threads are entering + * read-only mode. + * + * Notifications may be re-enabled by calling + * vdo_allow_read_only_mode_entry(). 
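For illustration only, not part of this patch: the DOC comment above describes a pair of atomic fields, the recorded error and a small state machine, that let vdo_enter_read_only_mode() be called from any thread while exactly one caller launches the notification. A minimal userspace sketch of that gate follows, using C11 atomics in place of the kernel's atomic_t and a printf in place of the notification completion; it is a simplification, not the notifier's code.

#include <stdatomic.h>
#include <stdio.h>

enum { MAY_NOTIFY, NOTIFYING, MAY_NOT_NOTIFY, NOTIFIED };

static _Atomic int state = MAY_NOTIFY;
static _Atomic int read_only_error = 0;	/* 0 stands in for VDO_SUCCESS */

static void enter_read_only_mode(int error_code)
{
	int expected = 0;

	/* Record only the first error; later errors are ignored. */
	if (!atomic_compare_exchange_strong(&read_only_error, &expected,
					    error_code))
		return;

	/* Exactly one caller wins MAY_NOTIFY -> NOTIFYING and performs the
	 * notification. If notifications are currently disallowed
	 * (MAY_NOT_NOTIFY), the recorded error is left pending for when they
	 * are re-allowed, as the DOC comment above describes. */
	expected = MAY_NOTIFY;
	if (!atomic_compare_exchange_strong(&state, &expected, NOTIFYING))
		return;

	printf("notifying all threads of error %d\n", error_code);
	atomic_store(&state, NOTIFIED);
}

int main(void)
{
	enter_read_only_mode(-5);	/* performs the notification */
	enter_read_only_mode(-7);	/* no-op: an error is already recorded */
	return 0;
}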
+ */ void vdo_wait_until_not_entering_read_only_mode(struct read_only_notifier *notifier, struct vdo_completion *parent) { int state; + if (notifier == NULL) { - finish_vdo_completion(parent, VDO_SUCCESS); + vdo_finish_completion(parent, VDO_SUCCESS); return; } - assert_on_admin_thread(notifier, __func__); + assert_notifier_on_admin_thread(notifier, __func__); if (notifier->waiter != NULL) { - finish_vdo_completion(parent, VDO_COMPONENT_BUSY); + vdo_finish_completion(parent, VDO_COMPONENT_BUSY); return; } - // Extra barriers because this was original developed using - // a CAS operation that implicitly had them. + /* + * Extra barriers because this was original developed using + * a CAS operation that implicitly had them. + */ smp_mb__before_atomic(); state = atomic_cmpxchg(¬ifier->state, MAY_NOTIFY, MAY_NOT_NOTIFY); smp_mb__after_atomic(); if ((state == MAY_NOT_NOTIFY) || (state == NOTIFIED)) { - // Notifications are already done or disallowed. - complete_vdo_completion(parent); + /* Notifications are already done or disallowed. */ + vdo_complete_completion(parent); return; } if (state == MAY_NOTIFY) { - // A notification was not in progress, and now they are - // disallowed. - complete_vdo_completion(parent); + /* + * A notification was not in progress, and now they are + * disallowed. + */ + vdo_complete_completion(parent); return; } @@ -253,55 +267,58 @@ void vdo_wait_until_not_entering_read_only_mode(struct read_only_notifier *notif } /** - * Complete the process of entering read only mode. - * - * @param completion The read-only mode completion - **/ + * finish_entering_read_only_mode() - Complete the process of entering read + * only mode. + * @completion: The read-only mode completion. + */ static void finish_entering_read_only_mode(struct vdo_completion *completion) { struct read_only_notifier *notifier = as_notifier(completion); struct vdo_completion *waiter = notifier->waiter; - assert_on_admin_thread(notifier, __func__); + assert_notifier_on_admin_thread(notifier, __func__); smp_wmb(); atomic_set(¬ifier->state, NOTIFIED); if (waiter != NULL) { notifier->waiter = NULL; - finish_vdo_completion(waiter, completion->result); + vdo_finish_completion(waiter, completion->result); } } /** - * Inform each thread that the VDO is in read-only mode. - * - * @param completion The read-only mode completion - **/ + * make_thread_read_only() - Inform each thread that the VDO is in read-only + * mode. + * @completion: The read-only mode completion. + */ static void make_thread_read_only(struct vdo_completion *completion) { thread_id_t thread_id = completion->callback_thread_id; struct read_only_notifier *notifier = as_notifier(completion); struct read_only_listener *listener = completion->parent; + if (listener == NULL) { - // This is the first call on this thread + /* This is the first call on this thread */ struct thread_data *thread_data = ¬ifier->thread_data[thread_id]; thread_data->is_read_only = true; listener = thread_data->listeners; if (thread_id == 0) { - // Note: This message must be recognizable by - // Permabit::UserMachine. + /* + * Note: This message must be recognizable by + * Permabit::UserMachine. 
+ */ uds_log_error_strerror(atomic_read(¬ifier->read_only_error), "Unrecoverable error, entering read-only mode"); } } else { - // We've just finished notifying a listener + /* We've just finished notifying a listener */ listener = listener->next; } if (listener != NULL) { - // We have a listener to notify - prepare_vdo_completion(completion, + /* We have a listener to notify */ + vdo_prepare_completion(completion, make_thread_read_only, make_thread_read_only, thread_id, @@ -310,61 +327,86 @@ static void make_thread_read_only(struct vdo_completion *completion) return; } - // We're done with this thread - if (++thread_id >= notifier->thread_config->base_thread_count) { - // There are no more threads - prepare_vdo_completion(completion, + /* We're done with this thread */ + if (++thread_id == notifier->thread_config->dedupe_thread) { + /* + * We don't want to notify the dedupe thread since it may be + * blocked rebuilding the index. + */ + ++thread_id; + } + + if (thread_id >= notifier->thread_config->thread_count) { + /* There are no more threads */ + vdo_prepare_completion(completion, finish_entering_read_only_mode, finish_entering_read_only_mode, notifier->thread_config->admin_thread, NULL); } else { - prepare_vdo_completion(completion, + vdo_prepare_completion(completion, make_thread_read_only, make_thread_read_only, thread_id, NULL); } - invoke_vdo_completion_callback(completion); + vdo_invoke_completion_callback(completion); } -/**********************************************************************/ +/** + * vdo_allow_read_only_mode_entry() - Allow the notifier to put the VDO into + * read-only mode, reversing the effects of + * vdo_wait_until_not_entering_read_only_mode(). + * @notifier: The notifier. + * @parent: The object to notify once the operation is complete. + * + * If some thread tried to put the VDO into read-only mode while + * notifications were disallowed, it will be done when this method is called. + * If that happens, the parent will not be notified until the VDO has actually + * entered read-only mode and attempted to save the super block. + * + * Context: This method may only be called from the admin thread. + */ void vdo_allow_read_only_mode_entry(struct read_only_notifier *notifier, struct vdo_completion *parent) { int state; - assert_on_admin_thread(notifier, __func__); + assert_notifier_on_admin_thread(notifier, __func__); if (notifier->waiter != NULL) { - finish_vdo_completion(parent, VDO_COMPONENT_BUSY); + vdo_finish_completion(parent, VDO_COMPONENT_BUSY); return; } - // Extra barriers because this was original developed using - // a CAS operation that implicitly had them. + /* + * Extra barriers because this was original developed using + * a CAS operation that implicitly had them. + */ smp_mb__before_atomic(); state = atomic_cmpxchg(¬ifier->state, MAY_NOT_NOTIFY, MAY_NOTIFY); smp_mb__after_atomic(); if (state != MAY_NOT_NOTIFY) { - // Notifications were already allowed or complete. - complete_vdo_completion(parent); + /* Notifications were already allowed or complete. */ + vdo_complete_completion(parent); return; } if (atomic_read(¬ifier->read_only_error) == VDO_SUCCESS) { smp_rmb(); - // We're done - complete_vdo_completion(parent); + /* We're done */ + vdo_complete_completion(parent); return; } - // There may have been a pending notification + /* There may have been a pending notification */ - // Extra barriers because this was original developed using - // a CAS operation that implicitly had them. 
+ /* + * Extra barriers because this was original developed using + * a CAS operation that implicitly had them. + */ smp_mb__before_atomic(); state = atomic_cmpxchg(¬ifier->state, MAY_NOTIFY, NOTIFYING); smp_mb__after_atomic(); @@ -376,16 +418,23 @@ void vdo_allow_read_only_mode_entry(struct read_only_notifier *notifier, * set the state to MAY_NOTIFY. It has already started the * notification. */ - complete_vdo_completion(parent); + vdo_complete_completion(parent); return; } - // Do the pending notification. + /* Do the pending notification. */ notifier->waiter = parent; make_thread_read_only(¬ifier->completion); } -/**********************************************************************/ +/** + * vdo_enter_read_only_mode() - Put a VDO into read-only mode and save the + * read-only state in the super block. + * @notifier: The read-only notifier of the VDO. + * @error_code: The error which caused the VDO to enter read-only mode. + * + * This method is a no-op if the VDO is already read-only. + */ void vdo_enter_read_only_mode(struct read_only_notifier *notifier, int error_code) { @@ -396,16 +445,18 @@ void vdo_enter_read_only_mode(struct read_only_notifier *notifier, if (thread_id != VDO_INVALID_THREAD_ID) { thread_data = ¬ifier->thread_data[thread_id]; if (thread_data->is_read_only) { - // This thread has already gone read-only. + /* This thread has already gone read-only. */ return; } - // Record for this thread that the VDO is read-only. + /* Record for this thread that the VDO is read-only. */ thread_data->is_read_only = true; } - // Extra barriers because this was original developed using a CAS - // operation that implicitly had them. + /* + * Extra barriers because this was original developed using a CAS + * operation that implicitly had them. + */ smp_mb__before_atomic(); state = atomic_cmpxchg(¬ifier->read_only_error, VDO_SUCCESS, @@ -413,48 +464,90 @@ void vdo_enter_read_only_mode(struct read_only_notifier *notifier, smp_mb__after_atomic(); if (state != VDO_SUCCESS) { - // The notifier is already aware of a read-only error + /* The notifier is already aware of a read-only error */ return; } state = atomic_cmpxchg(¬ifier->state, MAY_NOTIFY, NOTIFYING); - // Extra barrier because this was original developed using a CAS - // operation that implicitly had them. + /* + * Extra barrier because this was original developed using a CAS + * operation that implicitly had them. + */ smp_mb__after_atomic(); if (state != MAY_NOTIFY) { return; } - // Initiate a notification starting on the lowest numbered thread. - launch_vdo_completion_callback(¬ifier->completion, + /* Initiate a notification starting on the lowest numbered thread. */ + vdo_launch_completion_callback(¬ifier->completion, make_thread_read_only, 0); } -/**********************************************************************/ +/** + * vdo_is_read_only() - Check whether the VDO is read-only. + * @notifier: The read-only notifier of the VDO. + * + * This method may be called from any thread, as opposed to examining the + * VDO's state field which is only safe to check from the admin thread. + * + * Return: true if the VDO is read-only. + */ bool vdo_is_read_only(struct read_only_notifier *notifier) { return notifier->thread_data[vdo_get_callback_thread_id()].is_read_only; } -/**********************************************************************/ +/** + * vdo_is_or_will_be_read_only() - Check whether the VDO is or will be + * read-only. + * @notifier: The read-only notifier of the VDO. 
+ * + * The VDO will be read-only if some thread has started the process of + * entering read-only mode, but not all threads have been notified yet. + * + * This method should only be called in cases where the expense of reading + * atomic state is not a problem. It was introduced in order to allow + * suppresion of spurious error messages resulting from VIO cleanup racing + * with read-only notification. + * + * Return: true if the VDO has started (and possibly finished) + * the process of entering read-only mode. + */ bool vdo_is_or_will_be_read_only(struct read_only_notifier *notifier) { return (atomic_read(¬ifier->read_only_error) != VDO_SUCCESS); } -/**********************************************************************/ -int register_vdo_read_only_listener(struct read_only_notifier *notifier, +/** + * vdo_register_read_only_listener() - Register a listener to be notified when + * the VDO goes read-only. + * @notifier: The notifier to register with. + * @listener: The object to notify. + * @notification: The function to call to send the notification. + * @thread_id: The id of the thread on which to send the notification. + * + * Return: VDO_SUCCESS or an error. + */ +int vdo_register_read_only_listener(struct read_only_notifier *notifier, void *listener, vdo_read_only_notification *notification, thread_id_t thread_id) { struct thread_data *thread_data = ¬ifier->thread_data[thread_id]; struct read_only_listener *read_only_listener; - int result = UDS_ALLOCATE(1, - struct read_only_listener, - __func__, - &read_only_listener); + int result; + + result = ASSERT(thread_id != notifier->thread_config->dedupe_thread, + "read only listener not registered on dedupe thread"); + if (result != VDO_SUCCESS) { + return result; + } + + result = UDS_ALLOCATE(1, + struct read_only_listener, + __func__, + &read_only_listener); if (result != VDO_SUCCESS) { return result; } diff --git a/vdo/read-only-notifier.h b/vdo/read-only-notifier.h new file mode 100644 index 00000000..dcad2407 --- /dev/null +++ b/vdo/read-only-notifier.h @@ -0,0 +1,58 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright Red Hat + */ + +/* + * A read_only_notifier is responsible for propogating the fact that the VDO + * has encountered an unrecoverable error to all base threads. It also persists + * the read-only state to the super block. + * + * The notifier also provides the ability to wait for any notifications to be + * complete in order to not cause super block write races when shutting down + * the VDO. + */ + +#ifndef READ_ONLY_NOTIFIER_H +#define READ_ONLY_NOTIFIER_H + +#include "completion.h" + +/** + * typedef vdo_read_only_notification - A function to notify a listener that + * the VDO has gone read-only. + * @listener: The object to notify. + * @parent: The completion to notify in order to acknowledge the notification. 
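For illustration only, not part of this patch: vdo_register_read_only_listener() hangs a listener on the chain of the thread that should receive the notification, and make_thread_read_only() walks each thread's chain in turn. The standalone sketch below shows that per-thread fan-out shape with plain C types and synchronous calls; the real code notifies each listener with a parent completion to acknowledge, and whether registration prepends or appends is an assumption, since the tail of the registration function is not visible in this hunk.

#include <stdio.h>
#include <stdlib.h>

typedef void notification_fn(void *listener);

struct listener {
	void *object;			/* the object to notify */
	notification_fn *notify;	/* how to notify it */
	struct listener *next;
};

#define THREAD_COUNT 3

/* One chain of listeners per base thread. */
static struct listener *listeners[THREAD_COUNT];

static int register_listener(unsigned int thread_id, void *object,
			     notification_fn *notify)
{
	struct listener *entry = malloc(sizeof(*entry));

	if (entry == NULL)
		return -1;
	entry->object = object;
	entry->notify = notify;
	entry->next = listeners[thread_id];
	listeners[thread_id] = entry;
	return 0;
}

/* Walk one thread's chain, as the notification completion does per thread. */
static void notify_thread(unsigned int thread_id)
{
	struct listener *entry;

	for (entry = listeners[thread_id]; entry != NULL; entry = entry->next)
		entry->notify(entry->object);
}

/* Free a thread's chain, mirroring free_listeners() in the patch. */
static void release_listeners(unsigned int thread_id)
{
	struct listener *entry, *next;

	for (entry = listeners[thread_id]; entry != NULL; entry = next) {
		next = entry->next;
		free(entry);
	}
	listeners[thread_id] = NULL;
}

static void say_read_only(void *object)
{
	printf("%s is now read-only\n", (const char *) object);
}

int main(void)
{
	unsigned int id;

	register_listener(0, "journal", say_read_only);
	register_listener(2, "depot", say_read_only);
	for (id = 0; id < THREAD_COUNT; id++) {
		notify_thread(id);
		release_listeners(id);
	}
	return 0;
}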
+ */ +typedef void vdo_read_only_notification(void *listener, + struct vdo_completion *parent); + +int __must_check +vdo_make_read_only_notifier(bool is_read_only, + const struct thread_config *thread_config, + struct vdo *vdo, + struct read_only_notifier **notifier_ptr); + +void vdo_free_read_only_notifier(struct read_only_notifier *notifier); + +void +vdo_wait_until_not_entering_read_only_mode(struct read_only_notifier *notifier, + struct vdo_completion *parent); + +void vdo_allow_read_only_mode_entry(struct read_only_notifier *notifier, + struct vdo_completion *parent); + +void vdo_enter_read_only_mode(struct read_only_notifier *notifier, + int error_code); + +bool __must_check vdo_is_read_only(struct read_only_notifier *notifier); + +bool __must_check +vdo_is_or_will_be_read_only(struct read_only_notifier *notifier); + +int vdo_register_read_only_listener(struct read_only_notifier *notifier, + void *listener, + vdo_read_only_notification *notification, + thread_id_t thread_id); + +#endif /* READ_ONLY_NOTIFIER_H */ diff --git a/vdo/readOnlyRebuild.c b/vdo/read-only-rebuild.c similarity index 55% rename from vdo/readOnlyRebuild.c rename to vdo/read-only-rebuild.c index 30ee2f53..3466c499 100644 --- a/vdo/readOnlyRebuild.c +++ b/vdo/read-only-rebuild.c @@ -1,42 +1,27 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * Copyright Red Hat - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/sulfur/src/c++/vdo/base/readOnlyRebuild.c#34 $ */ -#include "readOnlyRebuild.h" +#include "read-only-rebuild.h" #include "logger.h" -#include "memoryAlloc.h" +#include "memory-alloc.h" -#include "blockMapInternals.h" -#include "blockMapRecovery.h" +#include "block-map.h" +#include "block-map-recovery.h" #include "completion.h" -#include "numUtils.h" -#include "packedRecoveryJournalBlock.h" -#include "recoveryJournalInternals.h" -#include "recoveryUtils.h" -#include "referenceCountRebuild.h" -#include "slabDepot.h" -#include "vdoComponent.h" -#include "vdoComponentStates.h" -#include "vdoInternal.h" -#include "vdoPageCache.h" +#include "num-utils.h" +#include "packed-recovery-journal-block.h" +#include "recovery-journal.h" +#include "recovery-utils.h" +#include "reference-count-rebuild.h" +#include "slab-depot.h" +#include "thread-config.h" +#include "vdo.h" +#include "vdo-component.h" +#include "vdo-component-states.h" +#include "vdo-page-cache.h" struct read_only_rebuild_completion { /** The completion header */ @@ -68,25 +53,25 @@ struct read_only_rebuild_completion { }; /** - * Convert a generic completion to a read_only_rebuild_completion. - * - * @param completion The completion to convert + * as_read_only_rebuild_completion() - Convert a generic completion to + * a read_only_rebuild_completion. + * @completion: The completion to convert. 
* - * @return the journal rebuild completion - **/ + * Return: The journal rebuild completion. + */ static inline struct read_only_rebuild_completion * __must_check as_read_only_rebuild_completion(struct vdo_completion *completion) { - assert_vdo_completion_type(completion->type, + vdo_assert_completion_type(completion->type, VDO_READ_ONLY_REBUILD_COMPLETION); return container_of(completion, struct read_only_rebuild_completion, completion); } /** - * Free a rebuild completion and all underlying structures. - * - * @param rebuild The rebuild completion to free + * free_rebuild_completion() - Free a rebuild completion and all underlying + * structures. + * @rebuild: The rebuild completion to free. */ static void free_rebuild_completion(struct read_only_rebuild_completion *rebuild) @@ -101,13 +86,13 @@ free_rebuild_completion(struct read_only_rebuild_completion *rebuild) } /** - * Allocate and initialize a read only rebuild completion. + * make_rebuild_completion() - Allocate and initialize a read only rebuild + * completion. + * @vdo: The vdo in question. + * @rebuild_ptr: A pointer to return the created rebuild completion. * - * @param [in] vdo The vdo in question - * @param [out] rebuild_ptr A pointer to return the created rebuild completion - * - * @return VDO_SUCCESS or an error code - **/ + * Return: VDO_SUCCESS or an error code. + */ static int make_rebuild_completion(struct vdo *vdo, struct read_only_rebuild_completion **rebuild_ptr) @@ -119,9 +104,9 @@ make_rebuild_completion(struct vdo *vdo, return result; } - initialize_vdo_completion(&rebuild->completion, vdo, + vdo_initialize_completion(&rebuild->completion, vdo, VDO_READ_ONLY_REBUILD_COMPLETION); - initialize_vdo_completion(&rebuild->sub_task_completion, vdo, + vdo_initialize_completion(&rebuild->sub_task_completion, vdo, VDO_SUB_TASK_COMPLETION); rebuild->vdo = vdo; @@ -130,11 +115,12 @@ make_rebuild_completion(struct vdo *vdo, } /** - * Clean up the rebuild process, whether or not it succeeded, by freeing the - * rebuild completion and notifying the parent of the outcome. + * complete_rebuild() - Clean up the rebuild process. + * @completion: The rebuild completion. * - * @param completion The rebuild completion - **/ + * Cleans up the rebuild process, whether or not it succeeded, by freeing the + * rebuild completion and notifying the parent of the outcome. + */ static void complete_rebuild(struct vdo_completion *completion) { struct vdo_completion *parent = completion->parent; @@ -143,22 +129,23 @@ static void complete_rebuild(struct vdo_completion *completion) as_read_only_rebuild_completion(UDS_FORGET(completion)); struct block_map *block_map = rebuild->vdo->block_map; - set_vdo_page_cache_rebuild_mode(block_map->zones[0].page_cache, false); + vdo_set_page_cache_rebuild_mode(block_map->zones[0].page_cache, false); free_rebuild_completion(UDS_FORGET(rebuild)); - finish_vdo_completion(parent, result); + vdo_finish_completion(parent, result); } /** - * Finish rebuilding, free the rebuild completion and notify the parent. - * - * @param completion The rebuild completion - **/ + * finish_rebuild() - Finish rebuilding, free the rebuild completion and + * notify the parent. + * @completion: The rebuild completion. 
+ */ static void finish_rebuild(struct vdo_completion *completion) { struct read_only_rebuild_completion *rebuild = as_read_only_rebuild_completion(completion); struct vdo *vdo = rebuild->vdo; - initialize_vdo_recovery_journal_post_rebuild(vdo->recovery_journal, + + vdo_initialize_recovery_journal_post_rebuild(vdo->recovery_journal, vdo->states.vdo.complete_recoveries, rebuild->tail, rebuild->logical_blocks_used, @@ -168,10 +155,9 @@ static void finish_rebuild(struct vdo_completion *completion) } /** - * Handle a rebuild error. - * - * @param completion The rebuild completion - **/ + * abort_rebuild() - Handle a rebuild error. + * @completion: The rebuild completion. + */ static void abort_rebuild(struct vdo_completion *completion) { uds_log_info("Read-only rebuild aborted"); @@ -179,13 +165,12 @@ static void abort_rebuild(struct vdo_completion *completion) } /** - * Abort a rebuild if there is an error. - * - * @param result The result to check - * @param rebuild The journal rebuild completion + * abort_rebuild_on_error() - Abort a rebuild if there is an error. + * @result: The result to check. + * @rebuild: The journal rebuild completion. * - * @return true if the result was an error - **/ + * Return: true if the result was an error. + */ static bool __must_check abort_rebuild_on_error(int result, struct read_only_rebuild_completion *rebuild) @@ -194,91 +179,98 @@ abort_rebuild_on_error(int result, return false; } - finish_vdo_completion(&rebuild->completion, result); + vdo_finish_completion(&rebuild->completion, result); return true; } /** - * Clean up after finishing the reference count rebuild. This callback is - * registered in launch_reference_count_rebuild(). + * finish_reference_count_rebuild() - Clean up after finishing the reference + * count rebuild. + * @completion: The sub-task completion. * - * @param completion The sub-task completion - **/ + * This callback is registered in launch_reference_count_rebuild(). + */ static void finish_reference_count_rebuild(struct vdo_completion *completion) { struct read_only_rebuild_completion *rebuild = completion->parent; struct vdo *vdo = rebuild->vdo; - assert_on_admin_thread(vdo, __func__); + + vdo_assert_on_admin_thread(vdo, __func__); if (vdo->load_state != VDO_REBUILD_FOR_UPGRADE) { - // A "rebuild" for upgrade should not increment this count. + /* A "rebuild" for upgrade should not increment this count. */ vdo->states.vdo.complete_recoveries++; } uds_log_info("Saving rebuilt state"); - prepare_vdo_completion_to_finish_parent(completion, &rebuild->completion); - drain_vdo_slab_depot(vdo->depot, VDO_ADMIN_STATE_REBUILDING, completion); + vdo_prepare_completion_to_finish_parent(completion, &rebuild->completion); + vdo_drain_slab_depot(vdo->depot, VDO_ADMIN_STATE_REBUILDING, completion); } /** - * Rebuild the reference counts from the block map now that all journal entries - * have been applied to the block map. This callback is registered in - * apply_journal_entries(). + * launch_reference_count_rebuild() - Rebuild the reference counts from the + * block map now that all journal entries + * have been applied to the block map. + * @completion: The sub-task completion. * - * @param completion The sub-task completion - **/ + * This callback is registered in apply_journal_entries(). + */ static void launch_reference_count_rebuild(struct vdo_completion *completion) { struct read_only_rebuild_completion *rebuild = completion->parent; struct vdo *vdo = rebuild->vdo; - // We must allocate ref_counts before we can rebuild them. 
+ /* We must allocate ref_counts before we can rebuild them. */ int result = vdo_allocate_slab_ref_counts(vdo->depot); + if (abort_rebuild_on_error(result, rebuild)) { return; } - prepare_vdo_completion(completion, + vdo_prepare_completion(completion, finish_reference_count_rebuild, - finish_vdo_completion_parent_callback, + vdo_finish_completion_parent_callback, vdo->thread_config->admin_thread, completion->parent); - rebuild_vdo_reference_counts(vdo, + vdo_rebuild_reference_counts(vdo, completion, &rebuild->logical_blocks_used, &rebuild->block_map_data_blocks); } /** - * Append an array of recovery journal entries from a journal block sector to - * the array of numbered mappings in the rebuild completion, numbering each - * entry in the order they are appended. - * - * @param rebuild The journal rebuild completion - * @param sector The recovery journal sector with entries - * @param entry_count The number of entries to append - **/ + * append_sector_entries() - Append an array of recovery journal entries from + * a journal block sector to the array of numbered + * mappings in the rebuild completion, numbering + * each entry in the order they are appended. + * @rebuild: The journal rebuild completion. + * @sector: The recovery journal sector with entries. + * @entry_count: The number of entries to append. + */ static void append_sector_entries(struct read_only_rebuild_completion *rebuild, struct packed_journal_sector *sector, journal_entry_count_t entry_count) { journal_entry_count_t i; + for (i = 0; i < entry_count; i++) { struct recovery_journal_entry entry = - unpack_vdo_recovery_journal_entry(§or->entries[i]); - int result = validate_vdo_recovery_journal_entry(rebuild->vdo, + vdo_unpack_recovery_journal_entry(§or->entries[i]); + int result = vdo_validate_recovery_journal_entry(rebuild->vdo, &entry); if (result != VDO_SUCCESS) { - // When recovering from read-only mode, ignore damaged - // entries. + /* + * When recovering from read-only mode, ignore damaged + * entries. + */ continue; } - if (is_vdo_journal_increment_operation(entry.operation)) { + if (vdo_is_journal_increment_operation(entry.operation)) { rebuild->entries[rebuild->entry_count] = (struct numbered_block_mapping) { .block_map_slot = entry.slot, .block_map_entry = - pack_vdo_pbn(entry.mapping.pbn, + vdo_pack_pbn(entry.mapping.pbn, entry.mapping.state), .number = rebuild->entry_count, }; @@ -288,13 +280,13 @@ static void append_sector_entries(struct read_only_rebuild_completion *rebuild, } /** - * Create an array of all valid journal entries, in order, and store - * it in the rebuild completion. - * - * @param rebuild The journal rebuild completion + * extract_journal_entries() - Create an array of all valid journal entries, + * in order, and store it in the rebuild + * completion. + * @rebuild: The journal rebuild completion. * - * @return VDO_SUCCESS or an error code - **/ + * Return: VDO_SUCCESS or an error code. 
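For illustration only, not part of this patch: append_sector_entries() above tags each extracted mapping with a monotonically increasing .number recording journal order. How vdo_recover_block_map() consumes those numbers is not shown in this hunk; the sketch below simply shows the standard reason a replay keeps such a sequence number: once entries are grouped by block-map slot, the highest-numbered (latest) mapping for each slot is the one that must win.

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

struct mapping {
	uint32_t slot;		/* block map slot being updated */
	uint64_t value;		/* new mapping for that slot */
	uint32_t number;	/* order of appearance in the journal */
};

static int by_slot_then_number(const void *a, const void *b)
{
	const struct mapping *m1 = a, *m2 = b;

	if (m1->slot != m2->slot)
		return (m1->slot < m2->slot) ? -1 : 1;
	return (m1->number < m2->number) ? -1 : (m1->number > m2->number);
}

int main(void)
{
	/* Two updates to slot 7; the later one (number 2) must win. */
	struct mapping entries[] = {
		{ .slot = 7, .value = 100, .number = 0 },
		{ .slot = 3, .value = 200, .number = 1 },
		{ .slot = 7, .value = 300, .number = 2 },
	};
	size_t count = sizeof(entries) / sizeof(entries[0]);
	size_t i;

	qsort(entries, count, sizeof(entries[0]), by_slot_then_number);
	for (i = 0; i < count; i++) {
		/* Only apply the last (highest-numbered) entry per slot. */
		if (i + 1 < count && entries[i + 1].slot == entries[i].slot)
			continue;
		printf("slot %u -> %llu\n", (unsigned) entries[i].slot,
		       (unsigned long long) entries[i].value);
	}
	return 0;	/* prints "slot 3 -> 200" and "slot 7 -> 300" */
}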
+ */ static int extract_journal_entries(struct read_only_rebuild_completion *rebuild) { sequence_number_t i; @@ -321,53 +313,61 @@ static int extract_journal_entries(struct read_only_rebuild_completion *rebuild) for (i = first; i <= last; i++) { struct packed_journal_header *packed_header = - get_vdo_recovery_journal_block_header(journal, + vdo_get_recovery_journal_block_header(journal, rebuild->journal_data, i); struct recovery_block_header header; journal_entry_count_t block_entries; uint8_t j; - unpack_vdo_recovery_block_header(packed_header, &header); + vdo_unpack_recovery_block_header(packed_header, &header); - if (!is_exact_vdo_recovery_journal_block(journal, &header, i)) { - // This block is invalid, so skip it. + if (!vdo_is_exact_recovery_journal_block(journal, &header, i)) { + /* This block is invalid, so skip it. */ continue; } - // Don't extract more than the expected maximum entries per - // block. + /* + * Don't extract more than the expected maximum entries per + * block. + */ block_entries = min(journal->entries_per_block, header.entry_count); for (j = 1; j < VDO_SECTORS_PER_BLOCK; j++) { journal_entry_count_t sector_entries; struct packed_journal_sector *sector = - get_vdo_journal_block_sector(packed_header, j); - // Stop when all entries counted in the header are - // applied or skipped. + vdo_get_journal_block_sector(packed_header, j); + /* + * Stop when all entries counted in the header are + * applied or skipped. + */ if (block_entries == 0) { break; } - if (!is_valid_vdo_recovery_journal_sector(&header, sector)) { + if (!vdo_is_valid_recovery_journal_sector(&header, sector)) { block_entries -= min(block_entries, (journal_entry_count_t) RECOVERY_JOURNAL_ENTRIES_PER_SECTOR); continue; } - // Don't extract more than the expected maximum entries - // per sector. + /* + * Don't extract more than the expected maximum entries + * per sector. + */ sector_entries = min(sector->entry_count, (uint8_t) RECOVERY_JOURNAL_ENTRIES_PER_SECTOR); - // Only extract as many as the block header calls for. + /* Only extract as many as the block header calls for. */ sector_entries = min(sector_entries, block_entries); append_sector_entries(rebuild, sector, sector_entries); - // Even if the sector wasn't full, count it as full - // when counting up to the entry count the block - // header claims. + /* + * Even if the sector wasn't full, count it as full + * when counting up to the entry count the block + * header claims. + */ block_entries -= min(block_entries, (journal_entry_count_t) RECOVERY_JOURNAL_ENTRIES_PER_SECTOR); @@ -378,12 +378,13 @@ static int extract_journal_entries(struct read_only_rebuild_completion *rebuild) } /** - * Determine the limits of the valid recovery journal and apply all - * valid entries to the block map. This callback is registered in - * load_journal_callback(). + * apply_journal_entries() - Determine the limits of the valid recovery + * journal and apply all valid entries to the block + * map. + * @completion: The sub-task completion. * - * @param completion The sub-task completion - **/ + * This callback is registered in load_journal_callback(). 
+ */ static void apply_journal_entries(struct vdo_completion *completion) { bool found_entries; @@ -393,64 +394,74 @@ static void apply_journal_entries(struct vdo_completion *completion) struct vdo *vdo = rebuild->vdo; uds_log_info("Finished reading recovery journal"); - assert_on_logical_zone_thread(vdo, 0, __func__); + vdo_assert_on_logical_zone_thread(vdo, 0, __func__); found_entries = - find_vdo_recovery_journal_head_and_tail(vdo->recovery_journal, + vdo_find_recovery_journal_head_and_tail(vdo->recovery_journal, rebuild->journal_data, &rebuild->tail, &rebuild->head, NULL); if (found_entries) { int result = extract_journal_entries(rebuild); + if (abort_rebuild_on_error(result, rebuild)) { return; } } - // Suppress block map errors. - set_vdo_page_cache_rebuild_mode(get_block_map(vdo)->zones[0].page_cache, + /* Suppress block map errors. */ + vdo_set_page_cache_rebuild_mode(vdo->block_map->zones[0].page_cache, true); - // Play the recovery journal into the block map. - prepare_vdo_completion(completion, + /* Play the recovery journal into the block map. */ + vdo_prepare_completion(completion, launch_reference_count_rebuild, - finish_vdo_completion_parent_callback, + vdo_finish_completion_parent_callback, completion->callback_thread_id, completion->parent); - recover_vdo_block_map(vdo, rebuild->entry_count, rebuild->entries, + vdo_recover_block_map(vdo, rebuild->entry_count, rebuild->entries, completion); } /** - * Begin loading the journal. - * - * @param completion The sub task completion - **/ + * load_journal_callback() - Begin loading the journal. + * @completion: The sub task completion. + */ static void load_journal_callback(struct vdo_completion *completion) { struct read_only_rebuild_completion *rebuild = as_read_only_rebuild_completion(completion->parent); struct vdo *vdo = rebuild->vdo; - assert_on_logical_zone_thread(vdo, 0, __func__); - prepare_vdo_completion(completion, + vdo_assert_on_logical_zone_thread(vdo, 0, __func__); + + vdo_prepare_completion(completion, apply_journal_entries, - finish_vdo_completion_parent_callback, + vdo_finish_completion_parent_callback, completion->callback_thread_id, completion->parent); - load_vdo_recovery_journal(vdo->recovery_journal, completion, + vdo_load_recovery_journal(vdo->recovery_journal, completion, &rebuild->journal_data); } -/**********************************************************************/ -void launch_vdo_rebuild(struct vdo *vdo, struct vdo_completion *parent) +/** + * vdo_launch_rebuild() - Construct a read_only_rebuild_completion and launch + * it. + * @vdo: The vdo to rebuild. + * @parent: The completion to notify when the rebuild is complete. + * + * Apply all valid journal block entries to all vdo structures. + * + * Context: Must be launched from logical zone 0. + */ +void vdo_launch_rebuild(struct vdo *vdo, struct vdo_completion *parent) { struct read_only_rebuild_completion *rebuild; struct vdo_completion *completion, *sub_task_completion; int result; - // Note: These messages must be recognizable by Permabit::VDODeviceBase. + /* Note: These messages must be recognizable by Permabit::VDODeviceBase. 
*/ if (vdo->load_state == VDO_REBUILD_FOR_UPGRADE) { uds_log_warning("Rebuilding reference counts for upgrade"); } else { @@ -460,24 +471,24 @@ void launch_vdo_rebuild(struct vdo *vdo, struct vdo_completion *parent) result = make_rebuild_completion(vdo, &rebuild); if (result != VDO_SUCCESS) { - finish_vdo_completion(parent, result); + vdo_finish_completion(parent, result); return; } completion = &rebuild->completion; - prepare_vdo_completion(completion, + vdo_prepare_completion(completion, finish_rebuild, abort_rebuild, parent->callback_thread_id, parent); sub_task_completion = &rebuild->sub_task_completion; - prepare_vdo_completion(sub_task_completion, + vdo_prepare_completion(sub_task_completion, load_journal_callback, - finish_vdo_completion_parent_callback, + vdo_finish_completion_parent_callback, vdo_get_logical_zone_thread(vdo->thread_config, 0), completion); - load_vdo_slab_depot(vdo->depot, + vdo_load_slab_depot(vdo->depot, VDO_ADMIN_STATE_LOADING_FOR_REBUILD, sub_task_completion, NULL); diff --git a/vdo/read-only-rebuild.h b/vdo/read-only-rebuild.h new file mode 100644 index 00000000..e2b84220 --- /dev/null +++ b/vdo/read-only-rebuild.h @@ -0,0 +1,14 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright Red Hat + */ + +#ifndef READ_ONLY_REBUILD_H +#define READ_ONLY_REBUILD_H + +#include "completion.h" +#include "vdo.h" + +void vdo_launch_rebuild(struct vdo *vdo, struct vdo_completion *parent); + +#endif /* READ_ONLY_REBUILD_H */ diff --git a/vdo/readOnlyNotifier.h b/vdo/readOnlyNotifier.h deleted file mode 100644 index bcfbdcbe..00000000 --- a/vdo/readOnlyNotifier.h +++ /dev/null @@ -1,152 +0,0 @@ -/* - * Copyright Red Hat - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/sulfur/src/c++/vdo/base/readOnlyNotifier.h#7 $ - */ - -/* - * A read_only_notifier is responsible for propogating the fact that the VDO - * has encountered an unrecoverable error to all base threads. It also persists - * the read-only state to the super block. - * - * The notifier also provides the ability to wait for any notifications to be - * complete in order to not cause super block write races when shutting down - * the VDO. - */ - -#ifndef READ_ONLY_NOTIFIER_H -#define READ_ONLY_NOTIFIER_H - -#include "completion.h" - -/** - * A function to notify a listener that the VDO has gone read-only. - * - * @param listener The object to notify - * @param parent The completion to notify in order to acknowledge the - * notification - **/ -typedef void vdo_read_only_notification(void *listener, - struct vdo_completion *parent); - -/** - * Create a read-only notifer. 
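For illustration only, not part of this patch: the rebuild above drives its steps, load the journal, apply its entries to the block map, rebuild reference counts, save, by repeatedly re-preparing one completion with the next callback and an error handler before launching it. The standalone sketch below reproduces that shape with plain function pointers; the real vdo_completion also carries a parent, a result code, and the thread on which each callback runs, and each step hands the completion to an asynchronous load rather than invoking it directly as this sketch does.

#include <stdio.h>

struct step_completion {
	void (*callback)(struct step_completion *);
	void (*error_handler)(struct step_completion *);
	int result;
};

static void prepare(struct step_completion *completion,
		    void (*callback)(struct step_completion *),
		    void (*error_handler)(struct step_completion *))
{
	completion->callback = callback;
	completion->error_handler = error_handler;
}

static void invoke(struct step_completion *completion)
{
	if (completion->result != 0)
		completion->error_handler(completion);
	else
		completion->callback(completion);
}

static void abort_rebuild_step(struct step_completion *completion)
{
	printf("rebuild aborted: %d\n", completion->result);
}

static void finish_rebuild_step(struct step_completion *completion)
{
	printf("rebuild complete\n");
}

static void rebuild_references(struct step_completion *completion)
{
	printf("rebuilding reference counts\n");
	prepare(completion, finish_rebuild_step, abort_rebuild_step);
	invoke(completion);
}

static void apply_journal(struct step_completion *completion)
{
	printf("applying journal entries to the block map\n");
	prepare(completion, rebuild_references, abort_rebuild_step);
	invoke(completion);
}

static void load_journal(struct step_completion *completion)
{
	printf("loading the recovery journal\n");
	prepare(completion, apply_journal, abort_rebuild_step);
	invoke(completion);
}

int main(void)
{
	struct step_completion completion = { .result = 0 };

	prepare(&completion, load_journal, abort_rebuild_step);
	invoke(&completion);
	return 0;
}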
- * - * @param [in] is_read_only Whether the VDO is already read-only - * @param [in] thread_config The thread configuration of the VDO - * @param [in] vdo The VDO - * @param [out] notifier_ptr A pointer to receive the new notifier - * - * @return VDO_SUCCESS or an error - **/ -int __must_check -make_vdo_read_only_notifier(bool is_read_only, - const struct thread_config *thread_config, - struct vdo *vdo, - struct read_only_notifier **notifier_ptr); - -/** - * Free a read_only_notifier. - * - * @param notifier The notifier to free - **/ -void free_vdo_read_only_notifier(struct read_only_notifier *notifier); - -/** - * Wait until no read-only notifications are in progress and prevent any - * subsequent notifications. Notifications may be re-enabled by calling - * vdo_allow_read_only_mode_entry(). - * - * @param notifier The read-only notifier on which to wait - * @param parent The completion to notify when no threads are entering - * read-only mode - **/ -void -vdo_wait_until_not_entering_read_only_mode(struct read_only_notifier *notifier, - struct vdo_completion *parent); - -/** - * Allow the notifier to put the VDO into read-only mode, reversing the effects - * of vdo_wait_until_not_entering_read_only_mode(). If some thread tried to put - * the VDO into read-only mode while notifications were disallowed, it will be - * done when this method is called. If that happens, the parent will not be - * notified until the VDO has actually entered read-only mode and attempted to - * save the super block. - * - *
This method may only be called from the admin thread. - * - * @param notifier The notifier - * @param parent The object to notify once the operation is complete - **/ -void vdo_allow_read_only_mode_entry(struct read_only_notifier *notifier, - struct vdo_completion *parent); - -/** - * Put a VDO into read-only mode and save the read-only state in the super - * block. This method is a no-op if the VDO is already read-only. - * - * @param notifier The read-only notifier of the VDO - * @param error_code The error which caused the VDO to enter read-only - * mode - **/ -void vdo_enter_read_only_mode(struct read_only_notifier *notifier, - int error_code); - -/** - * Check whether the VDO is read-only. This method may be called from any - * thread, as opposed to examining the VDO's state field which is only safe - * to check from the admin thread. - * - * @param notifier The read-only notifier of the VDO - * - * @return true if the VDO is read-only - **/ -bool __must_check vdo_is_read_only(struct read_only_notifier *notifier); - -/** - * Check whether the VDO is or will be read-only (i.e. some thread has started - * the process of entering read-only mode, but not all threads have been - * notified yet). This method should only be called in cases where the expense - * of reading atomic state is not a problem. It was introduced in order to allow - * suppresion of spurious error messages resulting from VIO cleanup racing with - * read-only notification. - * - * @param notifier The read-only notifier of the VDO - * - * @return true if the VDO has started (and possibly finished) - * the process of entering read-only mode - **/ -bool __must_check -vdo_is_or_will_be_read_only(struct read_only_notifier *notifier); - -/** - * Register a listener to be notified when the VDO goes read-only. - * - * @param notifier The notifier to register with - * @param listener The object to notify - * @param notification The function to call to send the notification - * @param thread_id The id of the thread on which to send the notification - * - * @return VDO_SUCCESS or an error - **/ -int register_vdo_read_only_listener(struct read_only_notifier *notifier, - void *listener, - vdo_read_only_notification *notification, - thread_id_t thread_id); - -#endif /* READ_ONLY_NOTIFIER_H */ diff --git a/vdo/readOnlyRebuild.h b/vdo/readOnlyRebuild.h deleted file mode 100644 index fd526feb..00000000 --- a/vdo/readOnlyRebuild.h +++ /dev/null @@ -1,38 +0,0 @@ -/* - * Copyright Red Hat - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/sulfur/src/c++/vdo/base/readOnlyRebuild.h#7 $ - */ - -#ifndef READ_ONLY_REBUILD_H -#define READ_ONLY_REBUILD_H - -#include "completion.h" -#include "vdo.h" - -/** - * Construct a read_only_rebuild_completion and launch it. Apply all valid - * journal block entries to all vdo structures. 
Must be launched from logical - * zone 0. - * - * @param vdo The vdo to rebuild - * @param parent The completion to notify when the rebuild is complete - **/ -void launch_vdo_rebuild(struct vdo *vdo, struct vdo_completion *parent); - -#endif // READ_ONLY_REBUILD_H diff --git a/uds/recordPage.c b/vdo/record-page.c similarity index 56% rename from uds/recordPage.c rename to vdo/record-page.c index cd74ad34..be3f4a77 100644 --- a/uds/recordPage.c +++ b/vdo/record-page.c @@ -1,29 +1,12 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * Copyright Red Hat - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/uds-releases/krusty/src/uds/recordPage.c#15 $ */ -#include "recordPage.h" +#include "record-page.h" #include "permassert.h" -/**********************************************************************/ static unsigned int encode_tree(byte record_page[], const struct uds_chunk_record *sorted_pointers[], @@ -33,14 +16,17 @@ encode_tree(byte record_page[], { if (node < node_count) { unsigned int child = (2 * node) + 1; + next_record = encode_tree(record_page, sorted_pointers, next_record, child, node_count); - // In-order traversal: copy the contents of the next record - // into the page at the node offset. + /* + * In-order traversal: copy the contents of the next record + * into the page at the node offset. + */ memcpy(&record_page[node * BYTES_PER_RECORD], sorted_pointers[next_record], BYTES_PER_RECORD); @@ -55,7 +41,6 @@ encode_tree(byte record_page[], return next_record; } -/**********************************************************************/ int encode_record_page(const struct volume *volume, const struct uds_chunk_record records[], byte record_page[]) @@ -65,10 +50,13 @@ int encode_record_page(const struct volume *volume, const struct uds_chunk_record **record_pointers = volume->record_pointers; - // Build an array of record pointers. We'll sort the pointers by the - // block names in the records, which is less work than sorting the - // record values. + /* + * Build an array of record pointers. We'll sort the pointers by the + * block names in the records, which is less work than sorting the + * record values. + */ unsigned int i; + for (i = 0; i < records_per_page; i++) { record_pointers[i] = &records[i]; } @@ -82,36 +70,43 @@ int encode_record_page(const struct volume *volume, return result; } - // Use the sorted pointers to copy the records from the chapter to the - // record page in tree order. + /* + * Use the sorted pointers to copy the records from the chapter to the + * record page in tree order. 
+ */ encode_tree(record_page, record_pointers, 0, 0, records_per_page); return UDS_SUCCESS; } -/**********************************************************************/ bool search_record_page(const byte record_page[], const struct uds_chunk_name *name, const struct geometry *geometry, struct uds_chunk_data *metadata) { - // The record page is just an array of chunk records. + /* The record page is just an array of chunk records. */ const struct uds_chunk_record *records = (const struct uds_chunk_record *) record_page; - // The array of records is sorted by name and stored as a binary tree - // in heap order, so the root of the tree is the first array element. + /* + * The array of records is sorted by name and stored as a binary tree + * in heap order, so the root of the tree is the first array element. + */ unsigned int node = 0; + while (node < geometry->records_per_page) { const struct uds_chunk_record *record = &records[node]; int result = memcmp(name, &record->name, UDS_CHUNK_NAME_SIZE); + if (result == 0) { if (metadata != NULL) { *metadata = record->data; } return true; } - // The children of node N are in the heap at indexes 2N+1 and - // 2N+2. + /* + * The children of node N are in the heap at indexes 2N+1 and + * 2N+2. + */ node = ((2 * node) + ((result < 0) ? 1 : 2)); } return false; diff --git a/uds/recordPage.h b/vdo/record-page.h similarity index 58% rename from uds/recordPage.h rename to vdo/record-page.h index b1dce9a7..1342f593 100644 --- a/uds/recordPage.h +++ b/vdo/record-page.h @@ -1,22 +1,6 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ /* * Copyright Red Hat - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/uds-releases/krusty/src/uds/recordPage.h#9 $ */ #ifndef RECORDPAGE_H diff --git a/vdo/recoveryJournalBlock.c b/vdo/recovery-journal-block.c similarity index 54% rename from vdo/recoveryJournalBlock.c rename to vdo/recovery-journal-block.c index 6af69043..9ee93887 100644 --- a/vdo/recoveryJournalBlock.c +++ b/vdo/recovery-journal-block.c @@ -1,64 +1,62 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * Copyright Red Hat - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. 
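The heap-order layout makes the lookup above a simple index walk: the root lives at index 0 and the children of node N sit at 2N+1 (keys that compare lower) and 2N+2 (keys that compare higher). A standalone sketch of the same search over an array of integers, illustrative only and not VDO code:

static bool heap_order_search(const int keys[], unsigned int count,
                              int wanted)
{
        unsigned int node = 0;

        while (node < count) {
                if (keys[node] == wanted)
                        return true;
                /* Descend left for smaller keys, right for larger ones. */
                node = (2 * node) + ((wanted < keys[node]) ? 1 : 2);
        }
        return false;
}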
- * - * $Id: //eng/vdo-releases/sulfur/src/c++/vdo/base/recoveryJournalBlock.c#28 $ */ -#include "recoveryJournalBlock.h" +#include "recovery-journal-block.h" + +#include #include "logger.h" -#include "memoryAlloc.h" +#include "memory-alloc.h" #include "permassert.h" -#include "dataVIO.h" -#include "fixedLayout.h" -#include "packedRecoveryJournalBlock.h" -#include "recoveryJournalEntry.h" -#include "recoveryJournalInternals.h" +#include "data-vio.h" +#include "io-submitter.h" +#include "packed-recovery-journal-block.h" +#include "recovery-journal-entry.h" +#include "recovery-journal.h" +#include "vdo-layout.h" #include "vio.h" -#include "waitQueue.h" +#include "wait-queue.h" -/**********************************************************************/ -int make_vdo_recovery_block(struct vdo *vdo, +/** + * vdo_make_recovery_block() - Construct a journal block. + * @vdo: The vdo from which to construct vios. + * @journal: The journal to which the block will belong. + * @block_ptr: A pointer to receive the new block. + * + * Return: VDO_SUCCESS or an error. + */ +int vdo_make_recovery_block(struct vdo *vdo, struct recovery_journal *journal, struct recovery_journal_block **block_ptr) { struct recovery_journal_block *block; int result; - // Ensure that a block is large enough to store - // RECOVERY_JOURNAL_ENTRIES_PER_BLOCK entries. + /* + * Ensure that a block is large enough to store + * RECOVERY_JOURNAL_ENTRIES_PER_BLOCK entries. + */ STATIC_ASSERT(RECOVERY_JOURNAL_ENTRIES_PER_BLOCK - <= ((VDO_BLOCK_SIZE - - sizeof(struct packed_journal_header)) - / sizeof(struct packed_recovery_journal_entry))); + <= ((VDO_BLOCK_SIZE - + sizeof(struct packed_journal_header)) / + sizeof(struct packed_recovery_journal_entry))); result = UDS_ALLOCATE(1, struct recovery_journal_block, __func__, &block); if (result != VDO_SUCCESS) { return result; } - // Allocate a full block for the journal block even though not all of - // the space is used since the VIO needs to write a full disk block. + /* + * Allocate a full block for the journal block even though not all of + * the space is used since the VIO needs to write a full disk block. + */ result = UDS_ALLOCATE(VDO_BLOCK_SIZE, char, "PackedJournalBlock", &block->block); if (result != VDO_SUCCESS) { - free_vdo_recovery_block(block); + vdo_free_recovery_block(block); return result; } @@ -69,7 +67,7 @@ int make_vdo_recovery_block(struct vdo *vdo, block->block, &block->vio); if (result != VDO_SUCCESS) { - free_vdo_recovery_block(block); + vdo_free_recovery_block(block); return result; } @@ -81,8 +79,11 @@ int make_vdo_recovery_block(struct vdo *vdo, return VDO_SUCCESS; } -/**********************************************************************/ -void free_vdo_recovery_block(struct recovery_journal_block *block) +/** + * vdo_free_recovery_block() - Free a tail block. + * @block: The tail block to free. + */ +void vdo_free_recovery_block(struct recovery_journal_block *block) { if (block == NULL) { return; @@ -94,12 +95,12 @@ void free_vdo_recovery_block(struct recovery_journal_block *block) } /** - * Get a pointer to the packed journal block header in the block buffer. + * get_block_header() - Get a pointer to the packed journal block + * header in the block buffer. + * @block: The recovery block. * - * @param block The recovery block - * - * @return The block's header - **/ + * Return: The block's header. 
+ */ static inline struct packed_journal_header * get_block_header(const struct recovery_journal_block *block) { @@ -107,11 +108,11 @@ get_block_header(const struct recovery_journal_block *block) } /** - * Set the current sector of the current block and initialize it. - * - * @param block The block to update - * @param sector A pointer to the first byte of the new sector - **/ + * set_active_sector() - Set the current sector of the current block + * and initialize it. + * @block: The block to update. + * @sector: A pointer to the first byte of the new sector. + */ static void set_active_sector(struct recovery_journal_block *block, void *sector) { @@ -121,8 +122,12 @@ static void set_active_sector(struct recovery_journal_block *block, block->sector->entry_count = 0; } -/**********************************************************************/ -void initialize_vdo_recovery_block(struct recovery_journal_block *block) +/** + * vdo_initialize_recovery_block() - Initialize the next active + * recovery journal block. + * @block: The journal block to initialize. + */ +void vdo_initialize_recovery_block(struct recovery_journal_block *block) { struct recovery_journal *journal = block->journal; struct recovery_block_header unpacked = { @@ -132,7 +137,7 @@ void initialize_vdo_recovery_block(struct recovery_journal_block *block) .nonce = journal->nonce, .recovery_count = journal->recovery_count, .sequence_number = journal->tail, - .check_byte = compute_vdo_recovery_journal_check_byte(journal, + .check_byte = vdo_compute_recovery_journal_check_byte(journal, journal->tail), }; struct packed_journal_header *header = get_block_header(block); @@ -143,24 +148,40 @@ void initialize_vdo_recovery_block(struct recovery_journal_block *block) block->uncommitted_entry_count = 0; block->block_number = - get_vdo_recovery_journal_block_number(journal, journal->tail); + vdo_get_recovery_journal_block_number(journal, journal->tail); - pack_vdo_recovery_block_header(&unpacked, header); + vdo_pack_recovery_block_header(&unpacked, header); - set_active_sector(block, get_vdo_journal_block_sector(header, 1)); + set_active_sector(block, vdo_get_journal_block_sector(header, 1)); } -/**********************************************************************/ -int enqueue_vdo_recovery_block_entry(struct recovery_journal_block *block, +/** + * vdo_enqueue_recovery_block_entry() - Enqueue a data_vio to + * asynchronously encode and + * commit its next recovery + * journal entry in this block. + * @block: The journal block in which to make an entry. + * @data_vio: The data_vio to enqueue. + * + * The data_vio will not be continued until the entry is committed to + * the on-disk journal. The caller is responsible for ensuring the + * block is not already full. + * + * Return: VDO_SUCCESS or an error code if the data_vio could not be enqueued. + */ +int vdo_enqueue_recovery_block_entry(struct recovery_journal_block *block, struct data_vio *data_vio) { - // First queued entry indicates this is a journal block we've just - // opened or a committing block we're extending and will have to write - // again. + /* + * First queued entry indicates this is a journal block we've just + * opened or a committing block we're extending and will have to write + * again. + */ bool new_batch = !has_waiters(&block->entry_waiters); - // Enqueue the data_vio to wait for its entry to commit. + /* Enqueue the data_vio to wait for its entry to commit. 
*/ int result = enqueue_data_vio(&block->entry_waiters, data_vio); + if (result != VDO_SUCCESS) { return result; } @@ -168,7 +189,7 @@ int enqueue_vdo_recovery_block_entry(struct recovery_journal_block *block, block->entry_count++; block->uncommitted_entry_count++; - // Update stats to reflect the journal entry we're going to write. + /* Update stats to reflect the journal entry we're going to write. */ if (new_batch) { block->journal->events.blocks.started++; } @@ -178,12 +199,11 @@ int enqueue_vdo_recovery_block_entry(struct recovery_journal_block *block, } /** - * Check whether the current sector of a block is full. - * - * @param block The block to check + * is_sector_full() - * Check whether the current sector of a block is full. + * @block: The block to check. * - * @return true if the sector is full - **/ + * Return: true if the sector is full. + */ static bool __must_check is_sector_full(const struct recovery_journal_block *block) { @@ -192,12 +212,12 @@ is_sector_full(const struct recovery_journal_block *block) } /** - * Actually add entries from the queue to the given block. - * - * @param block The journal block + * add_queued_recovery_entries() - Actually add entries from the queue to the + * given block. + * @block: The journal block. * - * @return VDO_SUCCESS or an error code - **/ + * Return: VDO_SUCCESS or an error code. + */ static int __must_check add_queued_recovery_entries(struct recovery_journal_block *block) { @@ -209,7 +229,7 @@ add_queued_recovery_entries(struct recovery_journal_block *block) struct recovery_journal_entry new_entry; int result; - if (data_vio->operation.type == DATA_INCREMENT) { + if (data_vio->operation.type == VDO_JOURNAL_DATA_INCREMENT) { /* * In order to not lose an acknowledged write with the * FUA flag, we must also set the FUA flag on the @@ -217,29 +237,28 @@ add_queued_recovery_entries(struct recovery_journal_block *block) */ block->has_fua_entry = (block->has_fua_entry || - vio_requires_flush_after(data_vio_as_vio(data_vio))); + data_vio_requires_fua(data_vio)); } - // Compose and encode the entry. + /* Compose and encode the entry. */ packed_entry = &block->sector->entries[block->sector->entry_count++]; new_entry = (struct recovery_journal_entry) { - .mapping = - { + .mapping = { .pbn = data_vio->operation.pbn, .state = data_vio->operation.state, }, .operation = data_vio->operation.type, .slot = lock->tree_slots[lock->height].block_map_slot, }; - *packed_entry = pack_vdo_recovery_journal_entry(&new_entry); + *packed_entry = vdo_pack_recovery_journal_entry(&new_entry); - if (is_vdo_journal_increment_operation(data_vio->operation.type)) { + if (vdo_is_journal_increment_operation(data_vio->operation.type)) { data_vio->recovery_sequence_number = block->sequence_number; } - // Enqueue the data_vio to wait for its entry to commit. + /* Enqueue the data_vio to wait for its entry to commit. 
*/ result = enqueue_data_vio(&block->commit_waiters, data_vio); if (result != VDO_SUCCESS) { continue_data_vio(data_vio, result); @@ -255,7 +274,6 @@ add_queued_recovery_entries(struct recovery_journal_block *block) return VDO_SUCCESS; } -/**********************************************************************/ static int __must_check get_recovery_block_pbn(struct recovery_journal_block *block, physical_block_number_t *pbn_ptr) @@ -272,29 +290,50 @@ get_recovery_block_pbn(struct recovery_journal_block *block, return result; } -/**********************************************************************/ -bool can_commit_vdo_recovery_block(struct recovery_journal_block *block) +/** + * vdo_can_commit_recovery_block() - Check whether a journal block can be + * committed. + * @block: The journal block in question. + * + * Return: true if the block can be committed now. + */ +bool vdo_can_commit_recovery_block(struct recovery_journal_block *block) { - // Cannot commit in read-only mode, if already committing the block, - // or if there are no entries to commit. + /* + * Cannot commit in read-only mode, if already committing the block, + * or if there are no entries to commit. + */ return ((block != NULL) && !block->committing && has_waiters(&block->entry_waiters) && !vdo_is_read_only(block->journal->read_only_notifier)); } -/**********************************************************************/ -int commit_vdo_recovery_block(struct recovery_journal_block *block, - vdo_action *callback, +/** + * vdo_commit_recovery_block() - Attempt to commit a block. + * @block: The block to write. + * @callback: The function to call when the write completes. + * @error_handler: The handler for flush or write errors. + * + * If the block is not the oldest block with uncommitted entries or if it is + * already being committed, nothing will be done. + * + * Return: VDO_SUCCESS, or an error if the write could not be launched. + */ +int vdo_commit_recovery_block(struct recovery_journal_block *block, + bio_end_io_t callback, vdo_action *error_handler) { + int result; + physical_block_number_t block_pbn; struct recovery_journal *journal = block->journal; struct packed_journal_header *header = get_block_header(block); - physical_block_number_t block_pbn; - bool fua; - int result = ASSERT(can_commit_vdo_recovery_block(block), - "should never call %s when the block can't be committed", - __func__); + unsigned int operation = + REQ_OP_WRITE | REQ_PRIO | REQ_PREFLUSH | REQ_SYNC; + + result = ASSERT(vdo_can_commit_recovery_block(block), + "block can commit in %s", + __func__); if (result != VDO_SUCCESS) { return result; } @@ -310,7 +349,7 @@ int commit_vdo_recovery_block(struct recovery_journal_block *block, return result; } - // Update stats to reflect the block and entries we're about to write. + /* Update stats to reflect the block and entries we're about to write. */ journal->pending_write_count += 1; journal->events.blocks.written += 1; journal->events.entries.written += block->entries_in_commit; @@ -329,19 +368,24 @@ int commit_vdo_recovery_block(struct recovery_journal_block *block, * data. For writes which had the FUA flag set, we must also set the * FUA flag on the journal write. 
*/ - fua = block->has_fua_entry; - block->has_fua_entry = false; - launch_write_metadata_vio_with_flush(block->vio, - block_pbn, - callback, - error_handler, - true, - fua); + if (block->has_fua_entry) { + block->has_fua_entry = false; + operation |= REQ_FUA; + } + submit_metadata_vio(block->vio, + block_pbn, + callback, + error_handler, + operation); return VDO_SUCCESS; } -/**********************************************************************/ -void dump_vdo_recovery_block(const struct recovery_journal_block *block) +/** + * vdo_dump_recovery_block() - Dump the contents of the recovery block to the + * log. + * @block: The block to dump. + */ +void vdo_dump_recovery_block(const struct recovery_journal_block *block) { uds_log_info(" sequence number %llu; entries %u; %s; %zu entry waiters; %zu commit waiters", (unsigned long long) block->sequence_number, diff --git a/vdo/recovery-journal-block.h b/vdo/recovery-journal-block.h new file mode 100644 index 00000000..b2b28ff2 --- /dev/null +++ b/vdo/recovery-journal-block.h @@ -0,0 +1,128 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright Red Hat + */ + +#ifndef RECOVERY_JOURNAL_BLOCK_H +#define RECOVERY_JOURNAL_BLOCK_H + +#include "permassert.h" + +#include + +#include "packed-recovery-journal-block.h" +#include "recovery-journal.h" +#include "types.h" +#include "wait-queue.h" + +struct recovery_journal_block { + /* The doubly linked pointers for the free or active lists */ + struct list_head list_node; + /* The waiter for the pending full block list */ + struct waiter write_waiter; + /* The journal to which this block belongs */ + struct recovery_journal *journal; + /* A pointer to a block-sized buffer holding the packed block data */ + char *block; + /* A pointer to the current sector in the packed block buffer */ + struct packed_journal_sector *sector; + /* The vio for writing this block */ + struct vio *vio; + /* The sequence number for this block */ + sequence_number_t sequence_number; + /* The location of this block in the on-disk journal */ + physical_block_number_t block_number; + /* Whether this block is being committed */ + bool committing; + /* + * Whether this block has an uncommitted increment for a write with FUA + */ + bool has_fua_entry; + /* The total number of entries in this block */ + journal_entry_count_t entry_count; + /* The total number of uncommitted entries (queued or committing) */ + journal_entry_count_t uncommitted_entry_count; + /* The number of new entries in the current commit */ + journal_entry_count_t entries_in_commit; + /* The queue of vios which will make entries for the next commit */ + struct wait_queue entry_waiters; + /* The queue of vios waiting for the current commit */ + struct wait_queue commit_waiters; +}; + +/** + * vdo_recovery_block_from_list_entry() - Return the block associated with a + * list entry. + * @entry: The list entry to recast as a block. + * + * Return: The block. + **/ +static inline struct recovery_journal_block * +vdo_recovery_block_from_list_entry(struct list_head *entry) +{ + return list_entry(entry, struct recovery_journal_block, list_node); +} + +/** + * vdo_is_recovery_block_dirty() - Check whether a recovery block is dirty. + * @block: The block to check. + * + * Indicates it has any uncommitted entries, which includes both entries not + * written and entries written but not yet acknowledged. + * + * Return: true if the block has any uncommitted entries. 
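Putting the commit interface together, a caller is expected to test vdo_can_commit_recovery_block() before calling vdo_commit_recovery_block(); the assertion above fires otherwise. A hedged sketch of such a call site, where my_write_endio and my_write_error_handler are hypothetical placeholder callbacks and the error handling is only one plausible choice:

static void my_write_endio(struct bio *bio);                    /* hypothetical */
static void my_write_error_handler(struct vdo_completion *completion); /* hypothetical */

static void example_try_commit(struct recovery_journal_block *block)
{
        int result;

        if (!vdo_can_commit_recovery_block(block))
                return;

        result = vdo_commit_recovery_block(block,
                                           my_write_endio,
                                           my_write_error_handler);
        if (result != VDO_SUCCESS)
                vdo_enter_read_only_mode(block->journal->read_only_notifier,
                                         result);
}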
+ **/ +static inline bool __must_check +vdo_is_recovery_block_dirty(const struct recovery_journal_block *block) +{ + return (block->uncommitted_entry_count > 0); +} + +/** + * vdo_is_recovery_block_empty() - Check whether a journal block is empty. + * @block: The block to check. + * + * Return: true if the block has no entries. + **/ +static inline bool __must_check +vdo_is_recovery_block_empty(const struct recovery_journal_block *block) +{ + return (block->entry_count == 0); +} + +/** + * vdo_is_recovery_block_full() - Check whether a journal block is full. + * @block: The block to check. + * + * Return: true if the block is full. + **/ +static inline bool __must_check +vdo_is_recovery_block_full(const struct recovery_journal_block *block) +{ + return ((block == NULL) + || (block->journal->entries_per_block == block->entry_count)); +} + +int __must_check +vdo_make_recovery_block(struct vdo *vdo, + struct recovery_journal *journal, + struct recovery_journal_block **block_ptr); + +void vdo_free_recovery_block(struct recovery_journal_block *block); + +void vdo_initialize_recovery_block(struct recovery_journal_block *block); + +int __must_check +vdo_enqueue_recovery_block_entry(struct recovery_journal_block *block, + struct data_vio *data_vio); + +int __must_check vdo_commit_recovery_block(struct recovery_journal_block *block, + bio_end_io_t callback, + vdo_action *error_handler); + +void vdo_dump_recovery_block(const struct recovery_journal_block *block); + +bool __must_check +vdo_can_commit_recovery_block(struct recovery_journal_block *block); + +#endif /* RECOVERY_JOURNAL_BLOCK_H */ diff --git a/vdo/recoveryJournalEntry.h b/vdo/recovery-journal-entry.h similarity index 58% rename from vdo/recoveryJournalEntry.h rename to vdo/recovery-journal-entry.h index 0312d043..390201af 100644 --- a/vdo/recoveryJournalEntry.h +++ b/vdo/recovery-journal-entry.h @@ -1,22 +1,6 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ /* * Copyright Red Hat - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/sulfur/src/c++/vdo/base/recoveryJournalEntry.h#9 $ */ #ifndef RECOVERY_JOURNAL_ENTRY_H @@ -24,34 +8,34 @@ #include "numeric.h" -#include "blockMapEntry.h" -#include "journalPoint.h" +#include "block-map-entry.h" +#include "journal-point.h" #include "types.h" -/** +/* * A recovery journal entry stores two physical locations: a data location * that is the value of a single mapping in the block map tree, and the - * location of the block map page and and slot that is either acquiring or + * location of the block map page and slot that is either acquiring or * releasing a reference to the data location. 
The journal entry also stores * an operation code that says whether the reference is being acquired (an * increment) or released (a decrement), and whether the mapping is for a * logical block or for the block map tree itself. - **/ + */ struct recovery_journal_entry { struct block_map_slot slot; struct data_location mapping; enum journal_operation operation; }; -/** The packed, on-disk representation of a recovery journal entry. */ +/* The packed, on-disk representation of a recovery journal entry. */ struct packed_recovery_journal_entry { - /** + /* * In little-endian bit order: * Bits 15..12: The four highest bits of the 36-bit physical * block number of the block map tree page Bits 11..2: The * 10-bit block map page slot number Bits 1..0: The 2-bit * journal_operation of the entry - **/ + */ #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ unsigned operation : 2; unsigned slot_low : 6; @@ -64,28 +48,29 @@ struct packed_recovery_journal_entry { unsigned slot_high : 4; #endif - /** + /* * Bits 47..16: The 32 low-order bits of the block map page * PBN, in little-endian byte order - **/ + */ __le32 pbn_low_word; - /** + /* * Bits 87..48: The five-byte block map entry encoding the * location that was or will be stored in the block map page slot - **/ + */ struct block_map_entry block_map_entry; } __packed; /** - * Return the packed, on-disk representation of a recovery journal entry. + * vdo_pack_recovery_journal_entry() - Return the packed, on-disk + * representation of a recovery journal + * entry. + * @entry: The journal entry to pack. * - * @param entry The journal entry to pack - * - * @return The packed representation of the journal entry - **/ + * Return: The packed representation of the journal entry. + */ static inline struct packed_recovery_journal_entry -pack_vdo_recovery_journal_entry(const struct recovery_journal_entry *entry) +vdo_pack_recovery_journal_entry(const struct recovery_journal_entry *entry) { return (struct packed_recovery_journal_entry) { .operation = entry->operation, @@ -93,33 +78,33 @@ pack_vdo_recovery_journal_entry(const struct recovery_journal_entry *entry) .slot_high = (entry->slot.slot >> 6) & 0x0F, .pbn_high_nibble = (entry->slot.pbn >> 32) & 0x0F, .pbn_low_word = __cpu_to_le32(entry->slot.pbn & UINT_MAX), - .block_map_entry = pack_vdo_pbn(entry->mapping.pbn, + .block_map_entry = vdo_pack_pbn(entry->mapping.pbn, entry->mapping.state), }; } /** - * Unpack the on-disk representation of a recovery journal entry. - * - * @param entry The recovery journal entry to unpack + * vdo_unpack_recovery_journal_entry() - Unpack the on-disk representation of + * a recovery journal entry. + * @entry: The recovery journal entry to unpack. * - * @return The unpacked entry - **/ + * Return: The unpacked entry. 
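The packed layout above totals 88 bits: two bytes of bit fields, a four-byte low word, and a five-byte block map entry, so each on-disk entry occupies 11 bytes. A small round-trip sketch, using an arbitrary made-up slot and operation, to show that packing and unpacking are inverses (illustrative only):

static void example_entry_round_trip(void)
{
        struct recovery_journal_entry entry = {
                .operation = VDO_JOURNAL_DATA_INCREMENT,
                .slot = {
                        .pbn = 0x123456789,     /* fits in 36 bits */
                        .slot = 42,             /* fits in 10 bits */
                },
        };
        struct packed_recovery_journal_entry packed =
                vdo_pack_recovery_journal_entry(&entry);
        struct recovery_journal_entry unpacked =
                vdo_unpack_recovery_journal_entry(&packed);

        ASSERT_LOG_ONLY((unpacked.slot.pbn == entry.slot.pbn) &&
                        (unpacked.slot.slot == entry.slot.slot) &&
                        (unpacked.operation == entry.operation),
                        "slot and operation survive the packed form");
}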
+ */ static inline struct recovery_journal_entry -unpack_vdo_recovery_journal_entry(const struct packed_recovery_journal_entry *entry) +vdo_unpack_recovery_journal_entry(const struct packed_recovery_journal_entry *entry) { physical_block_number_t low32 = __le32_to_cpu(entry->pbn_low_word); physical_block_number_t high4 = entry->pbn_high_nibble; + return (struct recovery_journal_entry) { .operation = entry->operation, - .slot = - { + .slot = { .pbn = ((high4 << 32) | low32), .slot = (entry->slot_low | (entry->slot_high << 6)), }, - .mapping = unpack_vdo_block_map_entry(&entry->block_map_entry), + .mapping = vdo_unpack_block_map_entry(&entry->block_map_entry), }; } -#endif // RECOVERY_JOURNAL_ENTRY_H +#endif /* RECOVERY_JOURNAL_ENTRY_H */ diff --git a/vdo/recoveryJournalFormat.c b/vdo/recovery-journal-format.c similarity index 60% rename from vdo/recoveryJournalFormat.c rename to vdo/recovery-journal-format.c index 0b03b2c9..b20ed718 100644 --- a/vdo/recoveryJournalFormat.c +++ b/vdo/recovery-journal-format.c @@ -1,57 +1,52 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * Copyright Red Hat - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/sulfur/src/c++/vdo/base/recoveryJournalFormat.c#13 $ */ -#include "recoveryJournalFormat.h" +#include "recovery-journal-format.h" #include "buffer.h" #include "permassert.h" -#include "fixedLayout.h" #include "header.h" -#include "statusCodes.h" +#include "status-codes.h" #include "types.h" const struct header VDO_RECOVERY_JOURNAL_HEADER_7_0 = { .id = VDO_RECOVERY_JOURNAL, - .version = - { + .version = { .major_version = 7, .minor_version = 0, }, .size = sizeof(struct recovery_journal_state_7_0), }; -/**********************************************************************/ -size_t get_vdo_recovery_journal_encoded_size(void) +/** + * vdo_get_recovery_journal_encoded_size() - Get the size of the encoded state + * of a recovery journal. + * + * Return: the encoded size of the journal's state. + */ +size_t vdo_get_recovery_journal_encoded_size(void) { return VDO_ENCODED_HEADER_SIZE + sizeof(struct recovery_journal_state_7_0); } -/**********************************************************************/ -int encode_vdo_recovery_journal_state_7_0(struct recovery_journal_state_7_0 state, +/** + * vdo_encode_recovery_journal_state_7_0() - Encode the state of a recovery + * journal. + * @state: The recovery journal state. + * @buffer: The buffer to encode into. + * + * Return: VDO_SUCCESS or an error code. 
+ */ +int vdo_encode_recovery_journal_state_7_0(struct recovery_journal_state_7_0 state, struct buffer *buffer) { size_t initial_length, encoded_size; - int result = encode_vdo_header(&VDO_RECOVERY_JOURNAL_HEADER_7_0, buffer); + int result = vdo_encode_header(&VDO_RECOVERY_JOURNAL_HEADER_7_0, buffer); + if (result != UDS_SUCCESS) { return result; } @@ -79,9 +74,17 @@ int encode_vdo_recovery_journal_state_7_0(struct recovery_journal_state_7_0 stat "encoded recovery journal component size must match header size"); } -/**********************************************************************/ +/** + * vdo_decode_recovery_journal_state_7_0() - Decode the state of a recovery + * journal saved in a buffer. + * @buffer: The buffer containing the saved state. + * @state: A pointer to a recovery journal state to hold the result of a + * succesful decode. + * + * Return: VDO_SUCCESS or an error code. + */ int -decode_vdo_recovery_journal_state_7_0(struct buffer *buffer, +vdo_decode_recovery_journal_state_7_0(struct buffer *buffer, struct recovery_journal_state_7_0 *state) { struct header header; @@ -90,12 +93,12 @@ decode_vdo_recovery_journal_state_7_0(struct buffer *buffer, sequence_number_t journal_start; block_count_t logical_blocks_used, block_map_data_blocks; - result = decode_vdo_header(buffer, &header); + result = vdo_decode_header(buffer, &header); if (result != VDO_SUCCESS) { return result; } - result = validate_vdo_header(&VDO_RECOVERY_JOURNAL_HEADER_7_0, &header, + result = vdo_validate_header(&VDO_RECOVERY_JOURNAL_HEADER_7_0, &header, true, __func__); if (result != VDO_SUCCESS) { return result; @@ -134,20 +137,25 @@ decode_vdo_recovery_journal_state_7_0(struct buffer *buffer, return VDO_SUCCESS; } -/**********************************************************************/ -const char *get_vdo_journal_operation_name(enum journal_operation operation) +/** + * vdo_get_journal_operation_name() - Get the name of a journal operation. + * @operation: The operation to name. + * + * Return: The name of the operation. + */ +const char *vdo_get_journal_operation_name(enum journal_operation operation) { switch (operation) { - case DATA_DECREMENT: + case VDO_JOURNAL_DATA_DECREMENT: return "data decrement"; - case DATA_INCREMENT: + case VDO_JOURNAL_DATA_INCREMENT: return "data increment"; - case BLOCK_MAP_DECREMENT: + case VDO_JOURNAL_BLOCK_MAP_DECREMENT: return "block map decrement"; - case BLOCK_MAP_INCREMENT: + case VDO_JOURNAL_BLOCK_MAP_INCREMENT: return "block map increment"; default: diff --git a/vdo/recovery-journal-format.h b/vdo/recovery-journal-format.h new file mode 100644 index 00000000..e57d813b --- /dev/null +++ b/vdo/recovery-journal-format.h @@ -0,0 +1,82 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright Red Hat + */ + +#ifndef RECOVERY_JOURNAL_FORMAT_H +#define RECOVERY_JOURNAL_FORMAT_H + +#include "buffer.h" + +#include "header.h" +#include "packed-recovery-journal-block.h" +#include "types.h" + +/* + * The state of the recovery journal as encoded in the VDO super block. 
+ */ +struct recovery_journal_state_7_0 { + /** Sequence number to start the journal */ + sequence_number_t journal_start; + /** Number of logical blocks used by VDO */ + block_count_t logical_blocks_used; + /** Number of block map pages allocated */ + block_count_t block_map_data_blocks; +} __packed; + +extern const struct header VDO_RECOVERY_JOURNAL_HEADER_7_0; + +size_t __must_check vdo_get_recovery_journal_encoded_size(void); + +int __must_check +vdo_encode_recovery_journal_state_7_0(struct recovery_journal_state_7_0 state, + struct buffer *buffer); + +int __must_check +vdo_decode_recovery_journal_state_7_0(struct buffer *buffer, + struct recovery_journal_state_7_0 *state); + +const char * __must_check +vdo_get_journal_operation_name(enum journal_operation operation); + +/** + * vdo_is_valid_recovery_journal_sector() - Determine whether the header of + * the given sector could describe a + * valid sector for the given journal + * block header. + * @header: The unpacked block header to compare against. + * @sector: The packed sector to check. + * + * Return: true if the sector matches the block header. + */ +static inline bool __must_check +vdo_is_valid_recovery_journal_sector(const struct recovery_block_header *header, + const struct packed_journal_sector *sector) +{ + return ((header->check_byte == sector->check_byte) + && (header->recovery_count == sector->recovery_count)); +} + +/** + * vdo_compute_recovery_journal_block_number() - Compute the physical block + * number of the recovery + * journal block which would + * have a given sequence number. + * @journal_size: The size of the journal. + * @sequence_number: The sequence number. + * + * Return: The pbn of the journal block which would the specified sequence + * number. + */ +static inline physical_block_number_t __must_check +vdo_compute_recovery_journal_block_number(block_count_t journal_size, + sequence_number_t sequence_number) +{ + /* + * Since journal size is a power of two, the block number modulus can + * just be extracted from the low-order bits of the sequence. + */ + return (sequence_number & (journal_size - 1)); +} + +#endif /* RECOVERY_JOURNAL_FORMAT_H */ diff --git a/vdo/recoveryJournal.c b/vdo/recovery-journal.c similarity index 53% rename from vdo/recoveryJournal.c rename to vdo/recovery-journal.c index 86b80395..d45656f2 100644 --- a/vdo/recoveryJournal.c +++ b/vdo/recovery-journal.c @@ -1,45 +1,30 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * Copyright Red Hat - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. 
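Because the journal size is a power of two, the helper above can replace a modulus with a mask. A tiny standalone check of that identity, illustrative only and not VDO code:

static void example_block_number_mask(void)
{
        unsigned long long journal_size = 64;   /* must be a power of two */
        unsigned long long sequence_number = 200;

        /* 200 % 64 == 8 and 200 & 63 == 8; the mask and the modulus agree. */
        assert((sequence_number & (journal_size - 1)) ==
               (sequence_number % journal_size));
}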
- * - * $Id: //eng/vdo-releases/sulfur/src/c++/vdo/base/recoveryJournal.c#36 $ */ -#include "recoveryJournal.h" -#include "recoveryJournalInternals.h" +#include "recovery-journal.h" + +#include #include "logger.h" -#include "memoryAlloc.h" +#include "memory-alloc.h" #include "permassert.h" -#include "blockMap.h" +#include "block-map.h" #include "constants.h" -#include "dataVIO.h" -#include "extent.h" +#include "data-vio.h" #include "header.h" -#include "numUtils.h" -#include "packedRecoveryJournalBlock.h" -#include "recoveryJournalBlock.h" -#include "recoveryJournalFormat.h" -#include "slabDepot.h" -#include "slabJournal.h" +#include "io-submitter.h" +#include "num-utils.h" +#include "packed-recovery-journal-block.h" +#include "recovery-journal-block.h" +#include "recovery-journal-format.h" +#include "slab-depot.h" +#include "slab-journal.h" #include "vdo.h" -#include "vdoInternal.h" -#include "waitQueue.h" +#include "vio.h" +#include "wait-queue.h" static const uint64_t RECOVERY_COUNT_MASK = 0xff; @@ -54,16 +39,16 @@ enum { }; /** - * Get a block from the end of the free list. + * pop_free_list() - Get a block from the end of the free list. + * @journal: The journal. * - * @param journal The journal - * - * @return The block or NULL if the list is empty - **/ + * Return: The block or NULL if the list is empty. + */ static struct recovery_journal_block * pop_free_list(struct recovery_journal *journal) { struct list_head *entry; + if (list_empty(&journal->free_tail_blocks)) { return NULL; } @@ -73,16 +58,16 @@ pop_free_list(struct recovery_journal *journal) } /** - * Get a block from the end of the active list. + * pop_active_list() - Get a block from the end of the active list. + * @journal: The journal. * - * @param journal The journal - * - * @return The block or NULL if the list is empty - **/ + * Return: The block or NULL if the list is empty. + */ static struct recovery_journal_block * pop_active_list(struct recovery_journal *journal) { struct list_head *entry; + if (list_empty(&journal->active_tail_blocks)) { return NULL; } @@ -92,11 +77,11 @@ pop_active_list(struct recovery_journal *journal) } /** - * Assert that we are running on the journal thread. - * - * @param journal The journal - * @param function_name The function doing the check (for logging) - **/ + * assert_on_journal_thread() - Assert that we are running on the journal + * thread. + * @journal: The journal. + * @function_name: The function doing the check (for logging). + */ static void assert_on_journal_thread(struct recovery_journal *journal, const char *function_name) { @@ -105,10 +90,12 @@ static void assert_on_journal_thread(struct recovery_journal *journal, } /** - * waiter_callback implementation invoked whenever a data_vio is to be released - * from the journal, either because its entry was committed to disk, - * or because there was an error. - **/ + * continue_waiter() - Release a data_vio from the journal. + * + * Invoked whenever a data_vio is to be released from the journal, either + * because its entry was committed to disk, or because there was an error. + * Implements waiter_callback. + */ static void continue_waiter(struct waiter *waiter, void *context) { struct data_vio *data_vio = waiter_as_data_vio(waiter); @@ -118,18 +105,20 @@ static void continue_waiter(struct waiter *waiter, void *context) } /** - * Check whether the journal has any waiters on any blocks. - * - * @param journal The journal in question + * has_block_waiters() - Check whether the journal has any waiters on any + * blocks. 
+ * @journal: The journal in question. * - * @return true if any block has a waiter - **/ + * Return: true if any block has a waiter. + */ static inline bool has_block_waiters(struct recovery_journal *journal) { struct recovery_journal_block *block; - // Either the first active tail block (if it exists) has waiters, - // or no active tail block has waiters. + /* + * Either the first active tail block (if it exists) has waiters, + * or no active tail block has waiters. + */ if (list_empty(&journal->active_tail_blocks)) { return false; } @@ -139,19 +128,18 @@ static inline bool has_block_waiters(struct recovery_journal *journal) || has_waiters(&block->commit_waiters)); } -/**********************************************************************/ static void recycle_journal_blocks(struct recovery_journal *journal); static void recycle_journal_block(struct recovery_journal_block *block); static void notify_commit_waiters(struct recovery_journal *journal); /** - * Check whether the journal has drained. - * - * @param journal The journal which may have just drained - **/ -static void vdo_check_for_drain_complete(struct recovery_journal *journal) + * check_for_drain_complete() - Check whether the journal has drained. + * @journal: The journal which may have just drained. + */ +static void check_for_drain_complete(struct recovery_journal *journal) { int result = VDO_SUCCESS; + if (vdo_is_read_only(journal->read_only_notifier)) { result = VDO_READ_ONLY; /* @@ -164,25 +152,25 @@ static void vdo_check_for_drain_complete(struct recovery_journal *journal) notify_commit_waiters(journal); recycle_journal_blocks(journal); - // Release any data_vios waiting to be assigned entries. + /* Release any data_vios waiting to be assigned entries. */ notify_all_waiters(&journal->decrement_waiters, continue_waiter, &result); notify_all_waiters(&journal->increment_waiters, continue_waiter, &result); } - if (!is_vdo_state_draining(&journal->state) || journal->reaping + if (!vdo_is_state_draining(&journal->state) || journal->reaping || has_block_waiters(journal) || has_waiters(&journal->increment_waiters) || has_waiters(&journal->decrement_waiters) - || !suspend_vdo_lock_counter(journal->lock_counter)) { + || !vdo_suspend_lock_counter(journal->lock_counter)) { return; } - if (is_vdo_state_saving(&journal->state)) { + if (vdo_is_state_saving(&journal->state)) { if (journal->active_block != NULL) { ASSERT_LOG_ONLY(((result == VDO_READ_ONLY) - || !is_vdo_recovery_block_dirty(journal->active_block)), + || !vdo_is_recovery_block_dirty(journal->active_block)), "journal being saved has clean active block"); recycle_journal_block(journal->active_block); } @@ -191,56 +179,66 @@ static void vdo_check_for_drain_complete(struct recovery_journal *journal) "all blocks in a journal being saved must be inactive"); } - finish_vdo_draining_with_result(&journal->state, result); + vdo_finish_draining_with_result(&journal->state, result); } /** - * Notifiy a recovery journal that the VDO has gone read-only. - * - *
Implements vdo_read_only_notification. + * notify_recovery_journal_of_read_only_mode() - Notify a recovery journal + * that the VDO has gone + * read-only. + * @listener: The journal. + * @parent: The completion to notify in order to acknowledge the notification. * - * @param listener The journal - * @param parent The completion to notify in order to acknowledge the - * notification - **/ + * Implements vdo_read_only_notification. + */ static void notify_recovery_journal_of_read_only_mode(void *listener, struct vdo_completion *parent) { - vdo_check_for_drain_complete(listener); - complete_vdo_completion(parent); + check_for_drain_complete(listener); + vdo_complete_completion(parent); } /** - * Put the journal in read-only mode. All attempts to add entries after - * this function is called will fail. All VIOs waiting for commits will be - * awakened with an error. + * enter_journal_read_only_mode() - Put the journal in read-only mode. + * @journal: The journal which has failed. + * @error_code: The error result triggering this call. * - * @param journal The journal which has failed - * @param error_code The error result triggering this call - **/ + * All attempts to add entries after this function is called will fail. All + * VIOs waiting for commits will be awakened with an error. + */ static void enter_journal_read_only_mode(struct recovery_journal *journal, int error_code) { vdo_enter_read_only_mode(journal->read_only_notifier, error_code); - vdo_check_for_drain_complete(journal); + check_for_drain_complete(journal); } -/**********************************************************************/ +/** + * vdo_get_recovery_journal_current_sequence_number() - Obtain the recovery + * journal's current + * sequence number. + * @journal: The journal in question. + * + * Exposed only so the block map can be initialized therefrom. + * + * Return: The sequence number of the tail block. + */ sequence_number_t -get_vdo_recovery_journal_current_sequence_number(struct recovery_journal *journal) +vdo_get_recovery_journal_current_sequence_number(struct recovery_journal *journal) { return journal->tail; } /** - * Get the head of the recovery journal, which is the lowest sequence number of - * the block map head and the slab journal head. + * get_recovery_journal_head() - Get the head of the recovery journal. + * @journal: The journal. * - * @param journal The journal + * The head is the lowest sequence number of the block map head and the slab + * journal head. * - * @return the head of the journal - **/ + * Return: the head of the journal. + */ static inline sequence_number_t get_recovery_journal_head(const struct recovery_journal *journal) { @@ -248,12 +246,12 @@ get_recovery_journal_head(const struct recovery_journal *journal) } /** - * Compute the recovery count byte for a given recovery count. - * - * @param recovery_count The recovery count + * compute_recovery_count_byte() - Compute the recovery count byte for a given + * recovery count. + * @recovery_count: The recovery count. * - * @return The byte corresponding to the recovery count - **/ + * Return: The byte corresponding to the recovery count. + */ static inline uint8_t __must_check compute_recovery_count_byte(uint64_t recovery_count) { @@ -261,11 +259,12 @@ compute_recovery_count_byte(uint64_t recovery_count) } /** - * Check whether the journal is over the threshold, and if so, force the oldest - * slab journal tail block to commit. 
- * - * @param journal The journal - **/ + * check_slab_journal_commit_threshold() - Check whether the journal is over + * the threshold, and if so, force the + * oldest slab journal tail block to + * commit. + * @journal: The journal. + */ static void check_slab_journal_commit_threshold(struct recovery_journal *journal) { @@ -278,19 +277,18 @@ check_slab_journal_commit_threshold(struct recovery_journal *journal) } } -/**********************************************************************/ static void reap_recovery_journal(struct recovery_journal *journal); static void assign_entries(struct recovery_journal *journal); /** - * Finish reaping the journal. - * - * @param journal The journal being reaped - **/ + * finish_reaping() - Finish reaping the journal. + * @journal: The journal being reaped. + */ static void finish_reaping(struct recovery_journal *journal) { block_count_t blocks_reaped; sequence_number_t old_head = get_recovery_journal_head(journal); + journal->block_map_head = journal->block_map_reap_head; journal->slab_journal_head = journal->slab_journal_reap_head; blocks_reaped = get_recovery_journal_head(journal) - old_head; @@ -298,43 +296,58 @@ static void finish_reaping(struct recovery_journal *journal) journal->reaping = false; check_slab_journal_commit_threshold(journal); assign_entries(journal); - vdo_check_for_drain_complete(journal); + check_for_drain_complete(journal); } /** - * Finish reaping the journal after flushing the lower layer. This is the - * callback registered in reap_recovery_journal(). + * complete_reaping() - Finish reaping the journal after flushing the lower + * layer. + * @completion: The journal's flush VIO. * - * @param completion The journal's flush VIO - **/ + * This is the callback registered in reap_recovery_journal(). + */ static void complete_reaping(struct vdo_completion *completion) { struct recovery_journal *journal = completion->parent; + finish_reaping(journal); - // Try reaping again in case more locks were released while flush was - // out. + /* + * Try reaping again in case more locks were released while flush was + * out. + */ reap_recovery_journal(journal); } /** - * Handle an error when flushing the lower layer due to reaping. - * - * @param completion The journal's flush VIO - **/ + * handle_flush_error() - Handle an error when flushing the lower layer due to + * reaping. + * @completion: The journal's flush VIO. + */ static void handle_flush_error(struct vdo_completion *completion) { struct recovery_journal *journal = completion->parent; + + record_metadata_io_error(as_vio(completion)); journal->reaping = false; enter_journal_read_only_mode(journal, completion->result); } +static void flush_endio(struct bio *bio) +{ + struct vio *vio = bio->bi_private; + struct recovery_journal *journal = vio->completion.parent; + + continue_vio_after_io(vio, + complete_reaping, + journal->thread_id); +} + /** - * Set all journal fields appropriately to start journaling from the current - * active block. - * - * @param journal The journal to be reset based on its active block - **/ + * initialize_journal_state() - Set all journal fields appropriately to start + * journaling from the current active block. + * @journal: The journal to be reset based on its active block. 
+ */ static void initialize_journal_state(struct recovery_journal *journal) { journal->append_point.sequence_number = journal->tail; @@ -344,17 +357,27 @@ static void initialize_journal_state(struct recovery_journal *journal) journal->block_map_reap_head = journal->tail; journal->slab_journal_reap_head = journal->tail; journal->block_map_head_block_number = - get_vdo_recovery_journal_block_number(journal, + vdo_get_recovery_journal_block_number(journal, journal->block_map_head); journal->slab_journal_head_block_number = - get_vdo_recovery_journal_block_number(journal, + vdo_get_recovery_journal_block_number(journal, journal->slab_journal_head); + journal->available_space = + (journal->entries_per_block * + vdo_get_recovery_journal_length(journal->size)); } -/**********************************************************************/ -block_count_t get_vdo_recovery_journal_length(block_count_t journal_size) +/** + * vdo_get_recovery_journal_length() - Get the number of usable recovery + * journal blocks. + * @journal_size: The size of the recovery journal in blocks. + * + * Return: the number of recovery journal blocks usable for entries. + */ +block_count_t vdo_get_recovery_journal_length(block_count_t journal_size) { block_count_t reserved_blocks = journal_size / 4; + if (reserved_blocks > RECOVERY_JOURNAL_RESERVED_BLOCKS) { reserved_blocks = RECOVERY_JOURNAL_RESERVED_BLOCKS; } @@ -362,11 +385,12 @@ block_count_t get_vdo_recovery_journal_length(block_count_t journal_size) } /** - * Attempt to reap the journal now that all the locks on some journal block - * have been released. This is the callback registered with the lock counter. + * reap_recovery_journal_callback() - Attempt to reap the journal. + * @completion: The lock counter completion. * - * @param completion The lock counter completion - **/ + * Attempts to reap the journal now that all the locks on some journal block + * have been released. This is the callback registered with the lock counter. + */ static void reap_recovery_journal_callback(struct vdo_completion *completion) { struct recovery_journal *journal = @@ -376,15 +400,15 @@ static void reap_recovery_journal_callback(struct vdo_completion *completion) * race between acknowledging the notification and unlocks wishing to * notify. */ - acknowledge_vdo_lock_unlock(journal->lock_counter); + vdo_acknowledge_lock_unlock(journal->lock_counter); - if (is_vdo_state_quiescing(&journal->state)) { + if (vdo_is_state_quiescing(&journal->state)) { /* * Don't start reaping when the journal is trying to quiesce. * Do check if this notification is the last thing the is * waiting on. */ - vdo_check_for_drain_complete(journal); + check_for_drain_complete(journal); return; } @@ -392,17 +416,18 @@ static void reap_recovery_journal_callback(struct vdo_completion *completion) check_slab_journal_commit_threshold(journal); } -/********************************************************************** - * Set the journal's tail sequence number. - * - * @param journal The journal whose tail is to be set - * @param tail The new tail value - **/ +/** + * set_journal_tail() - Set the journal's tail sequence number. + * @journal: The journal whose tail is to be set. + * @tail: The new tail value. + */ static void set_journal_tail(struct recovery_journal *journal, sequence_number_t tail) { - // VDO does not support sequence numbers above 1 << 48 in the slab - // journal. + /* + * VDO does not support sequence numbers above 1 << 48 in the slab + * journal. 
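For scale, a back-of-the-envelope sketch of how far away that limit is; the one-million-blocks-per-second rate is an arbitrary assumption chosen only for illustration:

static unsigned long long example_years_until_overflow(void)
{
        unsigned long long limit = 1ULL << 48;  /* 281,474,976,710,656 */
        unsigned long long blocks_per_second = 1000000;
        unsigned long long seconds_per_year = 365ULL * 24 * 60 * 60;

        /*
         * Integer division yields 8, i.e. roughly nine years of continuous
         * journal writes before the guard below would ever trigger.
         */
        return limit / (blocks_per_second * seconds_per_year);
}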
+ */ if (tail >= (1ULL << 48)) { enter_journal_read_only_mode(journal, VDO_JOURNAL_OVERFLOW); } @@ -410,8 +435,25 @@ static void set_journal_tail(struct recovery_journal *journal, journal->tail = tail; } -/**********************************************************************/ -int decode_vdo_recovery_journal(struct recovery_journal_state_7_0 state, +/** + * vdo_decode_recovery_journal() - Make a recovery journal and initialize it + * with the state that was decoded from the + * super block. + * + * @state: The decoded state of the journal. + * @nonce: The nonce of the VDO. + * @vdo: The VDO. + * @partition: The partition for the journal. + * @recovery_count: The VDO's number of completed recoveries. + * @journal_size: The number of blocks in the journal on disk. + * @tail_buffer_size: The number of blocks for tail buffer. + * @read_only_notifier: The read-only mode notifier. + * @thread_config: The thread configuration of the VDO. + * @journal_ptr: The pointer to hold the new recovery journal. + * + * Return: A success or error code. + */ +int vdo_decode_recovery_journal(struct recovery_journal_state_7_0 state, nonce_t nonce, struct vdo *vdo, struct partition *partition, @@ -422,10 +464,10 @@ int decode_vdo_recovery_journal(struct recovery_journal_state_7_0 state, const struct thread_config *thread_config, struct recovery_journal **journal_ptr) { - block_count_t journal_length; block_count_t i; struct recovery_journal *journal; int result = UDS_ALLOCATE(1, struct recovery_journal, __func__, &journal); + if (result != VDO_SUCCESS) { return result; } @@ -443,29 +485,29 @@ int decode_vdo_recovery_journal(struct recovery_journal_state_7_0 state, journal->slab_journal_commit_threshold = (journal_size * 2) / 3; journal->logical_blocks_used = state.logical_blocks_used; journal->block_map_data_blocks = state.block_map_data_blocks; + journal->entries_per_block = RECOVERY_JOURNAL_ENTRIES_PER_BLOCK; set_journal_tail(journal, state.journal_start); initialize_journal_state(journal); - // XXX: this is a hack until we make initial resume of a VDO a real - // resume - set_vdo_admin_state_code(&journal->state, VDO_ADMIN_STATE_SUSPENDED); - - journal->entries_per_block = RECOVERY_JOURNAL_ENTRIES_PER_BLOCK; - journal_length = get_vdo_recovery_journal_length(journal_size); - journal->available_space = journal->entries_per_block * journal_length; + /* + * XXX: this is a hack until we make initial resume of a VDO a real + * resume + */ + vdo_set_admin_state_code(&journal->state, VDO_ADMIN_STATE_SUSPENDED); for (i = 0; i < tail_buffer_size; i++) { struct recovery_journal_block *block; - result = make_vdo_recovery_block(vdo, journal, &block); + + result = vdo_make_recovery_block(vdo, journal, &block); if (result != VDO_SUCCESS) { - free_vdo_recovery_journal(journal); + vdo_free_recovery_journal(journal); return result; } list_move_tail(&block->list_node, &journal->free_tail_blocks); } - result = make_vdo_lock_counter(vdo, + result = vdo_make_lock_counter(vdo, journal, reap_recovery_journal_callback, journal->thread_id, @@ -474,7 +516,7 @@ int decode_vdo_recovery_journal(struct recovery_journal_state_7_0 state, journal->size, &journal->lock_counter); if (result != VDO_SUCCESS) { - free_vdo_recovery_journal(journal); + vdo_free_recovery_journal(journal); return result; } @@ -485,27 +527,35 @@ int decode_vdo_recovery_journal(struct recovery_journal_state_7_0 state, NULL, &journal->flush_vio); if (result != VDO_SUCCESS) { - free_vdo_recovery_journal(journal); + vdo_free_recovery_journal(journal); return result; } - 
result = register_vdo_read_only_listener(read_only_notifier, + result = vdo_register_read_only_listener(read_only_notifier, journal, notify_recovery_journal_of_read_only_mode, journal->thread_id); if (result != VDO_SUCCESS) { - free_vdo_recovery_journal(journal); + vdo_free_recovery_journal(journal); + return result; + } + + result = vdo_make_default_thread(vdo, journal->thread_id); + if (result != VDO_SUCCESS) { + vdo_free_recovery_journal(journal); return result; } - journal->flush_vio->completion.callback_thread_id = - journal->thread_id; + journal->flush_vio->completion.callback_thread_id = journal->thread_id; *journal_ptr = journal; return VDO_SUCCESS; } -/**********************************************************************/ -void free_vdo_recovery_journal(struct recovery_journal *journal) +/** + * vdo_free_recovery_journal() - Free a recovery journal. + * @journal: The recovery journal to free. + */ +void vdo_free_recovery_journal(struct recovery_journal *journal) { struct recovery_journal_block *block; @@ -513,40 +563,55 @@ void free_vdo_recovery_journal(struct recovery_journal *journal) return; } - free_vdo_lock_counter(UDS_FORGET(journal->lock_counter)); + vdo_free_lock_counter(UDS_FORGET(journal->lock_counter)); free_vio(UDS_FORGET(journal->flush_vio)); - // XXX: eventually, the journal should be constructed in a quiescent - // state - // which requires opening before use. - if (!is_vdo_state_quiescent(&journal->state)) { + /* + * XXX: eventually, the journal should be constructed in a quiescent + * state + * which requires opening before use. + */ + if (!vdo_is_state_quiescent(&journal->state)) { ASSERT_LOG_ONLY(list_empty(&journal->active_tail_blocks), "journal being freed has no active tail blocks"); - } else if (!is_vdo_state_saved(&journal->state) + } else if (!vdo_is_state_saved(&journal->state) && !list_empty(&journal->active_tail_blocks)) { uds_log_warning("journal being freed has uncommitted entries"); } while ((block = pop_active_list(journal)) != NULL) { - free_vdo_recovery_block(block); + vdo_free_recovery_block(block); } while ((block = pop_free_list(journal)) != NULL) { - free_vdo_recovery_block(block); + vdo_free_recovery_block(block); } UDS_FREE(journal); } -/**********************************************************************/ -void set_vdo_recovery_journal_partition(struct recovery_journal *journal, +/** + * vdo_set_recovery_journal_partition() - Move the backing partition pointer + * of the recovery journal. + * @journal: The journal being moved. + * @partition: The new journal partition. + * + * Assumes that the data in the old and the new partitions is identical. + */ +void vdo_set_recovery_journal_partition(struct recovery_journal *journal, struct partition *partition) { journal->partition = partition; } -/**********************************************************************/ -void initialize_vdo_recovery_journal_post_recovery(struct recovery_journal *journal, +/** + * vdo_initialize_recovery_journal_post_recovery() - Initialize the journal + * after a recovery. + * @journal: The journal in question. + * @recovery_count: The number of completed recoveries. + * @tail: The new tail block sequence number. 
+ */ +void vdo_initialize_recovery_journal_post_recovery(struct recovery_journal *journal, uint64_t recovery_count, sequence_number_t tail) { @@ -555,35 +620,61 @@ void initialize_vdo_recovery_journal_post_recovery(struct recovery_journal *jour initialize_journal_state(journal); } -/**********************************************************************/ +/** + * vdo_initialize_recovery_journal_post_rebuild() - Initialize the journal + * after a rebuild. + * @journal: The journal in question. + * @recovery_count: The number of completed recoveries. + * @tail: The new tail block sequence number. + * @logical_blocks_used: The new number of logical blocks used. + * @block_map_data_blocks: The new number of block map data blocks. + */ void -initialize_vdo_recovery_journal_post_rebuild(struct recovery_journal *journal, +vdo_initialize_recovery_journal_post_rebuild(struct recovery_journal *journal, uint64_t recovery_count, sequence_number_t tail, block_count_t logical_blocks_used, block_count_t block_map_data_blocks) { - initialize_vdo_recovery_journal_post_recovery(journal, recovery_count, + vdo_initialize_recovery_journal_post_recovery(journal, recovery_count, tail); journal->logical_blocks_used = logical_blocks_used; journal->block_map_data_blocks = block_map_data_blocks; } -/**********************************************************************/ +/** + * vdo_get_journal_block_map_data_blocks_used() - Get the number of block map + * pages, allocated from data + * blocks, currently in use. + * @journal: The journal in question. + * + * Return: The number of block map pages allocated from slabs. + */ block_count_t vdo_get_journal_block_map_data_blocks_used(struct recovery_journal *journal) { return journal->block_map_data_blocks; } -/**********************************************************************/ -thread_id_t get_vdo_recovery_journal_thread_id(struct recovery_journal *journal) +/** + * vdo_get_recovery_journal_thread_id() - Get the ID of a recovery journal's + * thread. + * @journal: The journal to query. + * + * Return: The ID of the journal's thread. + */ +thread_id_t vdo_get_recovery_journal_thread_id(struct recovery_journal *journal) { return journal->thread_id; } -/**********************************************************************/ -void open_vdo_recovery_journal(struct recovery_journal *journal, +/** + * vdo_open_recovery_journal() - Prepare the journal for new entries. + * @journal: The journal in question. + * @depot: The slab depot for this VDO. + * @block_map: The block map for this VDO. + */ +void vdo_open_recovery_journal(struct recovery_journal *journal, struct slab_depot *depot, struct block_map *block_map) { @@ -593,23 +684,33 @@ void open_vdo_recovery_journal(struct recovery_journal *journal, VDO_ADMIN_STATE_NORMAL_OPERATION); } -/**********************************************************************/ +/** + * vdo_record_recovery_journal() - Record the state of a recovery journal for + * encoding in the super block. + * @journal: the recovery journal. + * + * Return: the state of the journal. 
+ */ struct recovery_journal_state_7_0 -record_vdo_recovery_journal(const struct recovery_journal *journal) +vdo_record_recovery_journal(const struct recovery_journal *journal) { struct recovery_journal_state_7_0 state = { .logical_blocks_used = journal->logical_blocks_used, .block_map_data_blocks = journal->block_map_data_blocks, }; - if (is_vdo_state_saved(&journal->state)) { - // If the journal is saved, we should start one past the active - // block (since the active block is not guaranteed to be empty). + if (vdo_is_state_saved(&journal->state)) { + /* + * If the journal is saved, we should start one past the active + * block (since the active block is not guaranteed to be empty). + */ state.journal_start = journal->tail; } else { - // When we're merely suspended or have gone read-only, we must - // record the first block that might have entries that need to - // be applied. + /* + * When we're merely suspended or have gone read-only, we must + * record the first block that might have entries that need to + * be applied. + */ state.journal_start = get_recovery_journal_head(journal); } @@ -617,12 +718,11 @@ record_vdo_recovery_journal(const struct recovery_journal *journal) } /** - * Advance the tail of the journal. - * - * @param journal The journal whose tail should be advanced + * advance_tail() - Advance the tail of the journal. + * @journal: The journal whose tail should be advanced. * - * @return true if the tail was advanced - **/ + * Return: true if the tail was advanced. + */ static bool advance_tail(struct recovery_journal *journal) { journal->active_block = pop_free_list(journal); @@ -632,22 +732,21 @@ static bool advance_tail(struct recovery_journal *journal) list_move_tail(&journal->active_block->list_node, &journal->active_tail_blocks); - initialize_vdo_recovery_block(journal->active_block); + vdo_initialize_recovery_block(journal->active_block); set_journal_tail(journal, journal->tail + 1); - advance_vdo_block_map_era(journal->block_map, journal->tail); + vdo_advance_block_map_era(journal->block_map, journal->tail); return true; } /** - * Check whether there is space to make a given type of entry. + * check_for_entry_space() - Check whether there is space to make a given type + * of entry. + * @journal: The journal to check. + * @increment: Set to true if the desired entry is an increment. * - * @param journal The journal to check - * @param increment Set to true if the desired entry is an - * increment - * - * @return true if there is space in the journal to make an - * entry of the specified type - **/ + * Return: true if there is space in the journal to make an entry of the + * specified type. + */ static bool check_for_entry_space(struct recovery_journal *journal, bool increment) { @@ -661,22 +760,21 @@ static bool check_for_entry_space(struct recovery_journal *journal, } /** - * Prepare the currently active block to receive an entry and check whether - * an entry of the given type may be assigned at this time. - * - * @param journal The journal receiving an entry - * @param increment Set to true if the desired entry is an - * increment + * prepare_to_assign_entry() - Prepare the currently active block to receive + * an entry and check whether an entry of the + * given type may be assigned at this time. + * @journal: The journal receiving an entry. + * @increment: Set to true if the desired entry is an increment. 
* - * @return true if there is space in the journal to store an - * entry of the specified type - **/ + * Return: true if there is space in the journal to store an entry of the + * specified type. + */ static bool prepare_to_assign_entry(struct recovery_journal *journal, bool increment) { if (!check_for_entry_space(journal, increment)) { if (!increment) { - // There must always be room to make a decrement entry. + /* There must always be room to make a decrement entry. */ uds_log_error("No space for decrement entry in recovery journal"); enter_journal_read_only_mode(journal, VDO_RECOVERY_JOURNAL_FULL); @@ -684,18 +782,18 @@ static bool prepare_to_assign_entry(struct recovery_journal *journal, return false; } - if (is_vdo_recovery_block_full(journal->active_block) + if (vdo_is_recovery_block_full(journal->active_block) && !advance_tail(journal)) { return false; } - if (!is_vdo_recovery_block_empty(journal->active_block)) { + if (!vdo_is_recovery_block_empty(journal->active_block)) { return true; } if ((journal->tail - get_recovery_journal_head(journal)) > journal->size) { - // Cannot use this block since the journal is full. + /* Cannot use this block since the journal is full. */ journal->events.disk_full++; return false; } @@ -707,7 +805,7 @@ static bool prepare_to_assign_entry(struct recovery_journal *journal, * after any slab journal entries have been made, the per-entry lock * for the block map entry serves to protect those as well. */ - initialize_vdo_lock_count(journal->lock_counter, + vdo_initialize_lock_count(journal->lock_counter, journal->active_block->block_number, journal->entries_per_block + 1); return true; @@ -716,14 +814,14 @@ static bool prepare_to_assign_entry(struct recovery_journal *journal, static void write_blocks(struct recovery_journal *journal); /** - * Queue a block for writing. The block is expected to be full. If the block - * is currently writing, this is a noop as the block will be queued for - * writing when the write finishes. The block must not currently be queued - * for writing. + * schedule_block_write() - Queue a block for writing. + * @journal: The journal in question. + * @block: The block which is now ready to write. * - * @param journal The journal in question - * @param block The block which is now ready to write - **/ + * The block is expected to be full. If the block is currently writing, this + * is a noop as the block will be queued for writing when the write finishes. + * The block must not currently be queued for writing. + */ static void schedule_block_write(struct recovery_journal *journal, struct recovery_journal_block *block) { @@ -734,7 +832,7 @@ static void schedule_block_write(struct recovery_journal *journal, } result = enqueue_waiter(&journal->pending_writes, - &block->write_waiter); + &block->write_waiter); if (result != VDO_SUCCESS) { enter_journal_read_only_mode(journal, result); } @@ -747,19 +845,20 @@ static void schedule_block_write(struct recovery_journal *journal, } /** - * Release a reference to a journal block. - * - * @param block The journal block from which to release a reference - **/ + * release_journal_block_reference() - Release a reference to a journal block. + * @block: The journal block from which to release a reference. + */ static void release_journal_block_reference(struct recovery_journal_block *block) { - release_vdo_journal_zone_reference(block->journal->lock_counter, + vdo_release_journal_zone_reference(block->journal->lock_counter, block->block_number); } /** - * Implements waiter_callback. 
Assign an entry waiter to the active block. - **/ + * assign_entry() - Assign an entry waiter to the active block. + * + * Implements waiter_callback. + */ static void assign_entry(struct waiter *waiter, void *context) { int result; @@ -768,35 +867,37 @@ static void assign_entry(struct waiter *waiter, void *context) (struct recovery_journal_block *)context; struct recovery_journal *journal = block->journal; - // Record the point at which we will make the journal entry. + /* Record the point at which we will make the journal entry. */ data_vio->recovery_journal_point = (struct journal_point) { .sequence_number = block->sequence_number, .entry_count = block->entry_count, }; switch (data_vio->operation.type) { - case DATA_INCREMENT: + case VDO_JOURNAL_DATA_INCREMENT: if (data_vio->operation.state != VDO_MAPPING_STATE_UNMAPPED) { journal->logical_blocks_used++; } journal->pending_decrement_count++; break; - case DATA_DECREMENT: + case VDO_JOURNAL_DATA_DECREMENT: if (data_vio->operation.state != VDO_MAPPING_STATE_UNMAPPED) { journal->logical_blocks_used--; } - // Per-entry locks need not be held for decrement entries since - // the lock held for the incref entry will protect this entry - // as well. + /* + * Per-entry locks need not be held for decrement entries since + * the lock held for the incref entry will protect this entry + * as well. + */ release_journal_block_reference(block); ASSERT_LOG_ONLY((journal->pending_decrement_count != 0), "decrement follows increment"); journal->pending_decrement_count--; break; - case BLOCK_MAP_INCREMENT: + case VDO_JOURNAL_BLOCK_MAP_INCREMENT: journal->block_map_data_blocks++; break; @@ -809,24 +910,25 @@ static void assign_entry(struct waiter *waiter, void *context) } journal->available_space--; - result = enqueue_vdo_recovery_block_entry(block, data_vio); + result = vdo_enqueue_recovery_block_entry(block, data_vio); if (result != VDO_SUCCESS) { enter_journal_read_only_mode(journal, result); continue_data_vio(data_vio, result); } - if (is_vdo_recovery_block_full(block)) { - // The block is full, so we can write it anytime henceforth. If - // it is already committing, we'll queue it for writing when it - // comes back. + if (vdo_is_recovery_block_full(block)) { + /* + * The block is full, so we can write it anytime henceforth. If + * it is already committing, we'll queue it for writing when it + * comes back. + */ schedule_block_write(journal, block); } - // Force out slab journal tail blocks when threshold is reached. + /* Force out slab journal tail blocks when threshold is reached. */ check_slab_journal_commit_threshold(journal); } -/**********************************************************************/ static bool assign_entries_from_queue(struct recovery_journal *journal, struct wait_queue *queue, bool increment) { @@ -841,11 +943,10 @@ static bool assign_entries_from_queue(struct recovery_journal *journal, return true; } -/**********************************************************************/ static void assign_entries(struct recovery_journal *journal) { if (journal->adding_entries) { - // Protect against re-entrancy. + /* Protect against re-entrancy. */ return; } @@ -856,31 +957,35 @@ static void assign_entries(struct recovery_journal *journal) true); } - // Now that we've finished with entries, see if we have a batch of - // blocks to write. + /* + * Now that we've finished with entries, see if we have a batch of + * blocks to write. 
+ */ write_blocks(journal); journal->adding_entries = false; } /** - * Prepare an in-memory journal block to be reused now that it has been fully - * committed. - * - * @param block The block to be recycled - **/ + * recycle_journal_block() - Prepare an in-memory journal block to be reused + * now that it has been fully committed. + * @block: The block to be recycled. + */ static void recycle_journal_block(struct recovery_journal_block *block) { struct recovery_journal *journal = block->journal; block_count_t i; + list_move_tail(&block->list_node, &journal->free_tail_blocks); - // Release any unused entry locks. + /* Release any unused entry locks. */ for (i = block->entry_count; i < journal->entries_per_block; i++) { release_journal_block_reference(block); } - // Release our own lock against reaping now that the block is completely - // committed, or we're giving up because we're in read-only mode. + /* + * Release our own lock against reaping now that the block is completely + * committed, or we're giving up because we're in read-only mode. + */ if (block->entry_count > 0) { release_journal_block_reference(block); } @@ -891,16 +996,19 @@ static void recycle_journal_block(struct recovery_journal_block *block) } /** - * waiter_callback implementation invoked whenever a VIO is to be released - * from the journal because its entry was committed to disk. - **/ + * continue_committed_waiter() - invoked whenever a VIO is to be released from + * the journal because its entry was committed + * to disk. + * + * Implements waiter_callback. + */ static void continue_committed_waiter(struct waiter *waiter, void *context) { struct data_vio *data_vio = waiter_as_data_vio(waiter); struct recovery_journal *journal = (struct recovery_journal *)context; int result = (vdo_is_read_only(journal->read_only_notifier) ? VDO_READ_ONLY : VDO_SUCCESS); - ASSERT_LOG_ONLY(before_vdo_journal_point(&journal->commit_point, + ASSERT_LOG_ONLY(vdo_before_journal_point(&journal->commit_point, &data_vio->recovery_journal_point), "DataVIOs released from recovery journal in order. Recovery journal point is (%llu, %u), but commit waiter point is (%llu, %u)", (unsigned long long) journal->commit_point.sequence_number, @@ -913,13 +1021,13 @@ static void continue_committed_waiter(struct waiter *waiter, void *context) } /** - * Notify any VIOs whose entries have now committed. - * - * @param journal The recovery journal to update - **/ + * notify_commit_waiters() - Notify any VIOs whose entries have now committed. + * @journal: The recovery journal to update. + */ static void notify_commit_waiters(struct recovery_journal *journal) { struct list_head *entry; + if (list_empty(&journal->active_tail_blocks)) { return; } @@ -939,20 +1047,22 @@ static void notify_commit_waiters(struct recovery_journal *journal) notify_all_waiters(&block->entry_waiters, continue_committed_waiter, journal); - } else if (is_vdo_recovery_block_dirty(block) - || !is_vdo_recovery_block_full(block)) { - // Stop at partially-committed or partially-filled - // blocks. + } else if (vdo_is_recovery_block_dirty(block) + || !vdo_is_recovery_block_full(block)) { + /* + * Stop at partially-committed or partially-filled + * blocks. + */ return; } } } /** - * Recycle any journal blocks which have been fully committed. - * - * @param journal The recovery journal to update - **/ + * recycle_journal_blocks() - Recycle any journal blocks which have been fully + * committed. + * @journal: The recovery journal to update. 
+ */ static void recycle_journal_blocks(struct recovery_journal *journal) { while (!list_empty(&journal->active_tail_blocks)) { @@ -960,15 +1070,17 @@ static void recycle_journal_blocks(struct recovery_journal *journal) = vdo_recovery_block_from_list_entry(journal->active_tail_blocks.next); if (block->committing) { - // Don't recycle committing blocks. - return; + /* Don't recycle committing blocks. */ + return; } if (!vdo_is_read_only(journal->read_only_notifier) - && (is_vdo_recovery_block_dirty(block) - || !is_vdo_recovery_block_full(block))) { - // Don't recycle partially written or partially full - // blocks, except in read-only mode. + && (vdo_is_recovery_block_dirty(block) + || !vdo_is_recovery_block_full(block))) { + /* + * Don't recycle partially written or partially full + * blocks, except in read-only mode. + */ return; } recycle_journal_block(block); @@ -976,17 +1088,19 @@ static void recycle_journal_blocks(struct recovery_journal *journal) } /** - * Handle post-commit processing. This is the callback registered by - * write_block(). If more entries accumulated in the block being committed - * while the commit was in progress, another commit will be initiated. + * complete_write() - Handle post-commit processing. + * @completion: The completion of the VIO writing this block. * - * @param completion The completion of the VIO writing this block - **/ + * This is the callback registered by write_block(). If more entries + * accumulated in the block being committed while the commit was in progress, + * another commit will be initiated. + */ static void complete_write(struct vdo_completion *completion) { struct recovery_journal_block *block = completion->parent; struct recovery_journal *journal = block->journal; struct recovery_journal_block *last_active_block; + assert_on_journal_thread(journal, __func__); journal->pending_write_count -= 1; @@ -996,8 +1110,10 @@ static void complete_write(struct vdo_completion *completion) block->entries_in_commit = 0; block->committing = false; - // If this block is the latest block to be acknowledged, record that - // fact. + /* + * If this block is the latest block to be acknowledged, record that + * fact. + */ if (block->sequence_number > journal->last_write_acknowledged) { journal->last_write_acknowledged = block->sequence_number; } @@ -1010,23 +1126,26 @@ static void complete_write(struct vdo_completion *completion) notify_commit_waiters(journal); - // Is this block now full? Reaping, and adding entries, might have - // already sent it off for rewriting; else, queue it for rewrite. - if (is_vdo_recovery_block_dirty(block) && is_vdo_recovery_block_full(block)) { + /* + * Is this block now full? Reaping, and adding entries, might have + * already sent it off for rewriting; else, queue it for rewrite. 
+ */ + if (vdo_is_recovery_block_dirty(block) && vdo_is_recovery_block_full(block)) { schedule_block_write(journal, block); } recycle_journal_blocks(journal); write_blocks(journal); - vdo_check_for_drain_complete(journal); + check_for_drain_complete(journal); } -/**********************************************************************/ static void handle_write_error(struct vdo_completion *completion) { struct recovery_journal_block *block = completion->parent; struct recovery_journal *journal = block->journal; + + record_metadata_io_error(as_vio(completion)); uds_log_error_strerror(completion->result, "cannot write recovery journal block %llu", (unsigned long long) block->sequence_number); @@ -1034,9 +1153,20 @@ static void handle_write_error(struct vdo_completion *completion) complete_write(completion); } +static void complete_write_endio(struct bio *bio) +{ + struct vio *vio = bio->bi_private; + struct recovery_journal_block *block = vio->completion.parent; + struct recovery_journal *journal = block->journal; + + continue_vio_after_io(vio, complete_write, journal->thread_id); +} + /** - * Issue a block for writing. Implements waiter_callback. - **/ + * write_block() - Issue a block for writing. + * + * Implements waiter_callback. + */ static void write_block(struct waiter *waiter, void *context __always_unused) { int result; @@ -1048,7 +1178,8 @@ static void write_block(struct waiter *waiter, void *context __always_unused) return; } - result = commit_vdo_recovery_block(block, complete_write, + result = vdo_commit_recovery_block(block, + complete_write_endio, handle_write_error); if (result != VDO_SUCCESS) { enter_journal_read_only_mode(block->journal, result); @@ -1056,10 +1187,9 @@ static void write_block(struct waiter *waiter, void *context __always_unused) } /** - * Attempt to commit blocks, according to write policy. - * - * @param journal The recovery journal - **/ + * write_blocks() - Attempt to commit blocks, according to write policy. + * @journal: The recovery journal. + */ static void write_blocks(struct recovery_journal *journal) { assert_on_journal_thread(journal, __func__); @@ -1078,26 +1208,39 @@ static void write_blocks(struct recovery_journal *journal) return; } - // Write all the full blocks. + /* Write all the full blocks. */ notify_all_waiters(&journal->pending_writes, write_block, NULL); - // Do we need to write the active block? Only if we have no outstanding - // writes, even after issuing all of the full writes. + /* + * Do we need to write the active block? Only if we have no outstanding + * writes, even after issuing all of the full writes. + */ if ((journal->pending_write_count == 0) - && can_commit_vdo_recovery_block(journal->active_block)) { + && vdo_can_commit_recovery_block(journal->active_block)) { write_block(&journal->active_block->write_waiter, NULL); } } -/**********************************************************************/ -void add_vdo_recovery_journal_entry(struct recovery_journal *journal, +/** + * vdo_add_recovery_journal_entry() - Add an entry to a recovery journal. + * @journal: The journal in which to make an entry. + * @data_vio: The data_vio for which to add the entry. The entry will be taken + * from the logical and new_mapped fields of the data_vio. The + * data_vio's recovery_sequence_number field will be set to the + * sequence number of the journal block in which the entry was + * made. + * + * This method is asynchronous. The data_vio will not be called back until the + * entry is committed to the on-disk journal. 
+ */ +void vdo_add_recovery_journal_entry(struct recovery_journal *journal, struct data_vio *data_vio) { bool increment; int result; assert_on_journal_thread(journal, __func__); - if (!is_vdo_state_normal(&journal->state)) { + if (!vdo_is_state_normal(&journal->state)) { continue_data_vio(data_vio, VDO_INVALID_ADMIN_STATE); return; } @@ -1107,12 +1250,12 @@ void add_vdo_recovery_journal_entry(struct recovery_journal *journal, return; } - increment = is_vdo_journal_increment_operation(data_vio->operation.type); + increment = vdo_is_journal_increment_operation(data_vio->operation.type); ASSERT_LOG_ONLY((!increment || (data_vio->recovery_sequence_number == 0)), "journal lock not held for increment"); - advance_vdo_journal_point(&journal->append_point, + vdo_advance_journal_point(&journal->append_point, journal->entries_per_block); result = enqueue_data_vio((increment ? &journal->increment_waiters : &journal->decrement_waiters), @@ -1127,29 +1270,33 @@ void add_vdo_recovery_journal_entry(struct recovery_journal *journal, } /** - * Conduct a sweep on a recovery journal to reclaim unreferenced blocks. - * - * @param journal The recovery journal - **/ + * reap_recovery_journal() - Conduct a sweep on a recovery journal to reclaim + * unreferenced blocks. + * @journal: The recovery journal. + */ static void reap_recovery_journal(struct recovery_journal *journal) { if (journal->reaping) { - // We already have an outstanding reap in progress. We need to - // wait for it to finish. + /* + * We already have an outstanding reap in progress. We need to + * wait for it to finish. + */ return; } - if (is_vdo_state_quiescent(&journal->state)) { - // We are supposed to not do IO. Don't botch it by reaping. + if (vdo_is_state_quiescent(&journal->state)) { + /* We are supposed to not do IO. Don't botch it by reaping. */ return; } - // Start reclaiming blocks only when the journal head has no - // references. Then stop when a block is referenced. + /* + * Start reclaiming blocks only when the journal head has no + * references. Then stop when a block is referenced. + */ while ((journal->block_map_reap_head < journal->last_write_acknowledged) - && !is_vdo_lock_locked(journal->lock_counter, + && !vdo_is_lock_locked(journal->lock_counter, journal->block_map_head_block_number, - ZONE_TYPE_LOGICAL)) { + VDO_ZONE_TYPE_LOGICAL)) { journal->block_map_reap_head++; if (++journal->block_map_head_block_number == journal->size) { journal->block_map_head_block_number = 0; @@ -1157,9 +1304,9 @@ static void reap_recovery_journal(struct recovery_journal *journal) } while ((journal->slab_journal_reap_head < journal->last_write_acknowledged) - && !is_vdo_lock_locked(journal->lock_counter, + && !vdo_is_lock_locked(journal->lock_counter, journal->slab_journal_head_block_number, - ZONE_TYPE_PHYSICAL)) { + VDO_ZONE_TYPE_PHYSICAL)) { journal->slab_journal_reap_head++; if (++journal->slab_journal_head_block_number == journal->size) { journal->slab_journal_head_block_number = 0; @@ -1168,7 +1315,7 @@ static void reap_recovery_journal(struct recovery_journal *journal) if ((journal->block_map_reap_head == journal->block_map_head) && (journal->slab_journal_reap_head == journal->slab_journal_head)) { - // Nothing happened. + /* Nothing happened. */ return; } @@ -1180,29 +1327,48 @@ static void reap_recovery_journal(struct recovery_journal *journal) * lock. 
*/ journal->reaping = true; - launch_flush_vio(journal->flush_vio, complete_reaping, - handle_flush_error); + submit_flush_vio(journal->flush_vio, flush_endio, handle_flush_error); } -/**********************************************************************/ -void acquire_vdo_recovery_journal_block_reference(struct recovery_journal *journal, +/** + * vdo_acquire_recovery_journal_block_reference() - Acquire a reference to a + * recovery journal block + * from somewhere other than + * the journal itself. + * @journal: The recovery journal. + * @sequence_number: The journal sequence number of the referenced block. + * @zone_type: The type of the zone making the adjustment. + * @zone_id: The ID of the zone making the adjustment. + */ +void vdo_acquire_recovery_journal_block_reference(struct recovery_journal *journal, sequence_number_t sequence_number, enum vdo_zone_type zone_type, zone_count_t zone_id) { block_count_t block_number; + if (sequence_number == 0) { return; } block_number = - get_vdo_recovery_journal_block_number(journal, sequence_number); - acquire_vdo_lock_count_reference(journal->lock_counter, block_number, + vdo_get_recovery_journal_block_number(journal, sequence_number); + vdo_acquire_lock_count_reference(journal->lock_counter, block_number, zone_type, zone_id); } -/**********************************************************************/ -void release_vdo_recovery_journal_block_reference(struct recovery_journal *journal, +/** + * vdo_release_recovery_journal_block_reference() - Release a reference to a + * recovery journal block from somewhere other than the journal itself. + * @journal: The recovery journal. + * @sequence_number: The journal sequence number of the referenced block. + * @zone_type: The type of the zone making the adjustment. + * @zone_id: The ID of the zone making the adjustment. + * + * If this is the last reference for a given zone type, an attempt will be + * made to reap the journal. + */ +void vdo_release_recovery_journal_block_reference(struct recovery_journal *journal, sequence_number_t sequence_number, enum vdo_zone_type zone_type, zone_count_t zone_id) @@ -1214,59 +1380,82 @@ void release_vdo_recovery_journal_block_reference(struct recovery_journal *journ } block_number = - get_vdo_recovery_journal_block_number(journal, sequence_number); - release_vdo_lock_count_reference(journal->lock_counter, block_number, + vdo_get_recovery_journal_block_number(journal, sequence_number); + vdo_release_lock_count_reference(journal->lock_counter, block_number, zone_type, zone_id); } -/**********************************************************************/ +/** + * vdo_release_journal_per_entry_lock_from_other_zone() - Release a single + * per-entry reference + * count for a recovery + * journal block. + * @journal: The recovery journal. + * @sequence_number: The journal sequence number of the referenced block. + * + * Context: This method may be called from any zone (but shouldn't be called + * from the journal zone as it would be inefficient). 
+ */ void vdo_release_journal_per_entry_lock_from_other_zone(struct recovery_journal *journal, sequence_number_t sequence_number) { block_count_t block_number; + if (sequence_number == 0) { return; } block_number = - get_vdo_recovery_journal_block_number(journal, sequence_number); - release_vdo_journal_zone_reference_from_other_zone(journal->lock_counter, + vdo_get_recovery_journal_block_number(journal, sequence_number); + vdo_release_journal_zone_reference_from_other_zone(journal->lock_counter, block_number); } /** - * Initiate a drain. + * initiate_drain() - Initiate a drain. * * Implements vdo_admin_initiator. - **/ + */ static void initiate_drain(struct admin_state *state) { - vdo_check_for_drain_complete(container_of(state, - struct recovery_journal, - state)); + check_for_drain_complete(container_of(state, + struct recovery_journal, + state)); } -/**********************************************************************/ -void drain_vdo_recovery_journal(struct recovery_journal *journal, +/** + * vdo_drain_recovery_journal() - Drain recovery journal I/O. + * @journal: The journal to drain. + * @operation: The drain operation (suspend or save). + * @parent: The completion to finish once the journal is drained. + * + * All uncommitted entries will be written out. + */ +void vdo_drain_recovery_journal(struct recovery_journal *journal, const struct admin_state_code *operation, struct vdo_completion *parent) { assert_on_journal_thread(journal, __func__); - start_vdo_draining(&journal->state, operation, parent, initiate_drain); + vdo_start_draining(&journal->state, operation, parent, initiate_drain); } -/**********************************************************************/ -void resume_vdo_recovery_journal(struct recovery_journal *journal, +/** + * vdo_resume_recovery_journal() - Resume a recovery journal which has been + * drained. + * @journal: The journal to resume. + * @parent: The completion to finish once the journal is resumed. + */ +void vdo_resume_recovery_journal(struct recovery_journal *journal, struct vdo_completion *parent) { bool saved; assert_on_journal_thread(journal, __func__); - saved = is_vdo_state_saved(&journal->state); - set_vdo_completion_result(parent, resume_vdo_if_quiescent(&journal->state)); + saved = vdo_is_state_saved(&journal->state); + vdo_set_completion_result(parent, vdo_resume_if_quiescent(&journal->state)); if (vdo_is_read_only(journal->read_only_notifier)) { - finish_vdo_completion(parent, VDO_READ_ONLY); + vdo_finish_completion(parent, VDO_READ_ONLY); return; } @@ -1274,36 +1463,53 @@ void resume_vdo_recovery_journal(struct recovery_journal *journal, initialize_journal_state(journal); } - if (resume_vdo_lock_counter(journal->lock_counter)) { - // We might have missed a notification. + if (vdo_resume_lock_counter(journal->lock_counter)) { + /* We might have missed a notification. */ reap_recovery_journal(journal); } - complete_vdo_completion(parent); + vdo_complete_completion(parent); } -/**********************************************************************/ +/** + * vdo_get_recovery_journal_logical_blocks_used() - Get the number of logical + * blocks in use by the VDO. + * @journal: The journal. + * + * Return: The number of logical blocks in use by the VDO. 
+ */ block_count_t -get_vdo_recovery_journal_logical_blocks_used(const struct recovery_journal *journal) +vdo_get_recovery_journal_logical_blocks_used(const struct recovery_journal *journal) { return journal->logical_blocks_used; } -/**********************************************************************/ +/** + * vdo_get_recovery_journal_statistics() - Get the current statistics from the + * recovery journal. + * @journal: The recovery journal to query. + * + * Return: A copy of the current statistics for the journal. + */ struct recovery_journal_statistics -get_vdo_recovery_journal_statistics(const struct recovery_journal *journal) +vdo_get_recovery_journal_statistics(const struct recovery_journal *journal) { return journal->events; } -/**********************************************************************/ -void dump_vdo_recovery_journal_statistics(const struct recovery_journal *journal) +/** + * vdo_dump_recovery_journal_statistics() - Dump some current statistics and + * other debug info from the recovery + * journal. + * @journal: The recovery journal to dump. + */ +void vdo_dump_recovery_journal_statistics(const struct recovery_journal *journal) { const struct list_head *head; struct list_head *entry; struct recovery_journal_statistics stats = - get_vdo_recovery_journal_statistics(journal); + vdo_get_recovery_journal_statistics(journal); uds_log_info("Recovery Journal"); uds_log_info(" block_map_head=%llu slab_journal_head=%llu last_write_acknowledged=%llu tail=%llu block_map_reap_head=%llu slab_journal_reap_head=%llu disk_full=%llu slab_journal_commits_requested=%llu increment_waiters=%zu decrement_waiters=%zu", (unsigned long long) journal->block_map_head, @@ -1328,6 +1534,6 @@ void dump_vdo_recovery_journal_statistics(const struct recovery_journal *journal uds_log_info(" active blocks:"); head = &journal->active_tail_blocks; list_for_each(entry, head) { - dump_vdo_recovery_block(vdo_recovery_block_from_list_entry(entry)); + vdo_dump_recovery_block(vdo_recovery_block_from_list_entry(entry)); } } diff --git a/vdo/recovery-journal.h b/vdo/recovery-journal.h new file mode 100644 index 00000000..47901d89 --- /dev/null +++ b/vdo/recovery-journal.h @@ -0,0 +1,313 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright Red Hat + */ + +#ifndef RECOVERY_JOURNAL_H +#define RECOVERY_JOURNAL_H + +#include + +#include "numeric.h" + +#include "admin-state.h" +#include "completion.h" +#include "flush.h" +#include "journal-point.h" +#include "lock-counter.h" +#include "read-only-notifier.h" +#include "recovery-journal.h" +#include "recovery-journal-format.h" +#include "statistics.h" +#include "types.h" +#include "vdo-layout.h" +#include "wait-queue.h" + +/** + * DOC: recovery journal. + * + * The recovery_journal provides a log of all block mapping and reference count + * changes which have not yet been stably written to the block map or slab + * journals. This log helps to reduce the write amplification of writes by + * providing amortization of slab journal and block map page updates. + * + * The journal consists of a set of on-disk blocks arranged as a circular log + * with monotonically increasing sequence numbers. Three sequence numbers serve + * to define the active extent of the journal. The 'head' is the oldest active + * block in the journal. The 'tail' is the end of the half-open interval + * containing the active blocks. 'active' is the number of the block actively + * receiving entries. In an empty journal, head == active == tail. 
Once any + * entries are added, tail = active + 1, and head may be any value in the + * interval [tail - size, active]. + * + * The journal also contains a set of in-memory blocks which are used to buffer + * up entries until they can be committed. In general the number of in-memory + * blocks ('tail_buffer_count') will be less than the on-disk size. Each + * in-memory block is also a vdo_completion. Each in-memory block has a vio + * which is used to commit that block to disk. The vio's data is the on-disk + * representation of the journal block. In addition each in-memory block has a + * buffer which is used to accumulate entries while a partial commit of the + * block is in progress. In-memory blocks are kept on two rings. Free blocks + * live on the 'free_tail_blocks' ring. When a block becomes active (see below) + * it is moved to the 'active_tail_blocks' ring. When a block is fully + * committed, it is moved back to the 'free_tail_blocks' ring. + * + * When entries are added to the journal, they are added to the active + * in-memory block, as indicated by the 'active_block' field. If the caller + * wishes to wait for the entry to be committed, the requesting VIO will be + * attached to the in-memory block to which the caller's entry was added. If + * the caller does wish to wait, or if the entry filled the active block, an + * attempt will be made to commit that block to disk. If there is already + * another commit in progress, the attempt will be ignored and then + * automatically retried when the in-progress commit completes. If there is no + * commit in progress, any data_vios waiting on the block are transferred to + * the block's vio which is then written, automatically waking all of the + * waiters when it completes. When the write completes, any entries which + * accumulated in the block are copied to the vio's data buffer. + * + * Finally, the journal maintains a set of counters, one for each on disk + * journal block. These counters are used as locks to prevent premature reaping + * of journal blocks. Each time a new sequence number is used, the counter for + * the corresponding block is incremented. The counter is subsequently + * decremented when that block is filled and then committed for the last + * time. This prevents blocks from being reaped while they are still being + * updated. The counter is also incremented once for each entry added to a + * block, and decremented once each time the block map is updated in memory for + * that request. This prevents blocks from being reaped while their VIOs are + * still active. Finally, each in-memory block map page tracks the oldest + * journal block that contains entries corresponding to uncommitted updates to + * that block map page. Each time an in-memory block map page is updated, it + * checks if the journal block for the VIO is earlier than the one it + * references, in which case it increments the count on the earlier journal + * block and decrements the count on the later journal block, maintaining a + * lock on the oldest journal block containing entries for that page. When a + * block map page has been flushed from the cache, the counter for the journal + * block it references is decremented. Whenever the counter for the head block + * goes to 0, the head is advanced until it comes to a block whose counter is + * not 0 or until it reaches the active block. This is the mechanism for + * reclaiming journal space on disk. 
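To make the head/active/tail interval arithmetic above concrete, the following is a minimal standalone sketch. The numbers (a 4-block journal with head 3 and active block 6) are purely hypothetical; the only facts taken from this header are the invariants stated in the paragraph above and the power-of-two slot mapping used by vdo_get_recovery_journal_block_number() later in this file.

#include <assert.h>
#include <stdint.h>

int main(void)
{
	/* Hypothetical journal of 4 on-disk blocks (size is always a power of two). */
	uint64_t size = 4;
	uint64_t active = 6;           /* block currently receiving entries */
	uint64_t tail = active + 1;    /* end of the half-open active interval */
	uint64_t head = 3;             /* oldest block still holding locks */

	/* Once any entries exist, head must lie in [tail - size, active]. */
	assert(head >= tail - size && head <= active);

	/* A sequence number maps to an on-disk slot via its low-order bits. */
	assert((active & (size - 1)) == 2);

	return 0;
}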
+ * + * If there is no in-memory space when a VIO attempts to add an entry, the VIO + * will be attached to the 'commit_completion' and will be woken the next time + * a full block has committed. If there is no on-disk space when a VIO attempts + * to add an entry, the VIO will be attached to the 'reap_completion', and will + * be woken the next time a journal block is reaped. + */ + +struct recovery_journal { + /* The thread ID of the journal zone */ + thread_id_t thread_id; + /* The slab depot which can hold locks on this journal */ + struct slab_depot *depot; + /* The block map which can hold locks on this journal */ + struct block_map *block_map; + /* The queue of vios waiting to make increment entries */ + struct wait_queue increment_waiters; + /* The queue of vios waiting to make decrement entries */ + struct wait_queue decrement_waiters; + /* The number of free entries in the journal */ + uint64_t available_space; + /* The number of decrement entries which need to be made */ + vio_count_t pending_decrement_count; + /* + * Whether the journal is adding entries from the increment or + * decrement waiters queues + */ + bool adding_entries; + /* The notifier for read-only mode */ + struct read_only_notifier *read_only_notifier; + /* The administrative state of the journal */ + struct admin_state state; + /* Whether a reap is in progress */ + bool reaping; + /* The partition which holds the journal on disk */ + struct partition *partition; + /* The oldest active block in the journal on disk for block map rebuild + */ + sequence_number_t block_map_head; + /* The oldest active block in the journal on disk for slab journal + * replay */ + sequence_number_t slab_journal_head; + /* The newest block in the journal on disk to which a write has + * finished */ + sequence_number_t last_write_acknowledged; + /* The end of the half-open interval of the active journal */ + sequence_number_t tail; + /* The point at which the last entry will have been added */ + struct journal_point append_point; + /* The journal point of the vio most recently released from the journal + */ + struct journal_point commit_point; + /* The nonce of the VDO */ + nonce_t nonce; + /* The number of recoveries completed by the VDO */ + uint8_t recovery_count; + /* The number of entries which fit in a single block */ + journal_entry_count_t entries_per_block; + /* Unused in-memory journal blocks */ + struct list_head free_tail_blocks; + /* In-memory journal blocks with records */ + struct list_head active_tail_blocks; + /* A pointer to the active block (the one we are adding entries to now) + */ + struct recovery_journal_block *active_block; + /* Journal blocks that need writing */ + struct wait_queue pending_writes; + /* The new block map reap head after reaping */ + sequence_number_t block_map_reap_head; + /* The head block number for the block map rebuild range */ + block_count_t block_map_head_block_number; + /* The new slab journal reap head after reaping */ + sequence_number_t slab_journal_reap_head; + /* The head block number for the slab journal replay range */ + block_count_t slab_journal_head_block_number; + /* The data-less vio, usable only for flushing */ + struct vio *flush_vio; + /* The number of blocks in the on-disk journal */ + block_count_t size; + /* The number of logical blocks that are in-use */ + block_count_t logical_blocks_used; + /* The number of block map pages that are allocated */ + block_count_t block_map_data_blocks; + /* The number of journal blocks written but not yet acknowledged */ + 
block_count_t pending_write_count; + /* The threshold at which slab journal tail blocks will be written out + */ + block_count_t slab_journal_commit_threshold; + /* Counters for events in the journal that are reported as statistics + */ + struct recovery_journal_statistics events; + /* The locks for each on-disk block */ + struct lock_counter *lock_counter; +}; + +/** + * vdo_get_recovery_journal_block_number() - Get the physical block number for + * a given sequence number. + * @journal: The journal. + * @sequence: The sequence number of the desired block. + * + * Return: The block number corresponding to the sequence number. + */ +static inline physical_block_number_t __must_check +vdo_get_recovery_journal_block_number(const struct recovery_journal *journal, + sequence_number_t sequence) +{ + /* + * Since journal size is a power of two, the block number modulus can + * just be extracted from the low-order bits of the sequence. + */ + return vdo_compute_recovery_journal_block_number(journal->size, sequence); +} + +/** + * vdo_compute_recovery_journal_check_byte() - Compute the check byte for a + * given sequence number. + * @journal: The journal. + * @sequence: The sequence number. + * + * Return: The check byte corresponding to the sequence number. + */ +static inline uint8_t __must_check +vdo_compute_recovery_journal_check_byte(const struct recovery_journal *journal, + sequence_number_t sequence) +{ + /* The check byte must change with each trip around the journal. */ + return (((sequence / journal->size) & 0x7F) | 0x80); +} + +/** + * vdo_is_journal_increment_operation() - Return whether a given + * journal_operation is an increment + * type. + * @operation: The operation in question. + * + * Return: true if the type is an increment type. + */ +static inline bool +vdo_is_journal_increment_operation(enum journal_operation operation) +{ + return ((operation == VDO_JOURNAL_DATA_INCREMENT) + || (operation == VDO_JOURNAL_BLOCK_MAP_INCREMENT)); +} + +int __must_check +vdo_decode_recovery_journal(struct recovery_journal_state_7_0 state, + nonce_t nonce, + struct vdo *vdo, + struct partition *partition, + uint64_t recovery_count, + block_count_t journal_size, + block_count_t tail_buffer_size, + struct read_only_notifier *read_only_notifier, + const struct thread_config *thread_config, + struct recovery_journal **journal_ptr); + +void vdo_free_recovery_journal(struct recovery_journal *journal); + +void vdo_set_recovery_journal_partition(struct recovery_journal *journal, + struct partition *partition); + +void +vdo_initialize_recovery_journal_post_recovery(struct recovery_journal *journal, + uint64_t recovery_count, + sequence_number_t tail); + +void +vdo_initialize_recovery_journal_post_rebuild(struct recovery_journal *journal, + uint64_t recovery_count, + sequence_number_t tail, + block_count_t logical_blocks_used, + block_count_t block_map_data_blocks); + +block_count_t __must_check +vdo_get_journal_block_map_data_blocks_used(struct recovery_journal *journal); + +thread_id_t __must_check +vdo_get_recovery_journal_thread_id(struct recovery_journal *journal); + +void vdo_open_recovery_journal(struct recovery_journal *journal, + struct slab_depot *depot, + struct block_map *block_map); + +sequence_number_t +vdo_get_recovery_journal_current_sequence_number(struct recovery_journal *journal); + +block_count_t __must_check +vdo_get_recovery_journal_length(block_count_t journal_size); + +struct recovery_journal_state_7_0 __must_check +vdo_record_recovery_journal(const struct recovery_journal 
*journal); + +void vdo_add_recovery_journal_entry(struct recovery_journal *journal, + struct data_vio *data_vio); + +void vdo_acquire_recovery_journal_block_reference(struct recovery_journal *journal, + sequence_number_t sequence_number, + enum vdo_zone_type zone_type, + zone_count_t zone_id); + +void vdo_release_recovery_journal_block_reference(struct recovery_journal *journal, + sequence_number_t sequence_number, + enum vdo_zone_type zone_type, + zone_count_t zone_id); + +void vdo_release_journal_per_entry_lock_from_other_zone(struct recovery_journal *journal, + sequence_number_t sequence_number); + +void vdo_drain_recovery_journal(struct recovery_journal *journal, + const struct admin_state_code *operation, + struct vdo_completion *parent); + +void vdo_resume_recovery_journal(struct recovery_journal *journal, + struct vdo_completion *parent); + +block_count_t __must_check +vdo_get_recovery_journal_logical_blocks_used(const struct recovery_journal *journal); + +struct recovery_journal_statistics __must_check +vdo_get_recovery_journal_statistics(const struct recovery_journal *journal); + +void vdo_dump_recovery_journal_statistics(const struct recovery_journal *journal); + +#endif /* RECOVERY_JOURNAL_H */ diff --git a/vdo/recovery-utils.c b/vdo/recovery-utils.c new file mode 100644 index 00000000..024fd39b --- /dev/null +++ b/vdo/recovery-utils.c @@ -0,0 +1,283 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright Red Hat + */ + +#include "recovery-utils.h" + +#include + +#include "logger.h" +#include "memory-alloc.h" + +#include "completion.h" +#include "io-submitter.h" +#include "kernel-types.h" +#include "num-utils.h" +#include "packed-recovery-journal-block.h" +#include "recovery-journal-entry.h" +#include "recovery-journal.h" +#include "slab-depot.h" +#include "types.h" +#include "vdo.h" +#include "vdo-component.h" +#include "vdo-component-states.h" +#include "vio.h" + +struct journal_loader { + struct vdo_completion *parent; + thread_id_t thread_id; + physical_block_number_t pbn; + vio_count_t count; + vio_count_t complete; + struct vio *vios[]; +}; + +static void free_journal_loader(struct journal_loader *loader) +{ + vio_count_t v; + + if (loader == NULL) { + return; + } + + for (v = 0; v < loader->count; v++) { + free_vio(UDS_FORGET(loader->vios[v])); + } + + UDS_FREE(loader); +} + +/** + * finish_journal_load() - Handle the completion of a journal read, and if it + * is the last one, finish the load by notifying the + * parent. + **/ +static void finish_journal_load(struct vdo_completion *completion) +{ + int result = completion->result; + struct journal_loader *loader = completion->parent; + + if (++loader->complete == loader->count) { + vdo_finish_completion(loader->parent, result); + free_journal_loader(loader); + } +} + +static void handle_journal_load_error(struct vdo_completion *completion) +{ + record_metadata_io_error(as_vio(completion)); + completion->callback(completion); +} + +static void read_journal_endio(struct bio *bio) +{ + struct vio *vio = bio->bi_private; + struct journal_loader *loader = vio->completion.parent; + + continue_vio_after_io(vio, finish_journal_load, loader->thread_id); +} + +/** + * vdo_load_recovery_journal() - Load the journal data off the disk. + * @journal: The recovery journal to load. + * @parent: The completion to notify when the load is complete. + * @journal_data_ptr: A pointer to the journal data buffer (it is the + * caller's responsibility to free this buffer). 
+ */ +void vdo_load_recovery_journal(struct recovery_journal *journal, + struct vdo_completion *parent, + char **journal_data_ptr) +{ + char *ptr; + struct journal_loader *loader; + physical_block_number_t pbn = + vdo_get_fixed_layout_partition_offset(journal->partition); + vio_count_t vio_count = DIV_ROUND_UP(journal->size, + MAX_BLOCKS_PER_VIO); + block_count_t remaining = journal->size; + int result = UDS_ALLOCATE(journal->size * VDO_BLOCK_SIZE, + char, + __func__, + journal_data_ptr); + + if (result != VDO_SUCCESS) { + vdo_finish_completion(parent, result); + return; + } + + result = UDS_ALLOCATE_EXTENDED(struct journal_loader, + vio_count, + struct vio *, + __func__, + &loader); + if (result != VDO_SUCCESS) { + vdo_finish_completion(parent, result); + return; + } + + loader->thread_id = vdo_get_callback_thread_id(); + loader->parent = parent; + ptr = *journal_data_ptr; + for (loader->count = 0; loader->count < vio_count; loader->count++) { + unsigned short blocks = + min(remaining, (block_count_t) MAX_BLOCKS_PER_VIO); + + result = create_multi_block_metadata_vio(parent->vdo, + VIO_TYPE_RECOVERY_JOURNAL, + VIO_PRIORITY_METADATA, + loader, + blocks, + ptr, + &loader->vios[loader->count]); + if (result != VDO_SUCCESS) { + free_journal_loader(UDS_FORGET(loader)); + vdo_finish_completion(parent, result); + return; + } + + ptr += (blocks * VDO_BLOCK_SIZE); + remaining -= blocks; + } + + for (vio_count = 0; + vio_count < loader->count; + vio_count++, pbn += MAX_BLOCKS_PER_VIO) { + submit_metadata_vio(loader->vios[vio_count], + pbn, + read_journal_endio, + handle_journal_load_error, + REQ_OP_READ); + } +} + +/** + * is_congruent_recovery_journal_block() - Determine whether the given + * header describes a valid + * block for the given journal + * that could appear at the + * given offset in the + * journal. + * @journal: The journal to use. + * @header: The unpacked block header to check. + * @offset: An offset indicating where the block was in the journal. + * + * Return: True if the header matches. + */ +static bool __must_check +is_congruent_recovery_journal_block(struct recovery_journal *journal, + const struct recovery_block_header *header, + physical_block_number_t offset) +{ + physical_block_number_t expected_offset = + vdo_get_recovery_journal_block_number(journal, + header->sequence_number); + return ((expected_offset == offset) + && vdo_is_valid_recovery_journal_block(journal, header)); +} + +/** + * vdo_find_recovery_journal_head_and_tail() - Find the tail and head of the + * journal. + * @journal: The recovery journal. + * @journal_data: The journal data read from disk. + * @tail_ptr: A pointer to return the tail found, or if no higher + * block is found, the value currently in the journal. + * @block_map_head_ptr: A pointer to return the block map head. + * @slab_journal_head_ptr: An optional pointer to return the slab journal head. + * + * Finds the tail and the head of the journal by searching for the highest + * sequence number in a block with a valid nonce, and the highest head value + * among the blocks with valid nonces. 
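Only congruent blocks feed that search. A small standalone illustration of the congruence rule follows; it is not code from the patch, and the 64-block journal size is an assumed example (journal sizes are powers of two).

#include <stdint.h>
#include <stdio.h>

#define EXAMPLE_JOURNAL_SIZE 64		/* assumed; must be a power of two */

/* Mirrors the low-order-bits block number computation described above. */
static uint64_t example_expected_offset(uint64_t sequence)
{
	return sequence & (EXAMPLE_JOURNAL_SIZE - 1);
}

int main(void)
{
	/*
	 * Offset 8 of a 64-block journal can only ever hold sequence numbers
	 * 8, 72, 136, 200, ...; a header recording any other sequence number
	 * at that offset fails the congruence test and is skipped. Only one
	 * of these sequences is present at a time, and the scan keeps the
	 * highest valid sequence it sees anywhere in the journal as the tail.
	 */
	uint64_t sequence;

	for (sequence = 0; sequence < 256; sequence++) {
		if (example_expected_offset(sequence) == 8) {
			printf("sequence %llu is congruent with offset 8\n",
			       (unsigned long long) sequence);
		}
	}
	return 0;
}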
+ * + * Return: True if there were valid journal blocks + */ +bool vdo_find_recovery_journal_head_and_tail(struct recovery_journal *journal, + char *journal_data, + sequence_number_t *tail_ptr, + sequence_number_t *block_map_head_ptr, + sequence_number_t *slab_journal_head_ptr) +{ + sequence_number_t highest_tail = journal->tail; + sequence_number_t block_map_head_max = 0; + sequence_number_t slab_journal_head_max = 0; + bool found_entries = false; + physical_block_number_t i; + + for (i = 0; i < journal->size; i++) { + struct packed_journal_header *packed_header = + vdo_get_recovery_journal_block_header(journal, + journal_data, + i); + struct recovery_block_header header; + + vdo_unpack_recovery_block_header(packed_header, &header); + + if (!is_congruent_recovery_journal_block(journal, &header, i)) { + /* + * This block is old, unformatted, or doesn't belong at + * this location. + */ + continue; + } + + if (header.sequence_number >= highest_tail) { + found_entries = true; + highest_tail = header.sequence_number; + } + if (header.block_map_head > block_map_head_max) { + block_map_head_max = header.block_map_head; + } + if (header.slab_journal_head > slab_journal_head_max) { + slab_journal_head_max = header.slab_journal_head; + } + } + + *tail_ptr = highest_tail; + if (!found_entries) { + return false; + } + + *block_map_head_ptr = block_map_head_max; + if (slab_journal_head_ptr != NULL) { + *slab_journal_head_ptr = slab_journal_head_max; + } + return true; +} + +/** + * vdo_validate_recovery_journal_entry() - Validate a recovery journal entry. + * @vdo: The vdo. + * @entry: The entry to validate. + * + * Return: VDO_SUCCESS or an error. + */ +int +vdo_validate_recovery_journal_entry(const struct vdo *vdo, + const struct recovery_journal_entry *entry) +{ + if ((entry->slot.pbn >= vdo->states.vdo.config.physical_blocks) || + (entry->slot.slot >= VDO_BLOCK_MAP_ENTRIES_PER_PAGE) || + !vdo_is_valid_location(&entry->mapping) || + !vdo_is_physical_data_block(vdo->depot, entry->mapping.pbn)) { + return uds_log_error_strerror(VDO_CORRUPT_JOURNAL, + "Invalid entry: (%llu, %u) to %llu (%s) is not within bounds", + (unsigned long long) entry->slot.pbn, + entry->slot.slot, + (unsigned long long) entry->mapping.pbn, + vdo_get_journal_operation_name(entry->operation)); + } + + if ((entry->operation == VDO_JOURNAL_BLOCK_MAP_INCREMENT) && + (vdo_is_state_compressed(entry->mapping.state) || + (entry->mapping.pbn == VDO_ZERO_BLOCK))) { + return uds_log_error_strerror(VDO_CORRUPT_JOURNAL, + "Invalid entry: (%llu, %u) to %llu (%s) is not a valid tree mapping", + (unsigned long long) entry->slot.pbn, + entry->slot.slot, + (unsigned long long) entry->mapping.pbn, + vdo_get_journal_operation_name(entry->operation)); + } + + return VDO_SUCCESS; +} diff --git a/vdo/recovery-utils.h b/vdo/recovery-utils.h new file mode 100644 index 00000000..74d0aa51 --- /dev/null +++ b/vdo/recovery-utils.h @@ -0,0 +1,90 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright Red Hat + */ + +#ifndef RECOVERY_UTILS_H +#define RECOVERY_UTILS_H + +#include "constants.h" +#include "packed-recovery-journal-block.h" +#include "recovery-journal-entry.h" +#include "recovery-journal.h" +#include "types.h" + +/** + * vdo_get_recovery_journal_block_header() - Get the block header for a block + * at a position in the journal + * data. + * @journal: The recovery journal. + * @journal_data: The recovery journal data. + * @sequence: The sequence number. + * + * Return: A pointer to a packed recovery journal block header. 
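To make the offset arithmetic concrete, here is a small standalone sketch, not part of the patch, of how a sequence number selects a 4096-byte block within the loaded journal data and how the check byte advances on each trip around the journal. The 128-block journal size is an assumed example.

#include <stdint.h>
#include <stdio.h>

#define EXAMPLE_JOURNAL_SIZE 128	/* assumed; must be a power of two */
#define EXAMPLE_VDO_BLOCK_SIZE 4096	/* VDO's 4096-byte block size */

int main(void)
{
	uint64_t sequence = 1000;
	/* The low-order bits of the sequence give the on-disk block number. */
	uint64_t block_number = sequence & (EXAMPLE_JOURNAL_SIZE - 1);
	/* That block number indexes a block in the flat journal data buffer. */
	uint64_t byte_offset = block_number * EXAMPLE_VDO_BLOCK_SIZE;
	/* The check byte encodes the number of trips around the journal. */
	uint8_t check_byte = ((sequence / EXAMPLE_JOURNAL_SIZE) & 0x7F) | 0x80;

	/* Prints: block 104, byte offset 425984, check byte 0x87 */
	printf("block %llu, byte offset %llu, check byte 0x%02x\n",
	       (unsigned long long) block_number,
	       (unsigned long long) byte_offset,
	       check_byte);
	return 0;
}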
+ */ +static inline struct packed_journal_header * __must_check +vdo_get_recovery_journal_block_header(struct recovery_journal *journal, + char *journal_data, + sequence_number_t sequence) +{ + off_t block_offset = + (vdo_get_recovery_journal_block_number(journal, sequence) + * VDO_BLOCK_SIZE); + return (struct packed_journal_header *) &journal_data[block_offset]; +} + +/** + * vdo_is_valid_recovery_journal_block() - Determine whether the given header + * describes a valid block for the + * given journal. + * @journal: The journal to use. + * @header: The unpacked block header to check. + * + * A block is not valid if it is unformatted, or if it is older than the last + * successful recovery or reformat. + * + * Return: True if the header is valid. + */ +static inline bool __must_check +vdo_is_valid_recovery_journal_block(const struct recovery_journal *journal, + const struct recovery_block_header *header) +{ + return ((header->metadata_type == VDO_METADATA_RECOVERY_JOURNAL) + && (header->nonce == journal->nonce) + && (header->recovery_count == journal->recovery_count)); +} + +/** + * vdo_is_exact_recovery_journal_block() - Determine whether the given header + * describes the exact block indicated. + * @journal: The journal to use. + * @header: The unpacked block header to check. + * @sequence: The expected sequence number. + * + * Return: True if the block matches. + */ +static inline bool __must_check +vdo_is_exact_recovery_journal_block(const struct recovery_journal *journal, + const struct recovery_block_header *header, + sequence_number_t sequence) +{ + return ((header->sequence_number == sequence) + && vdo_is_valid_recovery_journal_block(journal, header)); +} + +void vdo_load_recovery_journal(struct recovery_journal *journal, + struct vdo_completion *parent, + char **journal_data_ptr); + +bool +vdo_find_recovery_journal_head_and_tail(struct recovery_journal *journal, + char *journal_data, + sequence_number_t *tail_ptr, + sequence_number_t *block_map_head_ptr, + sequence_number_t *slab_journal_head_ptr); + +int __must_check +vdo_validate_recovery_journal_entry(const struct vdo *vdo, + const struct recovery_journal_entry *entry); + +#endif /* RECOVERY_UTILS_H */ diff --git a/vdo/recoveryJournal.h b/vdo/recoveryJournal.h deleted file mode 100644 index a5a606ac..00000000 --- a/vdo/recoveryJournal.h +++ /dev/null @@ -1,364 +0,0 @@ -/* - * Copyright Red Hat - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. 
- * - * $Id: //eng/vdo-releases/sulfur/src/c++/vdo/base/recoveryJournal.h#17 $ - */ - -#ifndef RECOVERY_JOURNAL_H -#define RECOVERY_JOURNAL_H - -#include "adminState.h" -#include "completion.h" -#include "fixedLayout.h" -#include "flush.h" -#include "readOnlyNotifier.h" -#include "recoveryJournalFormat.h" -#include "statistics.h" -#include "types.h" - -/** - * The recovery_journal provides a log of all block mapping and reference count - * changes which have not yet been stably written to the block map or slab - * journals. This log helps to reduce the write amplification of writes by - * providing amortization of slab journal and block map page updates. - * - * The journal consists of a set of on-disk blocks arranged as a - * circular log with monotonically increasing sequence numbers. Three - * sequence numbers serve to define the active extent of the - * journal. The 'head' is the oldest active block in the journal. The - * 'tail' is the end of the half-open interval containing the active - * blocks. 'active' is the number of the block actively receiving - * entries. In an empty journal, head == active == tail. Once any - * entries are added, tail = active + 1, and head may be any value in - * the interval [tail - size, active]. - * - * The journal also contains a set of in-memory blocks which are used - * to buffer up entries until they can be committed. In general the - * number of in-memory blocks ('tail_buffer_count') will be less than - * the on-disk size. Each in-memory block is also a vdo_completion. - * Each in-memory block has a vdo_extent which is used to commit that - * block to disk. The extent's data is the on-disk representation - * of the journal block. In addition each in-memory block has a - * buffer which is used to accumulate entries while a partial commit - * of the block is in progress. In-memory blocks are kept on two - * rings. Free blocks live on the 'free_tail_blocks' ring. When a block - * becomes active (see below) it is moved to the 'active_tail_blocks' - * ring. When a block is fully committed, it is moved back to the - * 'free_tail_blocks' ring. - * - * When entries are added to the journal, they are added to the active - * in-memory block, as indicated by the 'active_block' field. If the - * caller wishes to wait for the entry to be committed, the requesting - * VIO will be attached to the in-memory block to which the caller's - * entry was added. If the caller does wish to wait, or if the entry - * filled the active block, an attempt will be made to commit that - * block to disk. If there is already another commit in progress, the - * attempt will be ignored and then automatically retried when the - * in-progress commit completes. If there is no commit in progress, - * any VIOs waiting on the block are transferred to the extent. The - * extent is then written, automatically waking all of the waiters - * when it completes. When the extent completes, any entries which - * accumulated in the block are copied to the extent's data buffer. - * - * Finally, the journal maintains a set of counters, one for each on - * disk journal block. These counters are used as locks to prevent - * premature reaping of journal blocks. Each time a new sequence - * number is used, the counter for the corresponding block is - * incremented. The counter is subsequently decremented when that - * block is filled and then committed for the last time. This prevents - * blocks from being reaped while they are still being updated. 
The - * counter is also incremented once for each entry added to a block, - * and decremented once each time the block map is updated in memory - * for that request. This prevents blocks from being reaped while - * their VIOs are still active. Finally, each in-memory block map page - * tracks the oldest journal block that contains entries corresponding to - * uncommitted updates to that block map page. Each time an in-memory block - * map page is updated, it checks if the journal block for the VIO - * is earlier than the one it references, in which case it increments - * the count on the earlier journal block and decrements the count on the - * later journal block, maintaining a lock on the oldest journal block - * containing entries for that page. When a block map page has been flushed - * from the cache, the counter for the journal block it references is - * decremented. Whenever the counter for the head block goes to 0, the - * head is advanced until it comes to a block whose counter is not 0 - * or until it reaches the active block. This is the mechanism for - * reclaiming journal space on disk. - * - * If there is no in-memory space when a VIO attempts to add an entry, - * the VIO will be attached to the 'commit_completion' and will be - * woken the next time a full block has committed. If there is no - * on-disk space when a VIO attempts to add an entry, the VIO will be - * attached to the 'reap_completion', and will be woken the next time a - * journal block is reaped. - **/ - -/** - * Return whether a given journal_operation is an increment type. - * - * @param operation The operation in question - * - * @return true if the type is an increment type - **/ -static inline bool -is_vdo_journal_increment_operation(enum journal_operation operation) -{ - return ((operation == DATA_INCREMENT) - || (operation == BLOCK_MAP_INCREMENT)); -} - -/** - * Make a recovery journal and initialize it with the state that was decoded - * from the super block. - * - * @param [in] state the decoded state of the journal - * @param [in] nonce the nonce of the VDO - * @param [in] vdo the VDO - * @param [in] partition the partition for the journal - * @param [in] recovery_count the VDO's number of completed recoveries - * @param [in] journal_size the number of blocks in the journal on disk - * @param [in] tail_buffer_size the number of blocks for tail buffer - * @param [in] read_only_notifier the read-only mode notifier - * @param [in] thread_config the thread configuration of the VDO - * @param [out] journal_ptr the pointer to hold the new recovery journal - * - * @return a success or error code - **/ -int __must_check -decode_vdo_recovery_journal(struct recovery_journal_state_7_0 state, - nonce_t nonce, - struct vdo *vdo, - struct partition *partition, - uint64_t recovery_count, - block_count_t journal_size, - block_count_t tail_buffer_size, - struct read_only_notifier *read_only_notifier, - const struct thread_config *thread_config, - struct recovery_journal **journal_ptr); - -/** - * Free a recovery journal. - * - * @param journal The recovery journal to free - **/ -void free_vdo_recovery_journal(struct recovery_journal *journal); - -/** - * Move the backing partition pointer of the recovery journal. - * Assumes that the data in the old and the new partitions is identical. 
- * - * @param journal the journal being moved - * @param partition the new journal partition - **/ -void set_vdo_recovery_journal_partition(struct recovery_journal *journal, - struct partition *partition); - -/** - * Initialize the journal after a recovery. - * - * @param journal The journal in question - * @param recovery_count The number of completed recoveries - * @param tail The new tail block sequence number - **/ -void -initialize_vdo_recovery_journal_post_recovery(struct recovery_journal *journal, - uint64_t recovery_count, - sequence_number_t tail); - -/** - * Initialize the journal after a rebuild. - * - * @param journal The journal in question - * @param recovery_count The number of completed recoveries - * @param tail The new tail block sequence number - * @param logical_blocks_used The new number of logical blocks used - * @param block_map_data_blocks The new number of block map data blocks - **/ -void -initialize_vdo_recovery_journal_post_rebuild(struct recovery_journal *journal, - uint64_t recovery_count, - sequence_number_t tail, - block_count_t logical_blocks_used, - block_count_t block_map_data_blocks); - -/** - * Get the number of block map pages, allocated from data blocks, currently - * in use. - * - * @param journal The journal in question - * - * @return The number of block map pages allocated from slabs - **/ -block_count_t __must_check -vdo_get_journal_block_map_data_blocks_used(struct recovery_journal *journal); - -/** - * Get the ID of a recovery journal's thread. - * - * @param journal The journal to query - * - * @return The ID of the journal's thread. - **/ -thread_id_t __must_check -get_vdo_recovery_journal_thread_id(struct recovery_journal *journal); - -/** - * Prepare the journal for new entries. - * - * @param journal The journal in question - * @param depot The slab depot for this VDO - * @param block_map The block map for this VDO - **/ -void open_vdo_recovery_journal(struct recovery_journal *journal, - struct slab_depot *depot, - struct block_map *block_map); - -/** - * Obtain the recovery journal's current sequence number. Exposed only so - * the block map can be initialized therefrom. - * - * @param journal The journal in question - * - * @return the sequence number of the tail block - **/ -sequence_number_t -get_vdo_recovery_journal_current_sequence_number(struct recovery_journal *journal); - -/** - * Get the number of usable recovery journal blocks. - * - * @param journal_size The size of the recovery journal in blocks - * - * @return the number of recovery journal blocks usable for entries - **/ -block_count_t __must_check -get_vdo_recovery_journal_length(block_count_t journal_size); - -/** - * Record the state of a recovery journal for encoding in the super block. - * - * @param journal the recovery journal - * - * @return the state of the journal - **/ -struct recovery_journal_state_7_0 __must_check -record_vdo_recovery_journal(const struct recovery_journal *journal); - -/** - * Add an entry to a recovery journal. This method is asynchronous. The - * data_vio will not be called back until the entry is committed to the - * on-disk journal. - * - * @param journal The journal in which to make an entry - * @param data_vio The data_vio for which to add the entry. The entry will be - * taken from the logical and new_mapped fields of the - * data_vio. The data_vio's recovery_sequence_number field - * will be set to the sequence number of the journal block in - * which the entry was made. 
- **/ -void add_vdo_recovery_journal_entry(struct recovery_journal *journal, - struct data_vio *data_vio); - -/** - * Acquire a reference to a recovery journal block from somewhere other than - * the journal itself. - * - * @param journal The recovery journal - * @param sequence_number The journal sequence number of the referenced block - * @param zone_type The type of the zone making the adjustment - * @param zone_id The ID of the zone making the adjustment - **/ -void acquire_vdo_recovery_journal_block_reference(struct recovery_journal *journal, - sequence_number_t sequence_number, - enum vdo_zone_type zone_type, - zone_count_t zone_id); - -/** - * Release a reference to a recovery journal block from somewhere other than - * the journal itself. If this is the last reference for a given zone type, - * an attempt will be made to reap the journal. - * - * @param journal The recovery journal - * @param sequence_number The journal sequence number of the referenced block - * @param zone_type The type of the zone making the adjustment - * @param zone_id The ID of the zone making the adjustment - **/ -void release_vdo_recovery_journal_block_reference(struct recovery_journal *journal, - sequence_number_t sequence_number, - enum vdo_zone_type zone_type, - zone_count_t zone_id); - -/** - * Release a single per-entry reference count for a recovery journal block. This - * method may be called from any zone (but shouldn't be called from the journal - * zone as it would be inefficient). - * - * @param journal The recovery journal - * @param sequence_number The journal sequence number of the referenced block - **/ -void vdo_release_journal_per_entry_lock_from_other_zone(struct recovery_journal *journal, - sequence_number_t sequence_number); - -/** - * Drain recovery journal I/O. All uncommitted entries will be written out. - * - * @param journal The journal to drain - * @param operation The drain operation (suspend or save) - * @param parent The completion to finish once the journal is drained - **/ -void drain_vdo_recovery_journal(struct recovery_journal *journal, - const struct admin_state_code *operation, - struct vdo_completion *parent); - -/** - * Resume a recovery journal which has been drained. - * - * @param journal The journal to resume - * @param parent The completion to finish once the journal is resumed - **/ -void resume_vdo_recovery_journal(struct recovery_journal *journal, - struct vdo_completion *parent); - -/** - * Get the number of logical blocks in use by the VDO - * - * @param journal the journal - * - * @return the number of logical blocks in use by the VDO - **/ -block_count_t __must_check -get_vdo_recovery_journal_logical_blocks_used(const struct recovery_journal *journal); - -/** - * Get the current statistics from the recovery journal. - * - * @param journal The recovery journal to query - * - * @return a copy of the current statistics for the journal - **/ -struct recovery_journal_statistics __must_check -get_vdo_recovery_journal_statistics(const struct recovery_journal *journal); - -/** - * Dump some current statistics and other debug info from the recovery - * journal. 
- * - * @param journal The recovery journal to dump - **/ -void dump_vdo_recovery_journal_statistics(const struct recovery_journal *journal); - -#endif // RECOVERY_JOURNAL_H diff --git a/vdo/recoveryJournalBlock.h b/vdo/recoveryJournalBlock.h deleted file mode 100644 index 71792fd5..00000000 --- a/vdo/recoveryJournalBlock.h +++ /dev/null @@ -1,197 +0,0 @@ -/* - * Copyright Red Hat - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/sulfur/src/c++/vdo/base/recoveryJournalBlock.h#11 $ - */ - -#ifndef RECOVERY_JOURNAL_BLOCK_H -#define RECOVERY_JOURNAL_BLOCK_H - -#include "permassert.h" - -#include "packedRecoveryJournalBlock.h" -#include "recoveryJournalInternals.h" -#include "types.h" -#include "waitQueue.h" - -struct recovery_journal_block { - /** The doubly linked pointers for the free or active lists */ - struct list_head list_node; - /** The waiter for the pending full block list */ - struct waiter write_waiter; - /** The journal to which this block belongs */ - struct recovery_journal *journal; - /** A pointer to a block-sized buffer holding the packed block data */ - char *block; - /** A pointer to the current sector in the packed block buffer */ - struct packed_journal_sector *sector; - /** The vio for writing this block */ - struct vio *vio; - /** The sequence number for this block */ - sequence_number_t sequence_number; - /** The location of this block in the on-disk journal */ - physical_block_number_t block_number; - /** Whether this block is being committed */ - bool committing; - /** - * Whether this block has an uncommitted increment for a write with FUA - */ - bool has_fua_entry; - /** The total number of entries in this block */ - journal_entry_count_t entry_count; - /** The total number of uncommitted entries (queued or committing) */ - journal_entry_count_t uncommitted_entry_count; - /** The number of new entries in the current commit */ - journal_entry_count_t entries_in_commit; - /** The queue of vios which will make entries for the next commit */ - struct wait_queue entry_waiters; - /** The queue of vios waiting for the current commit */ - struct wait_queue commit_waiters; -}; - -/** - * Return the block associated with a list entry. - * - * @param entry The list entry to recast as a block - * - * @return The block - **/ -static inline struct recovery_journal_block * -vdo_recovery_block_from_list_entry(struct list_head *entry) -{ - return list_entry(entry, struct recovery_journal_block, list_node); -} - -/** - * Check whether a recovery block is dirty, indicating it has any uncommitted - * entries, which includes both entries not written and entries written but - * not yet acknowledged. 
- * - * @param block The block to check - * - * @return true if the block has any uncommitted entries - **/ -static inline bool __must_check -is_vdo_recovery_block_dirty(const struct recovery_journal_block *block) -{ - return (block->uncommitted_entry_count > 0); -} - -/** - * Check whether a journal block is empty. - * - * @param block The block to check - * - * @return true if the block has no entries - **/ -static inline bool __must_check -is_vdo_recovery_block_empty(const struct recovery_journal_block *block) -{ - return (block->entry_count == 0); -} - -/** - * Check whether a journal block is full. - * - * @param block The block to check - * - * @return true if the the block is full - **/ -static inline bool __must_check -is_vdo_recovery_block_full(const struct recovery_journal_block *block) -{ - return ((block == NULL) - || (block->journal->entries_per_block == block->entry_count)); -} - -/** - * Construct a journal block. - * - * @param [in] vdo The vdo from which to construct vios - * @param [in] journal The journal to which the block will belong - * @param [out] block_ptr A pointer to receive the new block - * - * @return VDO_SUCCESS or an error - **/ -int __must_check -make_vdo_recovery_block(struct vdo *vdo, - struct recovery_journal *journal, - struct recovery_journal_block **block_ptr); - -/** - * Free a tail block. - * - * @param block The tail block to free - **/ -void free_vdo_recovery_block(struct recovery_journal_block *block); - -/** - * Initialize the next active recovery journal block. - * - * @param block The journal block to initialize - **/ -void initialize_vdo_recovery_block(struct recovery_journal_block *block); - -/** - * Enqueue a data_vio to asynchronously encode and commit its next recovery - * journal entry in this block. The data_vio will not be continued until the - * entry is committed to the on-disk journal. The caller is responsible for - * ensuring the block is not already full. - * - * @param block The journal block in which to make an entry - * @param data_vio The data_vio to enqueue - * - * @return VDO_SUCCESS or an error code if the data_vio could not be enqueued - **/ -int __must_check -enqueue_vdo_recovery_block_entry(struct recovery_journal_block *block, - struct data_vio *data_vio); - -/** - * Attempt to commit a block. If the block is not the oldest block with - * uncommitted entries or if it is already being committed, nothing will be - * done. - * - * @param block The block to write - * @param callback The function to call when the write completes - * @param error_handler The handler for flush or write errors - * - * @return VDO_SUCCESS, or an error if the write could not be launched - **/ -int __must_check commit_vdo_recovery_block(struct recovery_journal_block *block, - vdo_action *callback, - vdo_action *error_handler); - -/** - * Dump the contents of the recovery block to the log. - * - * @param block The block to dump - **/ -void dump_vdo_recovery_block(const struct recovery_journal_block *block); - -/** - * Check whether a journal block can be committed. 
- * - * @param block The journal block in question - * - * @return true if the block can be committed now - **/ -bool __must_check -can_commit_vdo_recovery_block(struct recovery_journal_block *block); - -#endif // RECOVERY_JOURNAL_BLOCK_H diff --git a/vdo/recoveryJournalFormat.h b/vdo/recoveryJournalFormat.h deleted file mode 100644 index 6073c695..00000000 --- a/vdo/recoveryJournalFormat.h +++ /dev/null @@ -1,123 +0,0 @@ -/* - * Copyright Red Hat - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/sulfur/src/c++/vdo/base/recoveryJournalFormat.h#8 $ - */ - -#ifndef RECOVERY_JOURNAL_FORMAT_H -#define RECOVERY_JOURNAL_FORMAT_H - -#include "buffer.h" - -#include "header.h" -#include "packedRecoveryJournalBlock.h" -#include "types.h" - -/* - * The state of the recovery journal as encoded in the VDO super block. - */ -struct recovery_journal_state_7_0 { - /** Sequence number to start the journal */ - sequence_number_t journal_start; - /** Number of logical blocks used by VDO */ - block_count_t logical_blocks_used; - /** Number of block map pages allocated */ - block_count_t block_map_data_blocks; -} __packed; - -extern const struct header VDO_RECOVERY_JOURNAL_HEADER_7_0; - -/** - * Get the size of the encoded state of a recovery journal. - * - * @return the encoded size of the journal's state - **/ -size_t __must_check get_vdo_recovery_journal_encoded_size(void); - -/** - * Encode the state of a recovery journal. - * - * @param state the recovery journal state - * @param buffer the buffer to encode into - * - * @return VDO_SUCCESS or an error code - **/ -int __must_check -encode_vdo_recovery_journal_state_7_0(struct recovery_journal_state_7_0 state, - struct buffer *buffer); - -/** - * Decode the state of a recovery journal saved in a buffer. - * - * @param buffer the buffer containing the saved state - * @param state a pointer to a recovery journal state to hold the result of a - * succesful decode - * - * @return VDO_SUCCESS or an error code - **/ -int __must_check -decode_vdo_recovery_journal_state_7_0(struct buffer *buffer, - struct recovery_journal_state_7_0 *state); - -/** - * Get the name of a journal operation. - * - * @param operation The operation to name - * - * @return The name of the operation - **/ -const char * __must_check -get_vdo_journal_operation_name(enum journal_operation operation); - -/** - * Determine whether the header of the given sector could describe a - * valid sector for the given journal block header. 
- * - * @param header The unpacked block header to compare against - * @param sector The packed sector to check - * - * @return True if the sector matches the block header - **/ -static inline bool __must_check -is_valid_vdo_recovery_journal_sector(const struct recovery_block_header *header, - const struct packed_journal_sector *sector) -{ - return ((header->check_byte == sector->check_byte) - && (header->recovery_count == sector->recovery_count)); -} - -/** - * Compute the physical block number of the recovery journal block which would - * have a given sequence number. - * - * @param journal_size The size of the journal - * @param sequence_number The sequence number - * - * @return The pbn of the journal block which would the specified sequence - * number - **/ -static inline physical_block_number_t __must_check -compute_vdo_recovery_journal_block_number(block_count_t journal_size, - sequence_number_t sequence_number) -{ - // Since journal size is a power of two, the block number modulus can - // just be extracted from the low-order bits of the sequence. - return (sequence_number & (journal_size - 1)); -} - -#endif // RECOVERY_JOURNAL_FORMAT_H diff --git a/vdo/recoveryJournalInternals.h b/vdo/recoveryJournalInternals.h deleted file mode 100644 index 42dab20d..00000000 --- a/vdo/recoveryJournalInternals.h +++ /dev/null @@ -1,160 +0,0 @@ -/* - * Copyright Red Hat - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. 
- * - * $Id: //eng/vdo-releases/sulfur/src/c++/vdo/base/recoveryJournalInternals.h#4 $ - */ - -#ifndef RECOVERY_JOURNAL_INTERNALS_H -#define RECOVERY_JOURNAL_INTERNALS_H - -#include - -#include "numeric.h" - -#include "adminState.h" -#include "fixedLayout.h" -#include "journalPoint.h" -#include "lockCounter.h" -#include "recoveryJournal.h" -#include "statistics.h" -#include "types.h" -#include "waitQueue.h" - -struct recovery_journal_block; - -struct recovery_journal { - /** The thread ID of the journal zone */ - thread_id_t thread_id; - /** The slab depot which can hold locks on this journal */ - struct slab_depot *depot; - /** The block map which can hold locks on this journal */ - struct block_map *block_map; - /** The queue of vios waiting to make increment entries */ - struct wait_queue increment_waiters; - /** The queue of vios waiting to make decrement entries */ - struct wait_queue decrement_waiters; - /** The number of free entries in the journal */ - uint64_t available_space; - /** The number of decrement entries which need to be made */ - vio_count_t pending_decrement_count; - /** - * Whether the journal is adding entries from the increment or - * decrement waiters queues - **/ - bool adding_entries; - /** The notifier for read-only mode */ - struct read_only_notifier *read_only_notifier; - /** The administrative state of the journal */ - struct admin_state state; - /** Whether a reap is in progress */ - bool reaping; - /** The partition which holds the journal on disk */ - struct partition *partition; - /** The oldest active block in the journal on disk for block map rebuild - */ - sequence_number_t block_map_head; - /** The oldest active block in the journal on disk for slab journal - * replay */ - sequence_number_t slab_journal_head; - /** The newest block in the journal on disk to which a write has - * finished */ - sequence_number_t last_write_acknowledged; - /** The end of the half-open interval of the active journal */ - sequence_number_t tail; - /** The point at which the last entry will have been added */ - struct journal_point append_point; - /** The journal point of the vio most recently released from the journal - */ - struct journal_point commit_point; - /** The nonce of the VDO */ - nonce_t nonce; - /** The number of recoveries completed by the VDO */ - uint8_t recovery_count; - /** The number of entries which fit in a single block */ - journal_entry_count_t entries_per_block; - /** Unused in-memory journal blocks */ - struct list_head free_tail_blocks; - /** In-memory journal blocks with records */ - struct list_head active_tail_blocks; - /** A pointer to the active block (the one we are adding entries to now) - */ - struct recovery_journal_block *active_block; - /** Journal blocks that need writing */ - struct wait_queue pending_writes; - /** The new block map reap head after reaping */ - sequence_number_t block_map_reap_head; - /** The head block number for the block map rebuild range */ - block_count_t block_map_head_block_number; - /** The new slab journal reap head after reaping */ - sequence_number_t slab_journal_reap_head; - /** The head block number for the slab journal replay range */ - block_count_t slab_journal_head_block_number; - /** The data-less vio, usable only for flushing */ - struct vio *flush_vio; - /** The number of blocks in the on-disk journal */ - block_count_t size; - /** The number of logical blocks that are in-use */ - block_count_t logical_blocks_used; - /** The number of block map pages that are allocated */ - block_count_t 
block_map_data_blocks; - /** The number of journal blocks written but not yet acknowledged */ - block_count_t pending_write_count; - /** The threshold at which slab journal tail blocks will be written out - */ - block_count_t slab_journal_commit_threshold; - /** Counters for events in the journal that are reported as statistics - */ - struct recovery_journal_statistics events; - /** The locks for each on-disk block */ - struct lock_counter *lock_counter; -}; - -/** - * Get the physical block number for a given sequence number. - * - * @param journal The journal - * @param sequence The sequence number of the desired block - * - * @return The block number corresponding to the sequence number - **/ -static inline physical_block_number_t __must_check -get_vdo_recovery_journal_block_number(const struct recovery_journal *journal, - sequence_number_t sequence) -{ - // Since journal size is a power of two, the block number modulus can - // just be extracted from the low-order bits of the sequence. - return compute_vdo_recovery_journal_block_number(journal->size, sequence); -} - -/** - * Compute the check byte for a given sequence number. - * - * @param journal The journal - * @param sequence The sequence number - * - * @return The check byte corresponding to the sequence number - **/ -static inline uint8_t __must_check -compute_vdo_recovery_journal_check_byte(const struct recovery_journal *journal, - sequence_number_t sequence) -{ - // The check byte must change with each trip around the journal. - return (((sequence / journal->size) & 0x7F) | 0x80); -} - -#endif // RECOVERY_JOURNAL_INTERNALS_H diff --git a/vdo/recoveryUtils.c b/vdo/recoveryUtils.c deleted file mode 100644 index 4c62afb7..00000000 --- a/vdo/recoveryUtils.c +++ /dev/null @@ -1,180 +0,0 @@ -/* - * Copyright Red Hat - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/sulfur/src/c++/vdo/base/recoveryUtils.c#26 $ - */ - -#include "recoveryUtils.h" - -#include "logger.h" -#include "memoryAlloc.h" - -#include "completion.h" -#include "extent.h" -#include "packedRecoveryJournalBlock.h" -#include "recoveryJournalEntry.h" -#include "recoveryJournalInternals.h" -#include "slabDepot.h" -#include "vdoComponent.h" -#include "vdoComponentStates.h" -#include "vdoInternal.h" - -/** - * Finish loading the journal by freeing the extent and notifying the parent. - * This callback is registered in load_vdo_recovery_journal(). 
- * - * @param completion The load extent - **/ -static void finish_journal_load(struct vdo_completion *completion) -{ - int result = completion->result; - struct vdo_completion *parent = completion->parent; - free_vdo_extent(vdo_completion_as_extent(UDS_FORGET(completion))); - finish_vdo_completion(parent, result); -} - -/**********************************************************************/ -void load_vdo_recovery_journal(struct recovery_journal *journal, - struct vdo_completion *parent, - char **journal_data_ptr) -{ - struct vdo_extent *extent; - int result = UDS_ALLOCATE(journal->size * VDO_BLOCK_SIZE, char, - __func__, journal_data_ptr); - if (result != VDO_SUCCESS) { - finish_vdo_completion(parent, result); - return; - } - - result = create_vdo_extent(parent->vdo, VIO_TYPE_RECOVERY_JOURNAL, - VIO_PRIORITY_METADATA, journal->size, - *journal_data_ptr, &extent); - if (result != VDO_SUCCESS) { - finish_vdo_completion(parent, result); - return; - } - - prepare_vdo_completion(&extent->completion, finish_journal_load, - finish_journal_load, parent->callback_thread_id, - parent); - read_vdo_metadata_extent(extent, - get_vdo_fixed_layout_partition_offset(journal->partition)); -} - -/** - * Determine whether the given header describe a valid block for the - * given journal that could appear at the given offset in the journal. - * - * @param journal The journal to use - * @param header The unpacked block header to check - * @param offset An offset indicating where the block was in the journal - * - * @return True if the header matches - **/ -static bool __must_check -is_congruent_recovery_journal_block(struct recovery_journal *journal, - const struct recovery_block_header *header, - physical_block_number_t offset) -{ - physical_block_number_t expected_offset = - get_vdo_recovery_journal_block_number(journal, - header->sequence_number); - return ((expected_offset == offset) - && is_valid_vdo_recovery_journal_block(journal, header)); -} - -/**********************************************************************/ -bool find_vdo_recovery_journal_head_and_tail(struct recovery_journal *journal, - char *journal_data, - sequence_number_t *tail_ptr, - sequence_number_t *block_map_head_ptr, - sequence_number_t *slab_journal_head_ptr) -{ - sequence_number_t highest_tail = journal->tail; - sequence_number_t block_map_head_max = 0; - sequence_number_t slab_journal_head_max = 0; - bool found_entries = false; - physical_block_number_t i; - for (i = 0; i < journal->size; i++) { - struct packed_journal_header *packed_header = - get_vdo_recovery_journal_block_header(journal, - journal_data, - i); - struct recovery_block_header header; - unpack_vdo_recovery_block_header(packed_header, &header); - - if (!is_congruent_recovery_journal_block(journal, &header, i)) { - // This block is old, unformatted, or doesn't belong at - // this location. 
- continue; - } - - if (header.sequence_number >= highest_tail) { - found_entries = true; - highest_tail = header.sequence_number; - } - if (header.block_map_head > block_map_head_max) { - block_map_head_max = header.block_map_head; - } - if (header.slab_journal_head > slab_journal_head_max) { - slab_journal_head_max = header.slab_journal_head; - } - } - - *tail_ptr = highest_tail; - if (!found_entries) { - return false; - } - - *block_map_head_ptr = block_map_head_max; - if (slab_journal_head_ptr != NULL) { - *slab_journal_head_ptr = slab_journal_head_max; - } - return true; -} - -/**********************************************************************/ -int -validate_vdo_recovery_journal_entry(const struct vdo *vdo, - const struct recovery_journal_entry *entry) -{ - if ((entry->slot.pbn >= vdo->states.vdo.config.physical_blocks) || - (entry->slot.slot >= VDO_BLOCK_MAP_ENTRIES_PER_PAGE) || - !vdo_is_valid_location(&entry->mapping) || - !vdo_is_physical_data_block(vdo->depot, entry->mapping.pbn)) { - return uds_log_error_strerror(VDO_CORRUPT_JOURNAL, - "Invalid entry: (%llu, %u) to %llu (%s) is not within bounds", - (unsigned long long) entry->slot.pbn, - entry->slot.slot, - (unsigned long long) entry->mapping.pbn, - get_vdo_journal_operation_name(entry->operation)); - } - - if ((entry->operation == BLOCK_MAP_INCREMENT) && - (vdo_is_state_compressed(entry->mapping.state) || - (entry->mapping.pbn == VDO_ZERO_BLOCK))) { - return uds_log_error_strerror(VDO_CORRUPT_JOURNAL, - "Invalid entry: (%llu, %u) to %llu (%s) is not a valid tree mapping", - (unsigned long long) entry->slot.pbn, - entry->slot.slot, - (unsigned long long) entry->mapping.pbn, - get_vdo_journal_operation_name(entry->operation)); - } - - return VDO_SUCCESS; -} diff --git a/vdo/recoveryUtils.h b/vdo/recoveryUtils.h deleted file mode 100644 index b686bef3..00000000 --- a/vdo/recoveryUtils.h +++ /dev/null @@ -1,137 +0,0 @@ -/* - * Copyright Red Hat - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/sulfur/src/c++/vdo/base/recoveryUtils.h#10 $ - */ - -#ifndef RECOVERY_UTILS_H -#define RECOVERY_UTILS_H - -#include "constants.h" -#include "packedRecoveryJournalBlock.h" -#include "recoveryJournalEntry.h" -#include "recoveryJournalInternals.h" -#include "types.h" - -/** - * Get the block header for a block at a position in the journal data. - * - * @param journal The recovery journal - * @param journal_data The recovery journal data - * @param sequence The sequence number - * - * @return A pointer to a packed recovery journal blokck header. 
- **/ -static inline struct packed_journal_header * __must_check -get_vdo_recovery_journal_block_header(struct recovery_journal *journal, - char *journal_data, - sequence_number_t sequence) -{ - off_t block_offset = - (get_vdo_recovery_journal_block_number(journal, sequence) - * VDO_BLOCK_SIZE); - return (struct packed_journal_header *) &journal_data[block_offset]; -} - -/** - * Determine whether the given header describes a valid block for the - * given journal. A block is not valid if it is unformatted, or if it - * is older than the last successful recovery or reformat. - * - * @param journal The journal to use - * @param header The unpacked block header to check - * - * @return True if the header is valid - **/ -static inline bool __must_check -is_valid_vdo_recovery_journal_block(const struct recovery_journal *journal, - const struct recovery_block_header *header) -{ - return ((header->metadata_type == VDO_METADATA_RECOVERY_JOURNAL) - && (header->nonce == journal->nonce) - && (header->recovery_count == journal->recovery_count)); -} - -/** - * Determine whether the given header describes the exact block indicated. - * - * @param journal The journal to use - * @param header The unpacked block header to check - * @param sequence The expected sequence number - * - * @return True if the block matches - **/ -static inline bool __must_check -is_exact_vdo_recovery_journal_block(const struct recovery_journal *journal, - const struct recovery_block_header *header, - sequence_number_t sequence) -{ - return ((header->sequence_number == sequence) - && is_valid_vdo_recovery_journal_block(journal, header)); -} - -/** - * Load the journal data off the disk. - * - * @param [in] journal The recovery journal to load - * @param [in] parent The completion to notify when the load is - * complete - * @param [out] journal_data_ptr A pointer to the journal data buffer (it is - * the caller's responsibility to free this - * buffer) - **/ -void load_vdo_recovery_journal(struct recovery_journal *journal, - struct vdo_completion *parent, - char **journal_data_ptr); - -/** - * Find the tail and the head of the journal by searching for the highest - * sequence number in a block with a valid nonce, and the highest head value - * among the blocks with valid nonces. - * - * @param [in] journal The recovery journal - * @param [in] journal_data The journal data read from disk - * @param [out] tail_ptr A pointer to return the tail found, or if - * no higher block is found, the value - * currently in the journal - * @param [out] block_map_head_ptr A pointer to return the block map head - * @param [out] slab_journal_head_ptr An optional pointer to return the slab - * journal head - * - * @return True if there were valid journal blocks - **/ -bool -find_vdo_recovery_journal_head_and_tail(struct recovery_journal *journal, - char *journal_data, - sequence_number_t *tail_ptr, - sequence_number_t *block_map_head_ptr, - sequence_number_t *slab_journal_head_ptr); - -/** - * Validate a recovery journal entry. 
- * - * @param vdo The vdo - * @param entry The entry to validate - * - * @return VDO_SUCCESS or an error - **/ -int __must_check -validate_vdo_recovery_journal_entry(const struct vdo *vdo, - const struct recovery_journal_entry *entry); - -#endif // RECOVERY_UTILS_H diff --git a/vdo/refCounts.c b/vdo/ref-counts.c similarity index 53% rename from vdo/refCounts.c rename to vdo/ref-counts.c index c18927e2..5b9baa74 100644 --- a/vdo/refCounts.c +++ b/vdo/ref-counts.c @@ -1,64 +1,49 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * Copyright Red Hat - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/sulfur/src/c++/vdo/base/refCounts.c#31 $ */ -#include "refCounts.h" -#include "refCountsInternals.h" +#include "ref-counts.h" + +#include #include "logger.h" -#include "memoryAlloc.h" +#include "memory-alloc.h" #include "numeric.h" #include "permassert.h" -#include "adminState.h" -#include "blockAllocatorInternals.h" +#include "admin-state.h" +#include "block-allocator.h" #include "completion.h" -#include "extent.h" #include "header.h" -#include "journalPoint.h" -#include "numUtils.h" -#include "packedReferenceBlock.h" -#include "pbnLock.h" -#include "readOnlyNotifier.h" -#include "referenceOperation.h" +#include "io-submitter.h" +#include "journal-point.h" +#include "num-utils.h" +#include "packed-reference-block.h" +#include "pbn-lock.h" +#include "read-only-notifier.h" +#include "reference-operation.h" #include "slab.h" -#include "slabDepotFormat.h" -#include "slabJournal.h" -#include "slabJournalInternals.h" -#include "slabSummary.h" -#include "statusCodes.h" -#include "stringUtils.h" +#include "slab-depot-format.h" +#include "slab-journal.h" +#include "slab-summary.h" +#include "status-codes.h" +#include "string-utils.h" #include "vdo.h" -#include "vioPool.h" -#include "waitQueue.h" +#include "vio.h" +#include "vio-pool.h" +#include "wait-queue.h" static const uint64_t BYTES_PER_WORD = sizeof(uint64_t); static const bool NORMAL_OPERATION = true; /** - * Return the ref_counts from the ref_counts waiter. - * - * @param waiter The waiter to convert + * ref_counts_from_waiter() - Return the ref_counts from the ref_counts + * waiter. + * @waiter: The waiter to convert. * - * @return The ref_counts - **/ + * Return: The ref_counts. + */ static inline struct ref_counts * __must_check ref_counts_from_waiter(struct waiter *waiter) { @@ -69,15 +54,16 @@ ref_counts_from_waiter(struct waiter *waiter) } /** - * Convert the index of a reference counter back to the block number of the - * physical block for which it is counting references. The index is assumed to - * be valid and in-range. + * index_to_pbn() - Convert the index of a reference counter back to the block + * number of the physical block for which it is counting + * references. + * @ref_counts: The reference counts object. 
+ * @index: The array index of the reference counter. * - * @param ref_counts The reference counts object - * @param index The array index of the reference counter + * The index is assumed to be valid and in-range. * - * @return the physical block number corresponding to the index - **/ + * Return: The physical block number corresponding to the index. + */ static physical_block_number_t index_to_pbn(const struct ref_counts *ref_counts, uint64_t index) { @@ -86,12 +72,12 @@ index_to_pbn(const struct ref_counts *ref_counts, uint64_t index) /** - * Convert a reference count to a reference status. - * - * @param count The count to convert + * vdo_reference_count_to_status() - Convert a reference count to a reference + * status. + * @count: The count to convert. * - * @return The appropriate reference status - **/ + * Return: The appropriate reference status. + */ static enum reference_status __must_check vdo_reference_count_to_status(vdo_refcount_t count) { @@ -106,46 +92,57 @@ vdo_reference_count_to_status(vdo_refcount_t count) } } -/**********************************************************************/ +/** + * vdo_reset_search_cursor() - Reset the free block search back to the first + * reference counter in the first reference block. + * @ref_counts: The ref_counts object containing the search cursor. + */ void vdo_reset_search_cursor(struct ref_counts *ref_counts) { struct search_cursor *cursor = &ref_counts->search_cursor; cursor->block = cursor->first_block; cursor->index = 0; - // Unit tests have slabs with only one reference block (and it's a - // runt). + /* + * Unit tests have slabs with only one reference block (and it's a + * runt). + */ cursor->end_index = min((uint32_t) COUNTS_PER_BLOCK, ref_counts->block_count); } /** - * Advance the search cursor to the start of the next reference block, - * wrapping around to the first reference block if the current block is the - * last reference block. - * - * @param ref_counts The ref_counts object containing the search cursor + * advance_search_cursor() - Advance the search cursor to the start of the + * next reference block. + * @ref_counts: The ref_counts object containing the search cursor. + * + * Wraps around to the first reference block if the current block is the last + * reference block. * - * @return true unless the cursor was at the last reference block - **/ + * Return: true unless the cursor was at the last reference block. + */ static bool advance_search_cursor(struct ref_counts *ref_counts) { struct search_cursor *cursor = &ref_counts->search_cursor; - // If we just finished searching the last reference block, then wrap - // back around to the start of the array. + /* + * If we just finished searching the last reference block, then wrap + * back around to the start of the array. + */ if (cursor->block == cursor->last_block) { vdo_reset_search_cursor(ref_counts); return false; } - // We're not already at the end, so advance to cursor to the next - // block. + /* + * We're not already at the end, so advance to cursor to the next + * block. + */ cursor->block++; cursor->index = cursor->end_index; if (cursor->block == cursor->last_block) { - // The last reference block will usually be a runt. + /* The last reference block will usually be a runt. 
*/ cursor->end_index = ref_counts->block_count; } else { cursor->end_index += COUNTS_PER_BLOCK; @@ -153,8 +150,22 @@ static bool advance_search_cursor(struct ref_counts *ref_counts) return true; } -/**********************************************************************/ -int make_vdo_ref_counts(block_count_t block_count, +/** + * vdo_make_ref_counts() - Create a reference counting object. + * @block_count: The number of physical blocks that can be referenced. + * @slab: The slab of the ref counts object. + * @origin: The layer PBN at which to save ref_counts. + * @read_only_notifier: The context for tracking read-only mode. + * @ref_counts_ptr: The pointer to hold the new ref counts object. + * + * A reference counting object can keep a reference count for every physical + * block in the VDO configuration. Since we expect the vast majority of the + * blocks to have 0 or 1 reference counts, the structure is optimized for that + * situation. + * + * Return: A success or error code. + */ +int vdo_make_ref_counts(block_count_t block_count, struct vdo_slab *slab, physical_block_number_t origin, struct read_only_notifier *read_only_notifier, @@ -166,22 +177,24 @@ int make_vdo_ref_counts(block_count_t block_count, struct ref_counts *ref_counts; int result = UDS_ALLOCATE_EXTENDED(struct ref_counts, ref_block_count, - struct reference_block, - "ref counts structure", - &ref_counts); + struct reference_block, + "ref counts structure", + &ref_counts); if (result != UDS_SUCCESS) { return result; } - // Allocate such that the runt slab has a full-length memory array, - // plus a little padding so we can word-search even at the very end. + /* + * Allocate such that the runt slab has a full-length memory array, + * plus a little padding so we can word-search even at the very end. + */ bytes = ((ref_block_count * COUNTS_PER_BLOCK) + (2 * BYTES_PER_WORD)); result = UDS_ALLOCATE(bytes, vdo_refcount_t, "ref counts array", &ref_counts->counters); if (result != UDS_SUCCESS) { - free_vdo_ref_counts(ref_counts); + vdo_free_ref_counts(ref_counts); return result; } @@ -207,8 +220,11 @@ int make_vdo_ref_counts(block_count_t block_count, return VDO_SUCCESS; } -/**********************************************************************/ -void free_vdo_ref_counts(struct ref_counts *ref_counts) +/** + * vdo_free_ref_counts() - Free a reference counting object. + * @ref_counts: The object to free. + */ +void vdo_free_ref_counts(struct ref_counts *ref_counts) { if (ref_counts == NULL) { return; @@ -219,21 +235,23 @@ void free_vdo_ref_counts(struct ref_counts *ref_counts) } /** - * Check whether a ref_counts object has active I/O. + * has_active_io() - Check whether a ref_counts object has active I/O. + * @ref_counts: The ref_counts to check. * - * @param ref_counts The ref_counts to check - * - * @return true if there is reference block I/O or a summary - * update in progress - **/ + * Return: true if there is reference block I/O or a summary update in + * progress. + */ static bool __must_check has_active_io(struct ref_counts *ref_counts) { return ((ref_counts->active_count > 0) || ref_counts->updating_slab_summary); } -/**********************************************************************/ -bool are_vdo_ref_counts_active(struct ref_counts *ref_counts) +/** + * vdo_are_ref_counts_active() - Check whether a ref_counts is active. + * @ref_counts: The ref_counts to check. 
+ */ +bool vdo_are_ref_counts_active(struct ref_counts *ref_counts) { const struct admin_state_code *code; @@ -241,42 +259,39 @@ bool are_vdo_ref_counts_active(struct ref_counts *ref_counts) return true; } - // When not suspending or recovering, the ref_counts must be clean. - code = get_vdo_admin_state_code(&ref_counts->slab->state); + /* When not suspending or recovering, the ref_counts must be clean. */ + code = vdo_get_admin_state_code(&ref_counts->slab->state); return (has_waiters(&ref_counts->dirty_blocks) && (code != VDO_ADMIN_STATE_SUSPENDING) && (code != VDO_ADMIN_STATE_RECOVERING)); } -/**********************************************************************/ static void enter_ref_counts_read_only_mode(struct ref_counts *ref_counts, int result) { vdo_enter_read_only_mode(ref_counts->read_only_notifier, result); - check_if_vdo_slab_drained(ref_counts->slab); + vdo_check_if_slab_drained(ref_counts->slab); } /** - * Enqueue a block on the dirty queue. - * - * @param block The block to enqueue - **/ + * enqueue_dirty_block() - Enqueue a block on the dirty queue. + * @block: The block to enqueue. + */ static void enqueue_dirty_block(struct reference_block *block) { int result = enqueue_waiter(&block->ref_counts->dirty_blocks, &block->waiter); if (result != VDO_SUCCESS) { - // This should never happen. + /* This should never happen. */ enter_ref_counts_read_only_mode(block->ref_counts, result); } } /** - * Mark a reference count block as dirty, potentially adding it to the dirty - * queue if it wasn't already dirty. - * - * @param block The reference block to mark as dirty - **/ + * dirty_block() - Mark a reference count block as dirty, potentially adding + * it to the dirty queue if it wasn't already dirty. + * @block: The reference block to mark as dirty. + */ static void dirty_block(struct reference_block *block) { if (block->is_dirty) { @@ -285,15 +300,23 @@ static void dirty_block(struct reference_block *block) block->is_dirty = true; if (block->is_writing) { - // The conclusion of the current write will enqueue the block - // again. + /* + * The conclusion of the current write will enqueue the block + * again. + */ return; } enqueue_dirty_block(block); } -/**********************************************************************/ +/** + * vdo_get_unreferenced_block_count() - Get the stored count of the number of + * blocks that are currently free. + * @ref_counts: The ref_counts object. + * + * Return: The number of blocks with a reference count of zero. + */ block_count_t vdo_get_unreferenced_block_count(struct ref_counts *ref_counts) { @@ -301,11 +324,11 @@ vdo_get_unreferenced_block_count(struct ref_counts *ref_counts) } /** - * Get the reference block that covers the given block index. - * - * @param ref_counts The refcounts object - * @param index The block index - **/ + * vdo_get_reference_block() - Get the reference block that covers the given + * block index. + * @ref_counts: The refcounts object. + * @index: The block index. + */ static struct reference_block * __must_check vdo_get_reference_block(struct ref_counts *ref_counts, slab_block_number index) @@ -314,19 +337,19 @@ vdo_get_reference_block(struct ref_counts *ref_counts, } /** - * Get the reference counter that covers the given physical block number. 
- * - * @param [in] ref_counts The refcounts object - * @param [in] pbn The physical block number - * @param [out] counter_ptr A pointer to the reference counter - - **/ + * get_reference_counter() - Get the reference counter that covers the given + * physical block number. + * @ref_counts: The refcounts object. + * @pbn: The physical block number. + * @counter_ptr: A pointer to the reference counter. + */ static int get_reference_counter(struct ref_counts *ref_counts, physical_block_number_t pbn, vdo_refcount_t **counter_ptr) { slab_block_number index; int result = vdo_slab_block_number_from_pbn(ref_counts->slab, pbn, &index); + if (result != VDO_SUCCESS) { return result; } @@ -336,12 +359,20 @@ static int get_reference_counter(struct ref_counts *ref_counts, return VDO_SUCCESS; } -/**********************************************************************/ +/** + * vdo_get_available_references() - Determine how many times a reference count + * can be incremented without overflowing. + * @ref_counts: The ref_counts object. + * @pbn: The physical block number. + * + * Return: The number of increments that can be performed. + */ uint8_t vdo_get_available_references(struct ref_counts *ref_counts, physical_block_number_t pbn) { vdo_refcount_t *counter_ptr = NULL; int result = get_reference_counter(ref_counts, pbn, &counter_ptr); + if (result != VDO_SUCCESS) { return 0; } @@ -354,25 +385,18 @@ uint8_t vdo_get_available_references(struct ref_counts *ref_counts, } /** - * Increment the reference count for a data block. + * increment_for_data() - Increment the reference count for a data block. + * @ref_counts: The ref_counts responsible for the block. + * @block: The reference block which contains the block being updated. + * @block_number: The block to update. + * @old_status: The reference status of the data block before this increment. + * @lock: The pbn_lock associated with this increment (may be NULL). + * @counter_ptr: A pointer to the count for the data block (in, out). + * @free_status_changed: A pointer which will be set to true if this update + * changed the free status of the block. * - * @param [in] ref_counts The ref_counts responsible for the - * block - * @param [in] block The reference block which contains the - * block being updated - * @param [in] block_number The block to update - * @param [in] old_status The reference status of the data block - * before this increment - * @param [in] lock The pbn_lock associated with this - * increment (may be NULL) - * @param [in,out] counter_ptr A pointer to the count for the data - * block - * @param [out] free_status_changed A pointer which will be set to true if - * this update changed the free status of - * the block - * - * @return VDO_SUCCESS or an error - **/ + * Return: VDO_SUCCESS or an error. + */ static int increment_for_data(struct ref_counts *ref_counts, struct reference_block *block, slab_block_number block_number, @@ -395,7 +419,7 @@ static int increment_for_data(struct ref_counts *ref_counts, break; default: - // Single or shared + /* Single or shared */ if (*counter_ptr >= MAXIMUM_REFERENCE_COUNT) { return uds_log_error_strerror(VDO_REF_COUNT_INVALID, "Incrementing a block already having 254 references (slab %u, offset %u)", @@ -407,31 +431,25 @@ static int increment_for_data(struct ref_counts *ref_counts, } if (lock != NULL) { - unassign_vdo_pbn_lock_provisional_reference(lock); + vdo_unassign_pbn_lock_provisional_reference(lock); } return VDO_SUCCESS; } /** - * Decrement the reference count for a data block. 
- * - * @param [in] ref_counts The ref_counts responsible for the - * block - * @param [in] block The reference block which contains the - * block being updated - * @param [in] block_number The block to update - * @param [in] old_status The reference status of the data block - * before this decrement - * @param [in] lock The pbn_lock associated with the block - * being decremented (may be NULL) - * @param [in,out] counter_ptr A pointer to the count for the data - * block - * @param [out] free_status_changed A pointer which will be set to true if - * this update changed the free status of - * the block + * decrement_for_data() - Decrement the reference count for a data block. + * @ref_counts: The ref_counts responsible for the block. + * @block: The reference block which contains the block being updated. + * @block_number: The block to update. + * @old_status: The reference status of the data block before this decrement. + * @lock: The pbn_lock associated with the block being decremented + * (may be NULL). + * @counter_ptr: A pointer to the count for the data block (in, out). + * @free_status_changed: A pointer which will be set to true if this update + * changed the free status of the block. * - * @return VDO_SUCCESS or an error - **/ + * Return: VDO_SUCCESS or an error. + */ static int decrement_for_data(struct ref_counts *ref_counts, struct reference_block *block, slab_block_number block_number, @@ -450,11 +468,13 @@ static int decrement_for_data(struct ref_counts *ref_counts, case RS_PROVISIONAL: case RS_SINGLE: if (lock != NULL) { - // There is a read lock on this block, so the block must - // not become unreferenced. + /* + * There is a read lock on this block, so the block must + * not become unreferenced. + */ *counter_ptr = PROVISIONAL_REFERENCE_COUNT; *free_status_changed = false; - assign_vdo_pbn_lock_provisional_reference(lock); + vdo_assign_pbn_lock_provisional_reference(lock); } else { *counter_ptr = EMPTY_REFERENCE_COUNT; block->allocated_count--; @@ -464,7 +484,7 @@ static int decrement_for_data(struct ref_counts *ref_counts, break; default: - // Shared + /* Shared */ (*counter_ptr)--; *free_status_changed = false; } @@ -473,30 +493,27 @@ static int decrement_for_data(struct ref_counts *ref_counts, } /** - * Increment the reference count for a block map page. All block map - * increments should be from provisional to MAXIMUM_REFERENCE_COUNT. Since - * block map blocks never dedupe they should never be adjusted from any other - * state. The adjustment always results in MAXIMUM_REFERENCE_COUNT as this - * value is used to prevent dedupe against block map blocks. + * increment_for_block_map() - Increment the reference count for a block map + * page. + * @ref_counts: The ref_counts responsible for the block. + * @block: The reference block which contains the block being updated. + * @block_number: The block to update. + * @old_status: The reference status of the block before this increment. + * @lock: The pbn_lock associated with this increment (may be NULL). + * @normal_operation: Whether we are in normal operation vs. recovery or + * rebuild. + * @counter_ptr: A pointer to the count for the block (in, out). + * @free_status_changed: A pointer which will be set to true if this update + * changed the free status of the block. 
* - * @param [in] ref_counts The ref_counts responsible for the - * block - * @param [in] block The reference block which contains the - * block being updated - * @param [in] block_number The block to update - * @param [in] old_status The reference status of the block - * before this increment - * @param [in] lock The pbn_lock associated with this - * increment (may be NULL) - * @param [in] normal_operation Whether we are in normal operation vs. - * recovery or rebuild - * @param [in,out] counter_ptr A pointer to the count for the block - * @param [out] free_status_changed A pointer which will be set to true if - * this update changed the free status of - * the block + * All block map increments should be from provisional to + * MAXIMUM_REFERENCE_COUNT. Since block map blocks never dedupe they should + * never be adjusted from any other state. The adjustment always results in + * MAXIMUM_REFERENCE_COUNT as this value is used to prevent dedupe against + * block map blocks. * - * @return VDO_SUCCESS or an error - **/ + * Return: VDO_SUCCESS or an error. + */ static int increment_for_block_map(struct ref_counts *ref_counts, struct reference_block *block, slab_block_number block_number, @@ -532,7 +549,7 @@ static int increment_for_block_map(struct ref_counts *ref_counts, *counter_ptr = MAXIMUM_REFERENCE_COUNT; *free_status_changed = false; if (lock != NULL) { - unassign_vdo_pbn_lock_provisional_reference(lock); + vdo_unassign_pbn_lock_provisional_reference(lock); } return VDO_SUCCESS; @@ -546,27 +563,23 @@ static int increment_for_block_map(struct ref_counts *ref_counts, } /** - * Update the reference count of a block. - * - * @param [in] ref_counts The ref_counts responsible for the - * block - * @param [in] block The reference block which contains - * the block being updated - * @param [in] block_number The block to update - * @param [in] slab_journal_point The slab journal point at which this - * update is journaled - * @param [in] operation How to update the count - * @param [in] normal_operation Whether we are in normal operation - * vs. recovery or rebuild - * @param [out] free_status_changed A pointer which will be set to true - * if this update changed the free - * status of the block - * @param [out] provisional_decrement_ptr A pointer which will be set to true - * if this update was a decrement of a - * provisional reference + * update_reference_count() - Update the reference count of a block. + * @ref_counts: The ref_counts responsible for the block. + * @block: The reference block which contains the block being updated. + * @block_number: The block to update. + * @slab_journal_point: The slab journal point at which this update is + * journaled. + * @operation: How to update the count. + * @normal_operation: Whether we are in normal operation vs. recovery + * or rebuild. + * @free_status_changed: A pointer which will be set to true if this update + * changed the free status of the block. + * @provisional_decrement_ptr: A pointer which will be set to true if this + * update was a decrement of a provisional + * reference. * - * @return VDO_SUCCESS or an error - **/ + * Return: VDO_SUCCESS or an error. 
+ */ static int update_reference_count(struct ref_counts *ref_counts, struct reference_block *block, @@ -580,11 +593,11 @@ update_reference_count(struct ref_counts *ref_counts, vdo_refcount_t *counter_ptr = &ref_counts->counters[block_number]; enum reference_status old_status = vdo_reference_count_to_status(*counter_ptr); - struct pbn_lock *lock = get_vdo_reference_operation_pbn_lock(operation); + struct pbn_lock *lock = vdo_get_reference_operation_pbn_lock(operation); int result; switch (operation.type) { - case DATA_INCREMENT: + case VDO_JOURNAL_DATA_INCREMENT: result = increment_for_data(ref_counts, block, block_number, @@ -594,7 +607,7 @@ update_reference_count(struct ref_counts *ref_counts, free_status_changed); break; - case DATA_DECREMENT: + case VDO_JOURNAL_DATA_DECREMENT: result = decrement_for_data(ref_counts, block, block_number, @@ -610,7 +623,7 @@ update_reference_count(struct ref_counts *ref_counts, } break; - case BLOCK_MAP_INCREMENT: + case VDO_JOURNAL_BLOCK_MAP_INCREMENT: result = increment_for_block_map(ref_counts, block, block_number, @@ -632,14 +645,25 @@ update_reference_count(struct ref_counts *ref_counts, return result; } - if (is_valid_vdo_journal_point(slab_journal_point)) { + if (vdo_is_valid_journal_point(slab_journal_point)) { ref_counts->slab_journal_point = *slab_journal_point; } return VDO_SUCCESS; } -/**********************************************************************/ +/** + * vdo_adjust_reference_count() - Adjust the reference count of a block. + * @ref_counts: The refcounts object. + * @operation: The operation to perform. + * @slab_journal_point: The slab journal entry for this adjustment. + * @free_status_changed: A pointer which will be set to true if the free status + * of the block changed. + * + * Return: A success or error code, specifically: VDO_REF_COUNT_INVALID if a + * decrement would result in a negative reference count, or an + * increment in a count greater than MAXIMUM_REFS + */ int vdo_adjust_reference_count(struct ref_counts *ref_counts, struct reference_operation operation, const struct journal_point *slab_journal_point, @@ -650,7 +674,7 @@ int vdo_adjust_reference_count(struct ref_counts *ref_counts, struct reference_block *block; bool provisional_decrement = false; - if (!is_vdo_slab_open(ref_counts->slab)) { + if (!vdo_is_slab_open(ref_counts->slab)) { return VDO_INVALID_ADMIN_STATE; } @@ -679,13 +703,13 @@ int vdo_adjust_reference_count(struct ref_counts *ref_counts, * release the per-entry slab journal lock for the entry * associated with the update we are now doing. */ - result = ASSERT(is_valid_vdo_journal_point(slab_journal_point), + result = ASSERT(vdo_is_valid_journal_point(slab_journal_point), "Reference count adjustments need slab journal points."); if (result != VDO_SUCCESS) { return result; } - adjust_vdo_slab_journal_block_reference(ref_counts->slab->journal, + vdo_adjust_slab_journal_block_reference(ref_counts->slab->journal, entry_lock, -1); return VDO_SUCCESS; @@ -697,7 +721,7 @@ int vdo_adjust_reference_count(struct ref_counts *ref_counts, * cleaned. Therefore, we convert the per-entry slab journal lock to an * uncommitted reference block lock, if there is a per-entry lock. 
*/ - if (is_valid_vdo_journal_point(slab_journal_point)) { + if (vdo_is_valid_journal_point(slab_journal_point)) { block->slab_journal_lock = slab_journal_point->sequence_number; } else { block->slab_journal_lock = 0; @@ -707,7 +731,15 @@ int vdo_adjust_reference_count(struct ref_counts *ref_counts, return VDO_SUCCESS; } -/**********************************************************************/ +/** + * vdo_adjust_reference_count_for_rebuild() - Adjust the reference count of a + * block during rebuild. + * @ref_counts: The refcounts object. + * @pbn: The number of the block to adjust. + * @operation: The operation to perform on the count. + * + * Return: VDO_SUCCESS or an error. + */ int vdo_adjust_reference_count_for_rebuild(struct ref_counts *ref_counts, physical_block_number_t pbn, enum journal_operation operation) @@ -739,7 +771,19 @@ int vdo_adjust_reference_count_for_rebuild(struct ref_counts *ref_counts, return VDO_SUCCESS; } -/**********************************************************************/ +/** + * vdo_replay_reference_count_change() - Replay the reference count adjustment + * from a slab journal entry into the + * reference count for a block. + * @ref_counts: The refcounts object. + * @entry_point: The slab journal point for the entry. + * @entry: The slab journal entry being replayed. + * + * The adjustment will be ignored if it was already recorded in the reference + * count. + * + * Return: VDO_SUCCESS or an error code. + */ int vdo_replay_reference_count_change(struct ref_counts *ref_counts, const struct journal_point *entry_point, struct slab_journal_entry entry) @@ -752,13 +796,15 @@ int vdo_replay_reference_count_change(struct ref_counts *ref_counts, COUNTS_PER_SECTOR; struct reference_operation operation = { .type = entry.operation }; - if (!before_vdo_journal_point(&block->commit_points[sector], entry_point)) { - // This entry is already reflected in the existing counts, so - // do nothing. + if (!vdo_before_journal_point(&block->commit_points[sector], entry_point)) { + /* + * This entry is already reflected in the existing counts, so + * do nothing. + */ return VDO_SUCCESS; } - // This entry is not yet counted in the reference counts. + /* This entry is not yet counted in the reference counts. */ result = update_reference_count(ref_counts, block, entry.sbn, entry_point, operation, !NORMAL_OPERATION, @@ -773,17 +819,18 @@ int vdo_replay_reference_count_change(struct ref_counts *ref_counts, /** - * Find the array index of the first zero byte in word-sized range of - * reference counters. The search does no bounds checking; the function relies - * on the array being sufficiently padded. + * find_zero_byte_in_word() - Find the array index of the first zero byte in + * word-sized range of reference counters. + * @word_ptr: A pointer to the eight counter bytes to check. + * @start_index: The array index corresponding to word_ptr[0]. + * @fail_index: The array index to return if no zero byte is found. * - * @param word_ptr A pointer to the eight counter bytes to check - * @param start_index The array index corresponding to word_ptr[0] - * @param fail_index The array index to return if no zero byte is found - - * @return the array index of the first zero byte in the word, or - * the value passed as fail_index if no zero byte was found - **/ + * The search does no bounds checking; the function relies on the array being + * sufficiently padded. 
+ * + * Return: The array index of the first zero byte in the word, or + * the value passed as fail_index if no zero byte was found. + */ static inline slab_block_number find_zero_byte_in_word(const byte *word_ptr, slab_block_number start_index, @@ -791,11 +838,14 @@ find_zero_byte_in_word(const byte *word_ptr, { uint64_t word = get_unaligned_le64(word_ptr); - // This looks like a loop, but GCC will unroll the eight iterations for - // us. + /* + * This looks like a loop, but GCC will unroll the eight iterations for + * us. + */ unsigned int offset; + for (offset = 0; offset < BYTES_PER_WORD; offset++) { - // Assumes little-endian byte order, which we have on X86. + /* Assumes little-endian byte order, which we have on X86. */ if ((word & 0xFF) == 0) { return (start_index + offset); } @@ -805,7 +855,20 @@ find_zero_byte_in_word(const byte *word_ptr, return fail_index; } -/**********************************************************************/ +/** + * vdo_find_free_block() - Find the first block with a reference count of zero + * in the specified range of reference counter indexes. + * @ref_counts: The reference counters to scan. + * @start_index: The array index at which to start scanning (included in the + * scan). + * @end_index: The array index at which to stop scanning (excluded from the + * scan). + * @index_ptr: A pointer to hold the array index of the free block. + * + * Exposed for unit testing. + * + * Return: true if a free block was found in the specified range. + */ static bool vdo_find_free_block(const struct ref_counts *ref_counts, slab_block_number start_index, @@ -817,8 +880,10 @@ bool vdo_find_free_block(const struct ref_counts *ref_counts, byte *next_counter = &ref_counts->counters[next_index]; byte *end_counter = &ref_counts->counters[end_index]; - // Search every byte of the first unaligned word. (Array is padded so - // reading past end is safe.) + /* + * Search every byte of the first unaligned word. (Array is padded so + * reading past end is safe.) + */ zero_index = find_zero_byte_in_word(next_counter, next_index, end_index); if (zero_index < end_index) { @@ -826,13 +891,17 @@ bool vdo_find_free_block(const struct ref_counts *ref_counts, return true; } - // On architectures where unaligned word access is expensive, this - // would be a good place to advance to an alignment boundary. + /* + * On architectures where unaligned word access is expensive, this + * would be a good place to advance to an alignment boundary. + */ next_index += BYTES_PER_WORD; next_counter += BYTES_PER_WORD; - // Now we're word-aligned; check an word at a time until we find a word - // containing a zero. (Array is padded so reading past end is safe.) + /* + * Now we're word-aligned; check a word at a time until we find a word + * containing a zero. (Array is padded so reading past end is safe.) + */ while (next_counter < end_counter) { /* * The following code is currently an exact copy of the code @@ -856,19 +925,20 @@ bool vdo_find_free_block(const struct ref_counts *ref_counts, } /** - * Search the reference block currently saved in the search cursor for a - * reference count of zero, starting at the saved counter index. - * - * @param [in] ref_counts The ref_counts object to search - * @param [out] free_index_ptr A pointer to receive the array index of the - * zero reference count + * search_current_reference_block() - Search the reference block currently + * saved in the search cursor for a + * reference count of zero, starting at the + * saved counter index.
+ * @ref_counts: The ref_counts object to search. + * @free_index_ptr: A pointer to receive the array index of the zero reference + * count. * - * @return true if an unreferenced counter was found - **/ + * Return: true if an unreferenced counter was found. + */ static bool search_current_reference_block(const struct ref_counts *ref_counts, slab_block_number *free_index_ptr) { - // Don't bother searching if the current block is known to be full. + /* Don't bother searching if the current block is known to be full. */ return ((ref_counts->search_cursor.block->allocated_count < COUNTS_PER_BLOCK) && vdo_find_free_block(ref_counts, @@ -878,25 +948,28 @@ static bool search_current_reference_block(const struct ref_counts *ref_counts, } /** - * Search each reference block for a reference count of zero, starting at the - * reference block and counter index saved in the search cursor and searching - * up to the end of the last reference block. The search does not wrap. + * search_reference_blocks() - Search each reference block for a reference + * count of zero. + * @ref_counts: The ref_counts object to search. + * @free_index_ptr: A pointer to receive the array index of the zero + * reference count. * - * @param [in] ref_counts The ref_counts object to search - * @param [out] free_index_ptr A pointer to receive the array index of the - * zero reference count + * Searches each reference block for a reference count of zero, starting at + * the reference block and counter index saved in the search cursor and + * searching up to the end of the last reference block. The search does not + * wrap. * - * @return true if an unreferenced counter was found - **/ + * Return: true if an unreferenced counter was found. + */ static bool search_reference_blocks(struct ref_counts *ref_counts, slab_block_number *free_index_ptr) { - // Start searching at the saved search position in the current block. + /* Start searching at the saved search position in the current block. */ if (search_current_reference_block(ref_counts, free_index_ptr)) { return true; } - // Search each reference block up to the end of the slab. + /* Search each reference block up to the end of the slab. */ while (advance_search_cursor(ref_counts)) { if (search_current_reference_block(ref_counts, free_index_ptr)) { return true; @@ -907,32 +980,48 @@ static bool search_reference_blocks(struct ref_counts *ref_counts, } /** - * Do the bookkeeping for making a provisional reference. - * - * @param ref_counts The ref_counts - * @param block_number The block to reference - **/ + * make_provisional_reference() - Do the bookkeeping for making a provisional + * reference. + * @ref_counts: The ref_counts. + * @block_number: The block to reference. + */ static void make_provisional_reference(struct ref_counts *ref_counts, slab_block_number block_number) { struct reference_block *block = vdo_get_reference_block(ref_counts, block_number); - // Make the initial transition from an unreferenced block to a - // provisionally allocated block. + /* + * Make the initial transition from an unreferenced block to a + * provisionally allocated block. + */ ref_counts->counters[block_number] = PROVISIONAL_REFERENCE_COUNT; - // Account for the allocation. + /* Account for the allocation. 
*/ block->allocated_count++; ref_counts->free_blocks--; } -/**********************************************************************/ +/** + * vdo_allocate_unreferenced_block() - Find a block with a reference count of + * zero in the range of physical block + * numbers tracked by the reference + * counter. + * @ref_counts: The reference counters to scan. + * @allocated_ptr: A pointer to hold the physical block number of the block + * that was found and allocated. + * + * If a free block is found, that block is allocated by marking it as + * provisionally referenced, and the allocated block number is returned. + * + * Return: VDO_SUCCESS if a free block was found and allocated; VDO_NO_SPACE + * if there are no unreferenced blocks; otherwise an error code. + */ int vdo_allocate_unreferenced_block(struct ref_counts *ref_counts, physical_block_number_t *allocated_ptr) { slab_block_number free_index; - if (!is_vdo_slab_open(ref_counts->slab)) { + if (!vdo_is_slab_open(ref_counts->slab)) { return VDO_INVALID_ADMIN_STATE; } @@ -945,15 +1034,25 @@ int vdo_allocate_unreferenced_block(struct ref_counts *ref_counts, "free block must have ref count of zero"); make_provisional_reference(ref_counts, free_index); - // Update the search hint so the next search will start at the array - // index just past the free block we just found. + /* + * Update the search hint so the next search will start at the array + * index just past the free block we just found. + */ ref_counts->search_cursor.index = (free_index + 1); *allocated_ptr = index_to_pbn(ref_counts, free_index); return VDO_SUCCESS; } -/**********************************************************************/ +/** + * vdo_provisionally_reference_block() - Provisionally reference a block if it + * is unreferenced. + * @ref_counts: The reference counters. + * @pbn: The PBN to reference. + * @lock: The pbn_lock on the block (may be NULL). + * + * Return: VDO_SUCCESS or an error. + */ int vdo_provisionally_reference_block(struct ref_counts *ref_counts, physical_block_number_t pbn, struct pbn_lock *lock) @@ -961,7 +1060,7 @@ int vdo_provisionally_reference_block(struct ref_counts *ref_counts, slab_block_number block_number; int result; - if (!is_vdo_slab_open(ref_counts->slab)) { + if (!vdo_is_slab_open(ref_counts->slab)) { return VDO_INVALID_ADMIN_STATE; } @@ -974,7 +1073,7 @@ int vdo_provisionally_reference_block(struct ref_counts *ref_counts, if (ref_counts->counters[block_number] == EMPTY_REFERENCE_COUNT) { make_provisional_reference(ref_counts, block_number); if (lock != NULL) { - assign_vdo_pbn_lock_provisional_reference(lock); + vdo_assign_pbn_lock_provisional_reference(lock); } } @@ -983,13 +1082,12 @@ int vdo_provisionally_reference_block(struct ref_counts *ref_counts, /** - * Convert a reference_block's generic wait queue entry back into the - * reference_block. + * waiter_as_reference_block() - Convert a reference_block's generic wait + * queue entry back into the reference_block. + * @waiter: The wait queue entry to convert. * - * @param waiter The wait queue entry to convert - * - * @return The wrapping reference_block - **/ + * Return: The wrapping reference_block. + */ static inline struct reference_block * waiter_as_reference_block(struct waiter *waiter) { @@ -998,16 +1096,18 @@ waiter_as_reference_block(struct waiter *waiter) /** - * A waiter callback that resets the writing state of ref_counts. - **/ + * finish_summary_update() - A waiter callback that resets the writing state + * of ref_counts.
+ */ static void finish_summary_update(struct waiter *waiter, void *context) { struct ref_counts *ref_counts = ref_counts_from_waiter(waiter); int result = *((int *)context); + ref_counts->updating_slab_summary = false; if ((result == VDO_SUCCESS) || (result == VDO_READ_ONLY)) { - check_if_vdo_slab_drained(ref_counts->slab); + vdo_check_if_slab_drained(ref_counts->slab); return; } @@ -1016,20 +1116,20 @@ static void finish_summary_update(struct waiter *waiter, void *context) } /** - * Update slab summary that the ref_counts object is clean. - * - * @param ref_counts The ref_counts object that is being written - **/ + * update_slab_summary_as_clean() - Update slab summary that the ref_counts + * object is clean. + * @ref_counts: The ref_counts object that is being written. + */ static void update_slab_summary_as_clean(struct ref_counts *ref_counts) { tail_block_offset_t offset; struct slab_summary_zone *summary = - get_vdo_slab_summary_zone(ref_counts->slab->allocator); + ref_counts->slab->allocator->summary; if (summary == NULL) { return; } - // Update the slab summary to indicate this ref_counts is clean. + /* Update the slab summary to indicate this ref_counts is clean. */ offset = vdo_get_summarized_tail_block_offset(summary, ref_counts->slab->slab_number); ref_counts->updating_slab_summary = true; @@ -1044,39 +1144,42 @@ static void update_slab_summary_as_clean(struct ref_counts *ref_counts) } /** - * Handle an I/O error reading or writing a reference count block. - * - * @param completion The VIO doing the I/O as a completion - **/ + * handle_io_error() - Handle an I/O error reading or writing a reference + * count block. + * @completion: The VIO doing the I/O as a completion. + */ static void handle_io_error(struct vdo_completion *completion) { int result = completion->result; struct vio_pool_entry *entry = completion->parent; struct ref_counts *ref_counts = - ((struct reference_block *)entry->parent)->ref_counts; - return_vdo_block_allocator_vio(ref_counts->slab->allocator, entry); + ((struct reference_block *) entry->parent)->ref_counts; + + record_metadata_io_error(as_vio(completion)); + vdo_return_block_allocator_vio(ref_counts->slab->allocator, entry); ref_counts->active_count--; enter_ref_counts_read_only_mode(ref_counts, result); } /** - * After a reference block has written, clean it, release its locks, and return - * its VIO to the pool. - * - * @param completion The VIO that just finished writing - **/ + * finish_reference_block_write() - After a reference block has written, clean + * it, release its locks, and return its VIO + * to the pool. + * @completion: The VIO that just finished writing. + */ static void finish_reference_block_write(struct vdo_completion *completion) { struct vio_pool_entry *entry = completion->parent; struct reference_block *block = entry->parent; struct ref_counts *ref_counts = block->ref_counts; + ref_counts->active_count--; - // Release the slab journal lock. - adjust_vdo_slab_journal_block_reference(ref_counts->slab->journal, + /* Release the slab journal lock. 
*/ + vdo_adjust_slab_journal_block_reference(ref_counts->slab->journal, block->slab_journal_lock_to_release, -1); - return_vdo_block_allocator_vio(ref_counts->slab->allocator, entry); + vdo_return_block_allocator_vio(ref_counts->slab->allocator, entry); /* * We can't clear the is_writing flag earlier as releasing the slab @@ -1086,24 +1189,28 @@ static void finish_reference_block_write(struct vdo_completion *completion) block->is_writing = false; if (vdo_is_read_only(ref_counts->read_only_notifier)) { - check_if_vdo_slab_drained(ref_counts->slab); + vdo_check_if_slab_drained(ref_counts->slab); return; } - // Re-queue the block if it was re-dirtied while it was writing. + /* Re-queue the block if it was re-dirtied while it was writing. */ if (block->is_dirty) { enqueue_dirty_block(block); - if (is_vdo_slab_draining(ref_counts->slab)) { - // We must be saving, and this block will otherwise not - // be relaunched. + if (vdo_is_slab_draining(ref_counts->slab)) { + /* + * We must be saving, and this block will otherwise not + * be relaunched. + */ vdo_save_dirty_reference_blocks(ref_counts); } return; } - // Mark the ref_counts as clean in the slab summary if there are no - // dirty or writing blocks and no summary update in progress. + /* + * Mark the ref_counts as clean in the slab summary if there are no + * dirty or writing blocks and no summary update in progress. + */ if (!has_active_io(ref_counts) && !has_waiters(&ref_counts->dirty_blocks)) { update_slab_summary_as_clean(ref_counts); @@ -1111,25 +1218,26 @@ static void finish_reference_block_write(struct vdo_completion *completion) } /** - * Find the reference counters for a given block. + * vdo_get_reference_counters_for_block() - Find the reference counters for a + * given block. + * @block: The reference_block in question. * - * @param block The reference_block in question - * - * @return A pointer to the reference counters for this block - **/ + * Return: A pointer to the reference counters for this block. + */ static vdo_refcount_t * __must_check vdo_get_reference_counters_for_block(struct reference_block *block) { size_t block_index = block - block->ref_counts->blocks; + return &block->ref_counts->counters[block_index * COUNTS_PER_BLOCK]; } /** - * Copy data from a reference block to a buffer ready to be written out. - * - * @param block The block to copy - * @param buffer The char buffer to fill with the packed block - **/ + * vdo_pack_reference_block() - Copy data from a reference block to a buffer + * ready to be written out. + * @block: The block to copy. + * @buffer: The char buffer to fill with the packed block. 
+ */ static void vdo_pack_reference_block(struct reference_block *block, void *buffer) { @@ -1137,7 +1245,8 @@ vdo_pack_reference_block(struct reference_block *block, void *buffer) vdo_refcount_t *counters = vdo_get_reference_counters_for_block(block); sector_count_t i; struct packed_journal_point commit_point; - pack_vdo_journal_point(&block->ref_counts->slab_journal_point, + + vdo_pack_journal_point(&block->ref_counts->slab_journal_point, &commit_point); for (i = 0; i < VDO_SECTORS_PER_BLOCK; i++) { @@ -1148,13 +1257,23 @@ vdo_pack_reference_block(struct reference_block *block, void *buffer) } } +static void write_reference_block_endio(struct bio *bio) +{ + struct vio *vio = bio->bi_private; + struct vio_pool_entry *entry = vio->completion.parent; + struct reference_block *block = entry->parent; + thread_id_t thread_id = block->ref_counts->slab->allocator->thread_id; + + continue_vio_after_io(vio, finish_reference_block_write, thread_id); +} + /** - * After a dirty block waiter has gotten a VIO from the VIO pool, copy its - * counters and associated data into the VIO, and launch the write. - * - * @param block_waiter The waiter of the dirty block - * @param vio_context The VIO returned by the pool - **/ + * write_reference_block() - After a dirty block waiter has gotten a VIO from + * the VIO pool, copy its counters and associated + * data into the VIO, and launch the write. + * @block_waiter: The waiter of the dirty block. + * @vio_context: The VIO returned by the pool. + */ static void write_reference_block(struct waiter *block_waiter, void *vio_context) { @@ -1163,6 +1282,7 @@ static void write_reference_block(struct waiter *block_waiter, struct vio_pool_entry *entry = vio_context; struct reference_block *block = waiter_as_reference_block(block_waiter); + vdo_pack_reference_block(block, entry->buffer); block_offset = (block - block->ref_counts->blocks); @@ -1177,35 +1297,39 @@ static void write_reference_block(struct waiter *block_waiter, */ block->is_dirty = false; - // Flush before writing to ensure that the recovery journal and slab - // journal entries which cover this reference update are stable - // (VDO-2331). + /* + * Flush before writing to ensure that the recovery journal and slab + * journal entries which cover this reference update are stable + * (VDO-2331). + */ WRITE_ONCE(block->ref_counts->statistics->blocks_written, block->ref_counts->statistics->blocks_written + 1); entry->vio->completion.callback_thread_id = block->ref_counts->slab->allocator->thread_id; - launch_write_metadata_vio_with_flush(entry->vio, - pbn, - finish_reference_block_write, - handle_io_error, - true, - false); + submit_metadata_vio(entry->vio, + pbn, + write_reference_block_endio, + handle_io_error, + REQ_OP_WRITE | REQ_PREFLUSH); } /** - * Launch the write of a dirty reference block by first acquiring a VIO for it - * from the pool. This can be asynchronous since the writer will have to wait - * if all VIOs in the pool are currently in use. + * launch_reference_block_write() - Launch the write of a dirty reference + * block by first acquiring a VIO for it from + * the pool. + * @block_waiter: The waiter of the block which is starting to write. + * @context: The parent ref_counts of the block. * - * @param block_waiter The waiter of the block which is starting to write - * @param context The parent ref_counts of the block - **/ + * This can be asynchronous since the writer will have to wait if all VIOs in + * the pool are currently in use. 
+ */ static void launch_reference_block_write(struct waiter *block_waiter, void *context) { struct reference_block *block; int result; struct ref_counts *ref_counts = context; + if (vdo_is_read_only(ref_counts->read_only_notifier)) { return; } @@ -1214,16 +1338,20 @@ static void launch_reference_block_write(struct waiter *block_waiter, block = waiter_as_reference_block(block_waiter); block->is_writing = true; block_waiter->callback = write_reference_block; - result = acquire_vdo_block_allocator_vio(ref_counts->slab->allocator, + result = vdo_acquire_block_allocator_vio(ref_counts->slab->allocator, block_waiter); if (result != VDO_SUCCESS) { - // This should never happen. + /* This should never happen. */ ref_counts->active_count--; enter_ref_counts_read_only_mode(ref_counts, result); } } -/**********************************************************************/ +/** + * vdo_save_oldest_reference_block() - Request a ref_counts object save its + * oldest dirty block asynchronously. + * @ref_counts: The ref_counts object to notify. + */ static void vdo_save_oldest_reference_block(struct ref_counts *ref_counts) { @@ -1232,7 +1360,14 @@ void vdo_save_oldest_reference_block(struct ref_counts *ref_counts) ref_counts); } -/**********************************************************************/ +/** + * vdo_save_several_reference_blocks() - Request a ref_counts object save + * several dirty blocks asynchronously. + * @ref_counts: The ref_counts object to notify. + * @flush_divisor: The inverse fraction of the dirty blocks to write. + * + * This function currently writes 1 / flush_divisor of the dirty blocks. + */ void vdo_save_several_reference_blocks(struct ref_counts *ref_counts, size_t flush_divisor) { @@ -1244,7 +1379,7 @@ void vdo_save_several_reference_blocks(struct ref_counts *ref_counts, } blocks_to_write = dirty_block_count / flush_divisor; - // Always save at least one block. + /* Always save at least one block. */ if (blocks_to_write == 0) { blocks_to_write = 1; } @@ -1254,33 +1389,43 @@ void vdo_save_several_reference_blocks(struct ref_counts *ref_counts, } } -/**********************************************************************/ +/** + * vdo_save_dirty_reference_blocks() - Ask a ref_counts object to save all its + * dirty blocks asynchronously. + * @ref_counts: The ref_counts object to notify. + */ void vdo_save_dirty_reference_blocks(struct ref_counts *ref_counts) { notify_all_waiters(&ref_counts->dirty_blocks, launch_reference_block_write, ref_counts); - check_if_vdo_slab_drained(ref_counts->slab); + vdo_check_if_slab_drained(ref_counts->slab); } -/**********************************************************************/ +/** + * vdo_dirty_all_reference_blocks() - Mark all reference count blocks as + * dirty. + * @ref_counts: The ref_counts of the reference blocks. + */ void vdo_dirty_all_reference_blocks(struct ref_counts *ref_counts) { block_count_t i; + for (i = 0; i < ref_counts->reference_block_count; i++) { dirty_block(&ref_counts->blocks[i]); } } /** - * Clear the provisional reference counts from a reference block. - * - * @param block The block to clear - **/ + * clear_provisional_references() - Clear the provisional reference counts + * from a reference block. + * @block: The block to clear. 
+ */ static void clear_provisional_references(struct reference_block *block) { vdo_refcount_t *counters = vdo_get_reference_counters_for_block(block); block_count_t j; + for (j = 0; j < COUNTS_PER_BLOCK; j++) { if (counters[j] == PROVISIONAL_REFERENCE_COUNT) { counters[j] = EMPTY_REFERENCE_COUNT; @@ -1290,11 +1435,11 @@ static void clear_provisional_references(struct reference_block *block) } /** - * Unpack reference counts blocks into the internal memory structure. - * - * @param packed The written reference block to be unpacked - * @param block The internal reference block to be loaded - **/ + * unpack_reference_block() - Unpack reference counts blocks into the internal + * memory structure. + * @packed: The written reference block to be unpacked. + * @block: The internal reference block to be loaded. + */ static void unpack_reference_block(struct packed_reference_block *packed, struct reference_block *block) { @@ -1302,25 +1447,30 @@ static void unpack_reference_block(struct packed_reference_block *packed, sector_count_t i; struct ref_counts *ref_counts = block->ref_counts; vdo_refcount_t *counters = vdo_get_reference_counters_for_block(block); + for (i = 0; i < VDO_SECTORS_PER_BLOCK; i++) { struct packed_reference_sector *sector = &packed->sectors[i]; - unpack_vdo_journal_point(&sector->commit_point, + + vdo_unpack_journal_point(&sector->commit_point, &block->commit_points[i]); memcpy(counters + (i * COUNTS_PER_SECTOR), sector->counts, (sizeof(vdo_refcount_t) * COUNTS_PER_SECTOR)); - // The slab_journal_point must be the latest point found in any - // sector. - if (before_vdo_journal_point(&ref_counts->slab_journal_point, + /* + * The slab_journal_point must be the latest point found in any + * sector. + */ + if (vdo_before_journal_point(&ref_counts->slab_journal_point, &block->commit_points[i])) { ref_counts->slab_journal_point = block->commit_points[i]; } if ((i > 0) && - !are_equivalent_vdo_journal_points(&block->commit_points[0], + !vdo_are_equivalent_journal_points(&block->commit_points[0], &block->commit_points[i])) { size_t block_index = block - block->ref_counts->blocks; + uds_log_warning("Torn write detected in sector %u of reference block %zu of slab %u", i, block_index, @@ -1337,65 +1487,77 @@ static void unpack_reference_block(struct packed_reference_block *packed, } /** - * After a reference block has been read, unpack it. - * - * @param completion The VIO that just finished reading - **/ + * finish_reference_block_load() - After a reference block has been read, + * unpack it. + * @completion: The VIO that just finished reading.
+ */ static void finish_reference_block_load(struct vdo_completion *completion) { struct vio_pool_entry *entry = completion->parent; struct reference_block *block = entry->parent; struct ref_counts *ref_counts = block->ref_counts; + unpack_reference_block((struct packed_reference_block *)entry->buffer, block); - return_vdo_block_allocator_vio(ref_counts->slab->allocator, entry); + vdo_return_block_allocator_vio(ref_counts->slab->allocator, entry); ref_counts->active_count--; clear_provisional_references(block); ref_counts->free_blocks -= block->allocated_count; - check_if_vdo_slab_drained(block->ref_counts->slab); + vdo_check_if_slab_drained(block->ref_counts->slab); +} + +static void load_reference_block_endio(struct bio *bio) +{ + struct vio *vio = bio->bi_private; + struct vio_pool_entry *entry = vio->completion.parent; + struct reference_block *block = entry->parent; + thread_id_t thread_id = block->ref_counts->slab->allocator->thread_id; + + continue_vio_after_io(vio, finish_reference_block_load, thread_id); } /** - * After a block waiter has gotten a VIO from the VIO pool, load the block. - * - * @param block_waiter The waiter of the block to load - * @param vio_context The VIO returned by the pool - **/ + * load_reference_block() - After a block waiter has gotten a VIO from the VIO + * pool, load the block. + * @block_waiter: The waiter of the block to load. + * @vio_context: The VIO returned by the pool. + */ static void load_reference_block(struct waiter *block_waiter, void *vio_context) { struct vio_pool_entry *entry = vio_context; struct reference_block *block = waiter_as_reference_block(block_waiter); size_t block_offset = (block - block->ref_counts->blocks); - physical_block_number_t pbn = (block->ref_counts->origin + block_offset); - entry->parent = block; - entry->vio->completion.callback_thread_id = - block->ref_counts->slab->allocator->thread_id; - launch_read_metadata_vio(entry->vio, pbn, finish_reference_block_load, - handle_io_error); + entry->parent = block; + submit_metadata_vio(entry->vio, + block->ref_counts->origin + block_offset, + load_reference_block_endio, + handle_io_error, + REQ_OP_READ); } /** - * Load reference blocks from the underlying storage into a pre-allocated - * reference counter. - * - * @param ref_counts The reference counter to be loaded - **/ + * load_reference_blocks() - Load reference blocks from the underlying storage + * into a pre-allocated reference counter. + * @ref_counts: The reference counter to be loaded. + */ static void load_reference_blocks(struct ref_counts *ref_counts) { block_count_t i; + ref_counts->free_blocks = ref_counts->block_count; ref_counts->active_count = ref_counts->reference_block_count; for (i = 0; i < ref_counts->reference_block_count; i++) { int result; struct waiter *block_waiter = &ref_counts->blocks[i].waiter; + block_waiter->callback = load_reference_block; - result = acquire_vdo_block_allocator_vio(ref_counts->slab->allocator, + result = vdo_acquire_block_allocator_vio(ref_counts->slab->allocator, block_waiter); if (result != VDO_SUCCESS) { - // This should never happen. + /* This should never happen. */ ref_counts->active_count -= (ref_counts->reference_block_count - i); enter_ref_counts_read_only_mode(ref_counts, result); @@ -1404,13 +1566,20 @@ static void load_reference_blocks(struct ref_counts *ref_counts) } } -/**********************************************************************/ -void drain_vdo_ref_counts(struct ref_counts *ref_counts) +/** + * vdo_drain_ref_counts() - Drain all reference count I/O. 
+ * @ref_counts: The reference counts to drain. + * + * Depending upon the type of drain being performed (as recorded in the + * ref_count's vdo_slab), the reference blocks may be loaded from disk or + * dirty reference blocks may be written out. + */ +void vdo_drain_ref_counts(struct ref_counts *ref_counts) { struct vdo_slab *slab = ref_counts->slab; bool save = false; const struct admin_state_code *state - = get_vdo_admin_state_code(&slab->state); + = vdo_get_admin_state_code(&slab->state); if ((state == VDO_ADMIN_STATE_RECOVERING) || (state == VDO_ADMIN_STATE_SUSPENDING)) { @@ -1426,21 +1595,23 @@ void drain_vdo_ref_counts(struct ref_counts *ref_counts) } else if (state == VDO_ADMIN_STATE_SAVE_FOR_SCRUBBING) { if (!vdo_must_load_ref_counts(slab->allocator->summary, slab->slab_number)) { - // These reference counts were never written, so mark - // them all dirty. + /* + * These reference counts were never written, so mark + * them all dirty. + */ vdo_dirty_all_reference_blocks(ref_counts); } save = true; } else if (state == VDO_ADMIN_STATE_REBUILDING) { - if (should_save_fully_built_vdo_slab(slab)) { + if (vdo_should_save_fully_built_slab(slab)) { vdo_dirty_all_reference_blocks(ref_counts); save = true; } } else if (state == VDO_ADMIN_STATE_SAVING) { - save = !is_unrecovered_vdo_slab(slab); + save = !vdo_is_unrecovered_slab(slab); } else { - notify_vdo_slab_ref_counts_are_drained(slab, VDO_SUCCESS); + vdo_notify_slab_ref_counts_are_drained(slab, VDO_SUCCESS); return; } @@ -1449,23 +1620,32 @@ void drain_vdo_ref_counts(struct ref_counts *ref_counts) } } -/**********************************************************************/ +/** + * vdo_acquire_dirty_block_locks() - Mark all reference count blocks dirty and + * cause them to hold locks on slab journal + * block 1. + * @ref_counts: The ref_counts of the reference blocks. + */ void vdo_acquire_dirty_block_locks(struct ref_counts *ref_counts) { block_count_t i; + vdo_dirty_all_reference_blocks(ref_counts); for (i = 0; i < ref_counts->reference_block_count; i++) { ref_counts->blocks[i].slab_journal_lock = 1; } - adjust_vdo_slab_journal_block_reference(ref_counts->slab->journal, 1, + vdo_adjust_slab_journal_block_reference(ref_counts->slab->journal, 1, ref_counts->reference_block_count); } -/**********************************************************************/ -void dump_vdo_ref_counts(const struct ref_counts *ref_counts) +/** + * vdo_dump_ref_counts() - Dump information about this ref_counts structure. + * @ref_counts: The ref_counts to dump. + */ +void vdo_dump_ref_counts(const struct ref_counts *ref_counts) { - // Terse because there are a lot of slabs to dump and syslog is lossy. + /* Terse because there are a lot of slabs to dump and syslog is lossy. */ uds_log_info(" ref_counts: free=%u/%u blocks=%u dirty=%zu active=%zu journal@(%llu,%u)%s", ref_counts->free_blocks, ref_counts->block_count, diff --git a/vdo/ref-counts.h b/vdo/ref-counts.h new file mode 100644 index 00000000..520a92f0 --- /dev/null +++ b/vdo/ref-counts.h @@ -0,0 +1,193 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* * Copyright Red Hat + */ + +#ifndef REF_COUNTS_H +#define REF_COUNTS_H + +#include "block-allocator.h" +#include "completion.h" +#include "journal-point.h" +#include "packed-reference-block.h" +#include "slab.h" +#include "types.h" +#include "wait-queue.h" + +/* + * Represents the possible status of a block. 
+ */ +enum reference_status { + RS_FREE, /* this block is free */ + RS_SINGLE, /* this block is singly-referenced */ + RS_SHARED, /* this block is shared */ + RS_PROVISIONAL /* this block is provisionally allocated */ +}; + +/* + * Reference_block structure + * + * Blocks are used as a proxy, permitting saves of partial refcounts. + */ +struct reference_block { + /* This block waits on the ref_counts to tell it to write */ + struct waiter waiter; + /* The parent ref_count structure */ + struct ref_counts *ref_counts; + /* The number of references in this block that represent allocations */ + block_size_t allocated_count; + /* The slab journal block on which this block must hold a lock */ + sequence_number_t slab_journal_lock; + /* + * The slab journal block which should be released when this block + * is committed + */ + sequence_number_t slab_journal_lock_to_release; + /* The point up to which each sector is accurate on disk */ + struct journal_point commit_points[VDO_SECTORS_PER_BLOCK]; + /* Whether this block has been modified since it was written to disk */ + bool is_dirty; + /* Whether this block is currently writing */ + bool is_writing; +}; + +/* + * The search_cursor represents the saved position of a free block search. + */ +struct search_cursor { + /* The reference block containing the current search index */ + struct reference_block *block; + /* + * The position at which to start searching for the next free counter + */ + slab_block_number index; + /* + * The position just past the last valid counter in the current block + */ + slab_block_number end_index; + + /* A pointer to the first reference block in the slab */ + struct reference_block *first_block; + /* A pointer to the last reference block in the slab */ + struct reference_block *last_block; +}; + +/* + * ref_counts structure + * + * A reference count is maintained for each physical block number. The vast + * majority of blocks have a very small reference count (usually 0 or 1). + * For references less than or equal to MAXIMUM_REFS (254) the reference count + * is stored in counters[pbn]. 
+ * + */ +struct ref_counts { + /* The slab of this reference block */ + struct vdo_slab *slab; + + /* The size of the counters array */ + uint32_t block_count; + /* The number of free blocks */ + uint32_t free_blocks; + /* The array of reference counts */ + vdo_refcount_t *counters; /* use UDS_ALLOCATE to align data ptr */ + + /* + * The saved block pointer and array indexes for the free block search + */ + struct search_cursor search_cursor; + + /* A list of the dirty blocks waiting to be written out */ + struct wait_queue dirty_blocks; + /* The number of blocks which are currently writing */ + size_t active_count; + + /* A waiter object for updating the slab summary */ + struct waiter slab_summary_waiter; + /* Whether slab summary update is in progress */ + bool updating_slab_summary; + + /* The notifier for read-only mode */ + struct read_only_notifier *read_only_notifier; + /* + * The refcount statistics, shared by all refcounts in our physical + * zone + */ + struct ref_counts_statistics *statistics; + /* The layer PBN for the first struct reference_block */ + physical_block_number_t origin; + /* + * The latest slab journal entry this ref_counts has been updated with + */ + struct journal_point slab_journal_point; + + /* The number of reference count blocks */ + uint32_t reference_block_count; + /* reference count block array */ + struct reference_block blocks[]; +}; + +int __must_check +vdo_make_ref_counts(block_count_t block_count, + struct vdo_slab *slab, + physical_block_number_t origin, + struct read_only_notifier *read_only_notifier, + struct ref_counts **ref_counts_ptr); + +void vdo_free_ref_counts(struct ref_counts *ref_counts); + +bool __must_check vdo_are_ref_counts_active(struct ref_counts *ref_counts); + +void vdo_reset_search_cursor(struct ref_counts *ref_counts); + +block_count_t __must_check +vdo_get_unreferenced_block_count(struct ref_counts *ref_counts); + +uint8_t __must_check +vdo_get_available_references(struct ref_counts *ref_counts, + physical_block_number_t pbn); + +int __must_check +vdo_adjust_reference_count(struct ref_counts *ref_counts, + struct reference_operation operation, + const struct journal_point *slab_journal_point, + bool *free_status_changed); + +int __must_check +vdo_adjust_reference_count_for_rebuild(struct ref_counts *ref_counts, + physical_block_number_t pbn, + enum journal_operation operation); + +int __must_check +vdo_replay_reference_count_change(struct ref_counts *ref_counts, + const struct journal_point *entry_point, + struct slab_journal_entry entry); + +int __must_check +vdo_allocate_unreferenced_block(struct ref_counts *ref_counts, + physical_block_number_t *allocated_ptr); + +int __must_check +vdo_provisionally_reference_block(struct ref_counts *ref_counts, + physical_block_number_t pbn, + struct pbn_lock *lock); + +block_count_t __must_check +vdo_count_unreferenced_blocks(struct ref_counts *ref_counts, + physical_block_number_t start_pbn, + physical_block_number_t end_pbn); + +void vdo_save_several_reference_blocks(struct ref_counts *ref_counts, + size_t flush_divisor); + +void vdo_save_dirty_reference_blocks(struct ref_counts *ref_counts); + +void vdo_dirty_all_reference_blocks(struct ref_counts *ref_counts); + +void vdo_drain_ref_counts(struct ref_counts *ref_counts); + +void vdo_acquire_dirty_block_locks(struct ref_counts *ref_counts); + +void vdo_dump_ref_counts(const struct ref_counts *ref_counts); + + +#endif /* REF_COUNTS_H */ diff --git a/vdo/refCounts.h b/vdo/refCounts.h deleted file mode 100644 index ca556598..00000000 --- 
a/vdo/refCounts.h +++ /dev/null @@ -1,248 +0,0 @@ -/* - * Copyright Red Hat - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/sulfur/src/c++/vdo/base/refCounts.h#10 $ - */ - -#ifndef REF_COUNTS_H -#define REF_COUNTS_H - -#include "completion.h" -#include "journalPoint.h" -#include "slab.h" -#include "types.h" - -/** - * Create a reference counting object. - * - *
A reference counting object can keep a reference count for every physical - * block in the VDO configuration. Since we expect the vast majority of the - * blocks to have 0 or 1 reference counts, the structure is optimized for that - * situation. - * - * @param [in] block_count The number of physical blocks that can be - * referenced - * @param [in] slab The slab of the ref counts object - * @param [in] origin The layer PBN at which to save ref_counts - * @param [in] read_only_notifier The context for tracking read-only mode - * @param [out] ref_counts_ptr The pointer to hold the new ref counts object - * - * @return a success or error code - **/ -int __must_check -make_vdo_ref_counts(block_count_t block_count, - struct vdo_slab *slab, - physical_block_number_t origin, - struct read_only_notifier *read_only_notifier, - struct ref_counts **ref_counts_ptr); - -/** - * Free a reference counting object. - * - * @param ref_counts The object to free - **/ -void free_vdo_ref_counts(struct ref_counts *ref_counts); - -/** - * Check whether a ref_counts is active. - * - * @param ref_counts The ref_counts to check - **/ -bool __must_check are_vdo_ref_counts_active(struct ref_counts *ref_counts); - -/** - * Reset the free block search back to the first reference counter - * in the first reference block. - * - * @param ref_counts The ref_counts object containing the search cursor - **/ -void vdo_reset_search_cursor(struct ref_counts *ref_counts); - -/** - * Get the stored count of the number of blocks that are currently free. - * - * @param ref_counts The ref_counts object - * - * @return the number of blocks with a reference count of zero - **/ -block_count_t __must_check -vdo_get_unreferenced_block_count(struct ref_counts *ref_counts); - -/** - * Determine how many times a reference count can be incremented without - * overflowing. - * - * @param ref_counts The ref_counts object - * @param pbn The physical block number - * - * @return the number of increments that can be performed - **/ -uint8_t __must_check -vdo_get_available_references(struct ref_counts *ref_counts, - physical_block_number_t pbn); - -/** - * Adjust the reference count of a block. - * - * @param [in] ref_counts The refcounts object - * @param [in] operation The operation to perform - * @param [in] slab_journal_point The slab journal entry for this adjustment - * @param [out] free_status_changed A pointer which will be set to true if the - * free status of the block changed - * - * - * @return A success or error code, specifically: - * VDO_REF_COUNT_INVALID if a decrement would result in a negative - * reference count, or an increment in a - * count greater than MAXIMUM_REFS - * - **/ -int __must_check -vdo_adjust_reference_count(struct ref_counts *ref_counts, - struct reference_operation operation, - const struct journal_point *slab_journal_point, - bool *free_status_changed); - -/** - * Adjust the reference count of a block during rebuild. - * - * @param ref_counts The refcounts object - * @param pbn The number of the block to adjust - * @param operation The operation to perform on the count - * - * @return VDO_SUCCESS or an error - **/ -int __must_check -vdo_adjust_reference_count_for_rebuild(struct ref_counts *ref_counts, - physical_block_number_t pbn, - enum journal_operation operation); - -/** - * Replay the reference count adjustment from a slab journal entry into the - * reference count for a block. The adjustment will be ignored if it was - * already recorded in the reference count. 
- * - * @param ref_counts The refcounts object - * @param entry_point The slab journal point for the entry - * @param entry The slab journal entry being replayed - * - * @return VDO_SUCCESS or an error code - **/ -int __must_check -vdo_replay_reference_count_change(struct ref_counts *ref_counts, - const struct journal_point *entry_point, - struct slab_journal_entry entry); - -/** - * Find a block with a reference count of zero in the range of physical block - * numbers tracked by the reference counter. If a free block is found, that - * block is allocated by marking it as provisionally referenced, and the - * allocated block number is returned. - * - * @param [in] ref_counts The reference counters to scan - * @param [out] allocated_ptr A pointer to hold the physical block number of - * the block that was found and allocated - * - * @return VDO_SUCCESS if a free block was found and allocated; - * VDO_NO_SPACE if there are no unreferenced blocks; - * otherwise an error code - **/ -int __must_check -vdo_allocate_unreferenced_block(struct ref_counts *ref_counts, - physical_block_number_t *allocated_ptr); - -/** - * Provisionally reference a block if it is unreferenced. - * - * @param ref_counts The reference counters - * @param pbn The PBN to reference - * @param lock The pbn_lock on the block (may be NULL) - * - * @return VDO_SUCCESS or an error - **/ -int __must_check -vdo_provisionally_reference_block(struct ref_counts *ref_counts, - physical_block_number_t pbn, - struct pbn_lock *lock); - -/** - * Count all unreferenced blocks in a range [start_block, end_block) of - * physical block numbers. - * - * @param ref_counts The reference counters to scan - * @param start_pbn The physical block number at which to start - * scanning (included in the scan) - * @param end_pbn The physical block number at which to stop - * scanning (excluded from the scan) - * - * @return The number of unreferenced blocks - **/ -block_count_t __must_check -vdo_count_unreferenced_blocks(struct ref_counts *ref_counts, - physical_block_number_t start_pbn, - physical_block_number_t end_pbn); - -/** - * Request a ref_counts object save several dirty blocks asynchronously. This - * function currently writes 1 / flush_divisor of the dirty blocks. - * - * @param ref_counts The ref_counts object to notify - * @param flush_divisor The inverse fraction of the dirty blocks to write - **/ -void vdo_save_several_reference_blocks(struct ref_counts *ref_counts, - size_t flush_divisor); - -/** - * Ask a ref_counts object to save all its dirty blocks asynchronously. - * - * @param ref_counts The ref_counts object to notify - **/ -void vdo_save_dirty_reference_blocks(struct ref_counts *ref_counts); - -/** - * Mark all reference count blocks as dirty. - * - * @param ref_counts The ref_counts of the reference blocks - **/ -void vdo_dirty_all_reference_blocks(struct ref_counts *ref_counts); - -/** - * Drain all reference count I/O. Depending upon the type of drain being - * performed (as recorded in the ref_count's vdo_slab), the reference blocks - * may be loaded from disk or dirty reference blocks may be written out. - * - * @param ref_counts The reference counts to drain - **/ -void drain_vdo_ref_counts(struct ref_counts *ref_counts); - -/** - * Mark all reference count blocks dirty and cause them to hold locks on slab - * journal block 1. - * - * @param ref_counts The ref_counts of the reference blocks - **/ -void vdo_acquire_dirty_block_locks(struct ref_counts *ref_counts); - -/** - * Dump information about this ref_counts structure. 
- * - * @param ref_counts The ref_counts to dump - **/ -void dump_vdo_ref_counts(const struct ref_counts *ref_counts); - -#endif // REF_COUNTS_H diff --git a/vdo/refCountsInternals.h b/vdo/refCountsInternals.h deleted file mode 100644 index b76f2e16..00000000 --- a/vdo/refCountsInternals.h +++ /dev/null @@ -1,157 +0,0 @@ -/* - * Copyright Red Hat - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/sulfur/src/c++/vdo/base/refCountsInternals.h#6 $ - */ - -#ifndef REF_COUNTS_INTERNALS_H -#define REF_COUNTS_INTERNALS_H - -#include "refCounts.h" - -#include "journalPoint.h" -#include "packedReferenceBlock.h" -#include "slab.h" -#include "blockAllocatorInternals.h" -#include "waitQueue.h" - -/** - * Represents the possible status of a block. - **/ -enum reference_status { - RS_FREE, // this block is free - RS_SINGLE, // this block is singly-referenced - RS_SHARED, // this block is shared - RS_PROVISIONAL // this block is provisionally allocated -}; - -/* - * Reference_block structure - * - * Blocks are used as a proxy, permitting saves of partial refcounts. - **/ -struct reference_block { - /** This block waits on the ref_counts to tell it to write */ - struct waiter waiter; - /** The parent ref_count structure */ - struct ref_counts *ref_counts; - /** The number of references in this block that represent allocations */ - block_size_t allocated_count; - /** The slab journal block on which this block must hold a lock */ - sequence_number_t slab_journal_lock; - /** - * The slab journal block which should be released when this block - * is committed - **/ - sequence_number_t slab_journal_lock_to_release; - /** The point up to which each sector is accurate on disk */ - struct journal_point commit_points[VDO_SECTORS_PER_BLOCK]; - /** Whether this block has been modified since it was written to disk */ - bool is_dirty; - /** Whether this block is currently writing */ - bool is_writing; -}; - -/** - * The search_cursor represents the saved position of a free block search. - **/ -struct search_cursor { - /** The reference block containing the current search index */ - struct reference_block *block; - /** - * The position at which to start searching for the next free counter - */ - slab_block_number index; - /** - * The position just past the last valid counter in the current block - */ - slab_block_number end_index; - - /** A pointer to the first reference block in the slab */ - struct reference_block *first_block; - /** A pointer to the last reference block in the slab */ - struct reference_block *last_block; -}; - -/* - * ref_counts structure - * - * A reference count is maintained for each physical block number. The vast - * majority of blocks have a very small reference count (usually 0 or 1). 
- * For references less than or equal to MAXIMUM_REFS (254) the reference count - * is stored in counters[pbn]. - * - */ -struct ref_counts { - /** The slab of this reference block */ - struct vdo_slab *slab; - - /** The size of the counters array */ - uint32_t block_count; - /** The number of free blocks */ - uint32_t free_blocks; - /** The array of reference counts */ - vdo_refcount_t *counters; // use UDS_ALLOCATE to align data ptr - - /** - * The saved block pointer and array indexes for the free block search - */ - struct search_cursor search_cursor; - - /** A list of the dirty blocks waiting to be written out */ - struct wait_queue dirty_blocks; - /** The number of blocks which are currently writing */ - size_t active_count; - - /** A waiter object for updating the slab summary */ - struct waiter slab_summary_waiter; - /** Whether slab summary update is in progress */ - bool updating_slab_summary; - - /** The notifier for read-only mode */ - struct read_only_notifier *read_only_notifier; - /** - * The refcount statistics, shared by all refcounts in our physical - * zone - */ - struct ref_counts_statistics *statistics; - /** The layer PBN for the first struct reference_block */ - physical_block_number_t origin; - /** - * The latest slab journal entry this ref_counts has been updated with - */ - struct journal_point slab_journal_point; - - /** The number of reference count blocks */ - uint32_t reference_block_count; - /** reference count block array */ - struct reference_block blocks[]; -}; - -/** - * Convert a generic vdo_completion to a ref_counts object. - * - * @param completion The completion to convert - * - * @return The completion as a ref_counts object - **/ -struct ref_counts * __must_check -as_vdo_ref_counts(struct vdo_completion *completion); - - -#endif // REF_COUNTS_INTERNALS_H diff --git a/vdo/referenceCountRebuild.c b/vdo/reference-count-rebuild.c similarity index 52% rename from vdo/referenceCountRebuild.c rename to vdo/reference-count-rebuild.c index 67c42928..70cdff89 100644 --- a/vdo/referenceCountRebuild.c +++ b/vdo/reference-count-rebuild.c @@ -1,133 +1,115 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * Copyright Red Hat - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. 
- * - * $Id: //eng/vdo-releases/sulfur/src/c++/vdo/base/referenceCountRebuild.c#37 $ */ -#include "referenceCountRebuild.h" +#include "reference-count-rebuild.h" #include "logger.h" -#include "memoryAlloc.h" +#include "memory-alloc.h" #include "permassert.h" -#include "blockMap.h" -#include "blockMapInternals.h" -#include "blockMapPage.h" +#include "block-map.h" +#include "block-map-page.h" #include "forest.h" #include "constants.h" -#include "numUtils.h" -#include "refCounts.h" -#include "slabDepot.h" -#include "vdoInternal.h" -#include "vdoPageCache.h" +#include "num-utils.h" +#include "ref-counts.h" +#include "slab-depot.h" +#include "thread-config.h" +#include "vdo.h" +#include "vdo-page-cache.h" -/** +/* * A reference count rebuild completion. * Note that the page completions kept in this structure are not immediately * freed, so the corresponding pages will be locked down in the page cache * until the rebuild frees them. - **/ + */ struct rebuild_completion { - /** completion header */ + /* completion header */ struct vdo_completion completion; - /** the completion for flushing the block map */ + /* the completion for flushing the block map */ struct vdo_completion sub_task_completion; - /** the thread on which all block map operations must be done */ + /* the thread on which all block map operations must be done */ thread_id_t logical_thread_id; - /** the admin thread */ + /* the admin thread */ thread_id_t admin_thread_id; - /** the block map */ + /* the block map */ struct block_map *block_map; - /** the slab depot */ + /* the slab depot */ struct slab_depot *depot; - /** whether this recovery has been aborted */ + /* whether this recovery has been aborted */ bool aborted; - /** whether we are currently launching the initial round of requests */ + /* whether we are currently launching the initial round of requests */ bool launching; - /** The number of logical blocks observed used */ + /* The number of logical blocks observed used */ block_count_t *logical_blocks_used; - /** The number of block map data blocks */ + /* The number of block map data blocks */ block_count_t *block_map_data_blocks; - /** the next page to fetch */ + /* the next page to fetch */ page_count_t page_to_fetch; - /** the number of leaf pages in the block map */ + /* the number of leaf pages in the block map */ page_count_t leaf_pages; - /** the last slot of the block map */ + /* the last slot of the block map */ struct block_map_slot last_slot; - /** number of pending (non-ready) requests*/ + /* number of pending (non-ready) requests*/ page_count_t outstanding; - /** number of page completions */ + /* number of page completions */ page_count_t page_count; - /** array of requested, potentially ready page completions */ + /* array of requested, potentially ready page completions */ struct vdo_page_completion page_completions[]; }; /** - * Convert a vdo_completion to a rebuild_completion. + * as_rebuild_completion() - Convert a vdo_completion to a rebuild_completion. + * @completion: The completion to convert. * - * @param completion The completion to convert - * - * @return The completion as a rebuild_completion - **/ + * Return: The completion as a rebuild_completion. 
+ */ static inline struct rebuild_completion * __must_check as_rebuild_completion(struct vdo_completion *completion) { - assert_vdo_completion_type(completion->type, + vdo_assert_completion_type(completion->type, VDO_REFERENCE_COUNT_REBUILD_COMPLETION); return container_of(completion, struct rebuild_completion, completion); } /** - * Free the rebuild_completion and notify the parent that the block map - * rebuild is done. This callback is registered in make_rebuild_completion(). + * finish_rebuild() - Free the rebuild_completion and notify the parent that + * the block map rebuild is done. + * @completion: The rebuild_completion. * - * @param completion The rebuild_completion - **/ + * This callback is registered in make_rebuild_completion(). + */ static void finish_rebuild(struct vdo_completion *completion) { int result = completion->result; struct vdo_completion *parent = completion->parent; + UDS_FREE(UDS_FORGET(completion)); - finish_vdo_completion(parent, result); + vdo_finish_completion(parent, result); } /** - * Make a new rebuild completion. + * make_rebuild_completion() - Make a new rebuild completion. + * @vdo: The vdo. + * @logical_blocks_used: A pointer to hold the logical blocks used. + * @block_map_data_blocks: A pointer to hold the number of block map data + * blocks. + * @parent: The parent of the rebuild completion. + * @rebuild_ptr: The new block map rebuild completion. * - * @param [in] vdo The vdo - * @param [in] logical_blocks_used A pointer to hold the logical blocks used - * @param [in] block_map_data_blocks A pointer to hold the number of block map - * data blocks - * @param [in] parent The parent of the rebuild completion - * @param [out] rebuild_ptr The new block map rebuild completion - * - * @return a success or error code - **/ + * Return: A success or error code. 
+ */ static int make_rebuild_completion(struct vdo *vdo, block_count_t *logical_blocks_used, block_count_t *block_map_data_blocks, struct vdo_completion *parent, struct rebuild_completion **rebuild_ptr) { - const struct thread_config *thread_config = get_vdo_thread_config(vdo); - struct block_map *block_map = get_block_map(vdo); page_count_t page_count = - min(get_vdo_configured_cache_size(vdo) >> 1, + min(vdo->device_config->cache_size >> 1, (page_count_t) MAXIMUM_SIMULTANEOUS_VDO_BLOCK_MAP_RESTORATION_READS); struct rebuild_completion *rebuild; @@ -140,32 +122,29 @@ static int make_rebuild_completion(struct vdo *vdo, return result; } - initialize_vdo_completion(&rebuild->completion, vdo, + vdo_initialize_completion(&rebuild->completion, vdo, VDO_REFERENCE_COUNT_REBUILD_COMPLETION); - initialize_vdo_completion(&rebuild->sub_task_completion, vdo, + vdo_initialize_completion(&rebuild->sub_task_completion, vdo, VDO_SUB_TASK_COMPLETION); - if (result != VDO_SUCCESS) { - UDS_FREE(UDS_FORGET(rebuild)); - return result; - } - rebuild->block_map = block_map; + rebuild->block_map = vdo->block_map; rebuild->depot = vdo->depot; rebuild->logical_blocks_used = logical_blocks_used; rebuild->block_map_data_blocks = block_map_data_blocks; rebuild->page_count = page_count; rebuild->leaf_pages = - compute_vdo_block_map_page_count(block_map->entry_count); + vdo_compute_block_map_page_count(vdo->block_map->entry_count); - rebuild->logical_thread_id = vdo_get_logical_zone_thread(thread_config, 0); - rebuild->admin_thread_id = thread_config->admin_thread; + rebuild->logical_thread_id = + vdo_get_logical_zone_thread(vdo->thread_config, 0); + rebuild->admin_thread_id = vdo->thread_config->admin_thread; ASSERT_LOG_ONLY((vdo_get_callback_thread_id() == rebuild->logical_thread_id), "%s must be called on logical thread %u (not %u)", __func__, rebuild->logical_thread_id, vdo_get_callback_thread_id()); - prepare_vdo_completion(&rebuild->completion, finish_rebuild, + vdo_prepare_completion(&rebuild->completion, finish_rebuild, finish_rebuild, rebuild->logical_thread_id, parent); @@ -174,28 +153,29 @@ static int make_rebuild_completion(struct vdo *vdo, } /** - * Flush the block map now that all the reference counts are rebuilt. This - * callback is registered in finish_if_done(). + * flush_block_map_updates() - Flush the block map now that all the reference + * counts are rebuilt. + * @completion: The sub-task completion. * - * @param completion The sub-task completion - **/ + * This callback is registered in finish_if_done(). + */ static void flush_block_map_updates(struct vdo_completion *completion) { uds_log_info("Flushing block map changes"); - prepare_vdo_completion_to_finish_parent(completion, completion->parent); - drain_vdo_block_map(as_rebuild_completion(completion->parent)->block_map, + vdo_prepare_completion_to_finish_parent(completion, completion->parent); + vdo_drain_block_map(as_rebuild_completion(completion->parent)->block_map, VDO_ADMIN_STATE_RECOVERING, completion); } /** - * Check whether the rebuild is done. If it succeeded, continue by flushing the - * block map. + * finish_if_done() - Check whether the rebuild is done. + * @rebuild: The rebuild completion. * - * @param rebuild The rebuild completion + * If it succeeded, continues by flushing the block map. * - * @return true if the rebuild is complete - **/ + * Return: true if the rebuild is complete. 
+ */ static bool finish_if_done(struct rebuild_completion *rebuild) { if (rebuild->launching || (rebuild->outstanding > 0)) { @@ -203,7 +183,7 @@ static bool finish_if_done(struct rebuild_completion *rebuild) } if (rebuild->aborted) { - complete_vdo_completion(&rebuild->completion); + vdo_complete_completion(&rebuild->completion); return true; } @@ -211,93 +191,95 @@ static bool finish_if_done(struct rebuild_completion *rebuild) return false; } - prepare_vdo_completion(&rebuild->sub_task_completion, + vdo_prepare_completion(&rebuild->sub_task_completion, flush_block_map_updates, - finish_vdo_completion_parent_callback, + vdo_finish_completion_parent_callback, rebuild->admin_thread_id, &rebuild->completion); - invoke_vdo_completion_callback(&rebuild->sub_task_completion); + vdo_invoke_completion_callback(&rebuild->sub_task_completion); return true; } /** - * Record that there has been an error during the rebuild. - * - * @param rebuild The rebuild completion - * @param result The error result to use, if one is not already saved - **/ + * abort_rebuild() - Record that there has been an error during the rebuild. + * @rebuild: The rebuild completion. + * @result: The error result to use, if one is not already saved. + */ static void abort_rebuild(struct rebuild_completion *rebuild, int result) { rebuild->aborted = true; - set_vdo_completion_result(&rebuild->completion, result); + vdo_set_completion_result(&rebuild->completion, result); } /** - * Handle an error loading a page. - * - * @param completion The vdo_page_completion - **/ + * handle_page_load_error() - Handle an error loading a page. + * @completion: The vdo_page_completion. + */ static void handle_page_load_error(struct vdo_completion *completion) { struct rebuild_completion *rebuild = as_rebuild_completion(completion->parent); rebuild->outstanding--; abort_rebuild(rebuild, completion->result); - release_vdo_page_completion(completion); + vdo_release_page_completion(completion); finish_if_done(rebuild); } /** - * Rebuild reference counts from a block map page. + * rebuild_reference_counts_from_page() - Rebuild reference counts from a + * block map page. + * @rebuild: The rebuild completion. + * @completion: The page completion holding the page. * - * @param rebuild The rebuild completion - * @param completion The page completion holding the page - * - * @return VDO_SUCCESS or an error - **/ + * Return: VDO_SUCCESS or an error. + */ static int rebuild_reference_counts_from_page(struct rebuild_completion *rebuild, struct vdo_completion *completion) { slot_number_t slot; - struct block_map_page *page = dereference_writable_vdo_page(completion); + struct block_map_page *page = vdo_dereference_writable_page(completion); int result = ASSERT(page != NULL, "page available"); + if (result != VDO_SUCCESS) { return result; } - if (!is_vdo_block_map_page_initialized(page)) { + if (!vdo_is_block_map_page_initialized(page)) { return VDO_SUCCESS; } - // Remove any bogus entries which exist beyond the end of the logical - // space. - if (get_vdo_block_map_page_pbn(page) == rebuild->last_slot.pbn) { + /* + * Remove any bogus entries which exist beyond the end of the logical + * space. 
+ */ + if (vdo_get_block_map_page_pbn(page) == rebuild->last_slot.pbn) { slot_number_t slot; + for (slot = rebuild->last_slot.slot; slot < VDO_BLOCK_MAP_ENTRIES_PER_PAGE; slot++) { struct data_location mapping = - unpack_vdo_block_map_entry(&page->entries[slot]); + vdo_unpack_block_map_entry(&page->entries[slot]); if (vdo_is_mapped_location(&mapping)) { page->entries[slot] = - pack_vdo_pbn(VDO_ZERO_BLOCK, + vdo_pack_pbn(VDO_ZERO_BLOCK, VDO_MAPPING_STATE_UNMAPPED); - request_vdo_page_write(completion); + vdo_request_page_write(completion); } } } - // Inform the slab depot of all entries on this page. + /* Inform the slab depot of all entries on this page. */ for (slot = 0; slot < VDO_BLOCK_MAP_ENTRIES_PER_PAGE; slot++) { struct vdo_slab *slab; int result; struct data_location mapping = - unpack_vdo_block_map_entry(&page->entries[slot]); + vdo_unpack_block_map_entry(&page->entries[slot]); if (!vdo_is_valid_location(&mapping)) { - // This entry is invalid, so remove it from the page. - page->entries[slot] = pack_vdo_pbn(VDO_ZERO_BLOCK, + /* This entry is invalid, so remove it from the page. */ + page->entries[slot] = vdo_pack_pbn(VDO_ZERO_BLOCK, VDO_MAPPING_STATE_UNMAPPED); - request_vdo_page_write(completion); + vdo_request_page_write(completion); continue; } @@ -311,42 +293,43 @@ rebuild_reference_counts_from_page(struct rebuild_completion *rebuild, } if (!vdo_is_physical_data_block(rebuild->depot, mapping.pbn)) { - // This is a nonsense mapping. Remove it from the map so - // we're at least consistent and mark the page dirty. - page->entries[slot] = pack_vdo_pbn(VDO_ZERO_BLOCK, + /* + * This is a nonsense mapping. Remove it from the map so + * we're at least consistent and mark the page dirty. + */ + page->entries[slot] = vdo_pack_pbn(VDO_ZERO_BLOCK, VDO_MAPPING_STATE_UNMAPPED); - request_vdo_page_write(completion); + vdo_request_page_write(completion); continue; } - slab = get_vdo_slab(rebuild->depot, mapping.pbn); + slab = vdo_get_slab(rebuild->depot, mapping.pbn); result = vdo_adjust_reference_count_for_rebuild(slab->reference_counts, mapping.pbn, - DATA_INCREMENT); + VDO_JOURNAL_DATA_INCREMENT); if (result != VDO_SUCCESS) { uds_log_error_strerror(result, "Could not adjust reference count for PBN %llu, slot %u mapped to PBN %llu", - (unsigned long long) get_vdo_block_map_page_pbn(page), + (unsigned long long) vdo_get_block_map_page_pbn(page), slot, (unsigned long long) mapping.pbn); - page->entries[slot] = pack_vdo_pbn(VDO_ZERO_BLOCK, + page->entries[slot] = vdo_pack_pbn(VDO_ZERO_BLOCK, VDO_MAPPING_STATE_UNMAPPED); - request_vdo_page_write(completion); + vdo_request_page_write(completion); } } return VDO_SUCCESS; } -/**********************************************************************/ static void fetch_page(struct rebuild_completion *rebuild, struct vdo_completion *completion); /** - * Process a page which has just been loaded. This callback is registered by - * fetch_page(). + * page_loaded() - Process a page which has just been loaded. + * @completion: The vdo_page_completion for the fetched page. * - * @param completion The vdo_page_completion for the fetched page - **/ + * This callback is registered by fetch_page(). 
+ */ static void page_loaded(struct vdo_completion *completion) { int result; @@ -360,22 +343,23 @@ static void page_loaded(struct vdo_completion *completion) abort_rebuild(rebuild, result); } - release_vdo_page_completion(completion); + vdo_release_page_completion(completion); if (finish_if_done(rebuild)) { return; } - // Advance progress to the next page, and fetch the next page we - // haven't yet requested. + /* + * Advance progress to the next page, and fetch the next page we + * haven't yet requested. + */ fetch_page(rebuild, completion); } /** - * Fetch a page from the block map. - * - * @param rebuild the rebuild_completion - * @param completion the page completion to use - **/ + * fetch_page() - Fetch a page from the block map. + * @rebuild: The rebuild_completion. + * @completion: The page completion to use. + */ static void fetch_page(struct rebuild_completion *rebuild, struct vdo_completion *completion) { @@ -395,25 +379,27 @@ static void fetch_page(struct rebuild_completion *rebuild, continue; } - init_vdo_page_completion(((struct vdo_page_completion *) completion), + vdo_init_page_completion(((struct vdo_page_completion *) completion), rebuild->block_map->zones[0].page_cache, pbn, true, &rebuild->completion, page_loaded, handle_page_load_error); rebuild->outstanding++; - get_vdo_page(completion); + vdo_get_page(completion); return; } } /** - * Rebuild reference counts from the leaf block map pages now that reference + * rebuild_from_leaves() - Rebuild reference counts from the leaf block map + * pages. + * @completion: The sub-task completion. + * + * Rebuilds reference counts from the leaf block map pages now that reference * counts have been rebuilt from the interior tree pages (which have been * loaded in the process). This callback is registered in - * rebuild_vdo_reference_counts(). - * - * @param completion The sub-task completion - **/ + * vdo_rebuild_reference_counts(). + */ static void rebuild_from_leaves(struct vdo_completion *completion) { page_count_t i; @@ -421,8 +407,10 @@ static void rebuild_from_leaves(struct vdo_completion *completion) as_rebuild_completion(completion->parent); *rebuild->logical_blocks_used = 0; - // The PBN calculation doesn't work until the tree pages have been - // loaded, so we can't set this value at the start of rebuild. + /* + * The PBN calculation doesn't work until the tree pages have been + * loaded, so we can't set this value at the start of rebuild. + */ rebuild->last_slot = (struct block_map_slot){ .slot = rebuild->block_map->entry_count % VDO_BLOCK_MAP_ENTRIES_PER_PAGE, @@ -430,8 +418,10 @@ static void rebuild_from_leaves(struct vdo_completion *completion) rebuild->leaf_pages - 1), }; - // Prevent any page from being processed until all pages have been - // launched. + /* + * Prevent any page from being processed until all pages have been + * launched. + */ rebuild->launching = true; for (i = 0; i < rebuild->page_count; i++) { fetch_page(rebuild, &rebuild->page_completions[i].completion); @@ -441,15 +431,14 @@ static void rebuild_from_leaves(struct vdo_completion *completion) } /** - * Process a single entry from the block map tree. + * process_entry() - Process a single entry from the block map tree. + * @pbn: A pbn which holds a block map tree page. + * @completion: The parent completion of the traversal. * - *
Implements vdo_entry_callback. + * Implements vdo_entry_callback. * - * @param pbn A pbn which holds a block map tree page - * @param completion The parent completion of the traversal - * - * @return VDO_SUCCESS or an error - **/ + * Return: VDO_SUCCESS or an error. + */ static int process_entry(physical_block_number_t pbn, struct vdo_completion *completion) { @@ -465,10 +454,10 @@ static int process_entry(physical_block_number_t pbn, (unsigned long long) pbn); } - slab = get_vdo_slab(rebuild->depot, pbn); + slab = vdo_get_slab(rebuild->depot, pbn); result = vdo_adjust_reference_count_for_rebuild(slab->reference_counts, - pbn, - BLOCK_MAP_INCREMENT); + pbn, + VDO_JOURNAL_BLOCK_MAP_INCREMENT); if (result != VDO_SUCCESS) { return uds_log_error_strerror(result, "Could not adjust reference count for block map tree PBN %llu", @@ -479,8 +468,16 @@ static int process_entry(physical_block_number_t pbn, return VDO_SUCCESS; } -/**********************************************************************/ -void rebuild_vdo_reference_counts(struct vdo *vdo, +/** + * vdo_rebuild_reference_counts() - Rebuild the reference counts from the + * block map (read-only rebuild). + * @vdo: The vdo. + * @parent: The completion to notify when the rebuild is complete + * @logical_blocks_used: A pointer to hold the logical blocks used. + * @block_map_data_blocks: A pointer to hold the number of block map + * data blocks. + */ +void vdo_rebuild_reference_counts(struct vdo *vdo, struct vdo_completion *parent, block_count_t *logical_blocks_used, block_count_t *block_map_data_blocks) @@ -492,25 +489,27 @@ void rebuild_vdo_reference_counts(struct vdo *vdo, block_map_data_blocks, parent, &rebuild); if (result != VDO_SUCCESS) { - finish_vdo_completion(parent, result); + vdo_finish_completion(parent, result); return; } - // Completion chaining from page cache hits can lead to stack overflow - // during the rebuild, so clear out the cache before this rebuild phase. + /* + * Completion chaining from page cache hits can lead to stack overflow + * during the rebuild, so clear out the cache before this rebuild phase. + */ result = - invalidate_vdo_page_cache(rebuild->block_map->zones[0].page_cache); + vdo_invalidate_page_cache(rebuild->block_map->zones[0].page_cache); if (result != VDO_SUCCESS) { - finish_vdo_completion(parent, result); + vdo_finish_completion(parent, result); return; } - // First traverse the block map trees. + /* First traverse the block map trees. 
*/ *rebuild->block_map_data_blocks = 0; completion = &rebuild->sub_task_completion; - prepare_vdo_completion(completion, rebuild_from_leaves, - finish_vdo_completion_parent_callback, + vdo_prepare_completion(completion, rebuild_from_leaves, + vdo_finish_completion_parent_callback, rebuild->logical_thread_id, &rebuild->completion); - traverse_vdo_forest(rebuild->block_map, process_entry, completion); + vdo_traverse_forest(rebuild->block_map, process_entry, completion); } diff --git a/vdo/reference-count-rebuild.h b/vdo/reference-count-rebuild.h new file mode 100644 index 00000000..31109209 --- /dev/null +++ b/vdo/reference-count-rebuild.h @@ -0,0 +1,17 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright Red Hat + */ + +#ifndef REFERENCE_COUNT_REBUILD_H +#define REFERENCE_COUNT_REBUILD_H + +#include "kernel-types.h" +#include "types.h" + +void vdo_rebuild_reference_counts(struct vdo *vdo, + struct vdo_completion *parent, + block_count_t *logical_blocks_used, + block_count_t *block_map_data_blocks); + +#endif /* REFERENCE_COUNT_REBUILD_H */ diff --git a/vdo/reference-operation.c b/vdo/reference-operation.c new file mode 100644 index 00000000..334ada32 --- /dev/null +++ b/vdo/reference-operation.c @@ -0,0 +1,75 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright Red Hat + */ + +#include "reference-operation.h" + +#include "physical-zone.h" +#include "types.h" + +static struct pbn_lock *return_pbn_lock(struct reference_operation operation) +{ + return (struct pbn_lock *) operation.context; +} + +/** + * vdo_set_up_reference_operation_with_lock() - Set up a reference_operation + * for which we already have the + * lock. + * @type: The type of operation. + * @pbn: The PBN of the block on which to operate. + * @state: The mapping state of the block on which to operate. + * @lock: The pbn_lock to associate with the operation. + * @operation: The reference_operation to set up. + */ +void +vdo_set_up_reference_operation_with_lock(enum journal_operation type, + physical_block_number_t pbn, + enum block_mapping_state state, + struct pbn_lock *lock, + struct reference_operation *operation) +{ + *operation = (struct reference_operation) { + .type = type, + .pbn = pbn, + .state = state, + .lock_getter = return_pbn_lock, + .context = lock, + }; +} + +static struct pbn_lock *look_up_pbn_lock(struct reference_operation operation) +{ + return ((operation.context == NULL) + ? NULL + : vdo_get_physical_zone_pbn_lock(operation.context, + operation.pbn)); +} + +/** + * vdo_set_up_reference_operation_with_zone() - Set up a reference_operation + * for which we will need to look + * up the lock later. + * @type: The type of operation. + * @pbn: The PBN of the block on which to operate. + * @state: The mapping state of the block on which to operate. + * @zone: The physical_zone from which the pbn_lock can be retrieved when + * needed. + * @operation: The reference_operation to set up. 
+ */ +void +vdo_set_up_reference_operation_with_zone(enum journal_operation type, + physical_block_number_t pbn, + enum block_mapping_state state, + struct physical_zone *zone, + struct reference_operation *operation) +{ + *operation = (struct reference_operation) { + .type = type, + .pbn = pbn, + .state = state, + .lock_getter = look_up_pbn_lock, + .context = zone, + }; +} diff --git a/vdo/reference-operation.h b/vdo/reference-operation.h new file mode 100644 index 00000000..a2defefe --- /dev/null +++ b/vdo/reference-operation.h @@ -0,0 +1,73 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright Red Hat + */ + +#ifndef REFERENCE_OPERATION_H +#define REFERENCE_OPERATION_H + +#include "kernel-types.h" +#include "types.h" + +struct reference_operation; + +/** + * typedef pbn_lock_getter - Get the pbn_lock associated with a + * reference_operation. + * @operation: The reference_operation. + * + * Return: The pbn_lock on the block of a reference_operation or NULL if there + * isn't one. + */ +typedef struct pbn_lock *pbn_lock_getter(struct reference_operation operation); + +/* + * The current operation on a physical block (from the point of view of the + * data_vio doing the operation) + */ +struct reference_operation { + /* The operation being performed */ + enum journal_operation type; + /* The PBN of the block being operated on */ + physical_block_number_t pbn; + /* The mapping state of the block being operated on */ + enum block_mapping_state state; + /* + * A function to use to get any pbn_lock associated with this operation + */ + pbn_lock_getter *lock_getter; + /* The context to pass to the pbn_lock_getter */ + void *context; +}; + +/** + * vdo_get_reference_operation_pbn_lock() - Get the pbn_lock associated with + * the current reference_operation. + * @operation: The reference operation. + * + * Return: The pbn_lock on the block of the current operation or NULL if there + * isn't one. + */ +static inline struct pbn_lock * __must_check +vdo_get_reference_operation_pbn_lock(struct reference_operation operation) +{ + return ((operation.lock_getter == NULL) + ? NULL + : operation.lock_getter(operation)); +} + +void +vdo_set_up_reference_operation_with_lock(enum journal_operation type, + physical_block_number_t pbn, + enum block_mapping_state state, + struct pbn_lock *lock, + struct reference_operation *operation); + +void +vdo_set_up_reference_operation_with_zone(enum journal_operation type, + physical_block_number_t pbn, + enum block_mapping_state state, + struct physical_zone *zone, + struct reference_operation *operation); + +#endif /* REFERENCE_OPERATION_H */ diff --git a/vdo/referenceCountRebuild.h b/vdo/referenceCountRebuild.h deleted file mode 100644 index 45317bc8..00000000 --- a/vdo/referenceCountRebuild.h +++ /dev/null @@ -1,42 +0,0 @@ -/* - * Copyright Red Hat - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. 
- * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/sulfur/src/c++/vdo/base/referenceCountRebuild.h#8 $ - */ - -#ifndef REFERENCE_COUNT_REBUILD_H -#define REFERENCE_COUNT_REBUILD_H - -#include "types.h" - -/** - * Rebuild the reference counts from the block map (read-only rebuild). - * - * @param [in] vdo The vdo - * @param [in] parent The completion to notify when the - * rebuild is complete - * @param [out] logical_blocks_used A pointer to hold the logical blocks used - * @param [out] block_map_data_blocks A pointer to hold the number of block map - * data blocks - **/ -void rebuild_vdo_reference_counts(struct vdo *vdo, - struct vdo_completion *parent, - block_count_t *logical_blocks_used, - block_count_t *block_map_data_blocks); - -#endif // REFERENCE_COUNT_REBUILD_H diff --git a/vdo/referenceOperation.c b/vdo/referenceOperation.c deleted file mode 100644 index 83fa0adc..00000000 --- a/vdo/referenceOperation.c +++ /dev/null @@ -1,74 +0,0 @@ -/* - * Copyright Red Hat - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/sulfur/src/c++/vdo/base/referenceOperation.c#8 $ - */ - -#include "referenceOperation.h" - -#include "physicalZone.h" -#include "types.h" - -/**********************************************************************/ -static struct pbn_lock *return_pbn_lock(struct reference_operation operation) -{ - return (struct pbn_lock *) operation.context; -} - -/**********************************************************************/ -void -set_up_vdo_reference_operation_with_lock(enum journal_operation type, - physical_block_number_t pbn, - enum block_mapping_state state, - struct pbn_lock *lock, - struct reference_operation *operation) -{ - *operation = (struct reference_operation) { - .type = type, - .pbn = pbn, - .state = state, - .lock_getter = return_pbn_lock, - .context = lock, - }; -} - -/**********************************************************************/ -static struct pbn_lock *look_up_pbn_lock(struct reference_operation operation) -{ - return ((operation.context == NULL) - ? 
NULL - : get_vdo_physical_zone_pbn_lock(operation.context, - operation.pbn)); -} - -/**********************************************************************/ -void -set_up_vdo_reference_operation_with_zone(enum journal_operation type, - physical_block_number_t pbn, - enum block_mapping_state state, - struct physical_zone *zone, - struct reference_operation *operation) -{ - *operation = (struct reference_operation) { - .type = type, - .pbn = pbn, - .state = state, - .lock_getter = look_up_pbn_lock, - .context = zone, - }; -} diff --git a/vdo/referenceOperation.h b/vdo/referenceOperation.h deleted file mode 100644 index caf090ba..00000000 --- a/vdo/referenceOperation.h +++ /dev/null @@ -1,108 +0,0 @@ -/* - * Copyright Red Hat - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/sulfur/src/c++/vdo/base/referenceOperation.h#9 $ - */ - -#ifndef REFERENCE_OPERATION_H -#define REFERENCE_OPERATION_H - -#include "types.h" - -struct reference_operation; - -/** - * Get the pbn_lock associated with a reference_operation. - * - * @param operation The reference_operation - * - * @return The pbn_lock on the block of a reference_operation or NULL if there - * isn't one - **/ -typedef struct pbn_lock *pbn_lock_getter(struct reference_operation operation); - -/** - * The current operation on a physical block (from the point of view of the - * data_vio doing the operation) - **/ -struct reference_operation { - /** The operation being performed */ - enum journal_operation type; - /** The PBN of the block being operated on */ - physical_block_number_t pbn; - /** The mapping state of the block being operated on */ - enum block_mapping_state state; - /** - * A function to use to get any pbn_lock associated with this operation - */ - pbn_lock_getter *lock_getter; - /** The context to pass to the pbn_lock_getter */ - void *context; -}; - -/** - * Get the pbn_lock associated with the current reference_operation. - * - * @param operation The reference operation - * - * @return The pbn_lock on the block of the current operation or NULL if there - * isn't one - **/ -static inline struct pbn_lock * __must_check -get_vdo_reference_operation_pbn_lock(struct reference_operation operation) -{ - return ((operation.lock_getter == NULL) - ? NULL - : operation.lock_getter(operation)); -} - -/** - * Set up a reference_operation for which we already have the lock. 
- * - * @param type The type of operation - * @param pbn The PBN of the block on which to operate - * @param state The mapping state of the block on which to operate - * @param lock The pbn_lock to associate with the operation - * @param operation The reference_operation to set up - **/ -void -set_up_vdo_reference_operation_with_lock(enum journal_operation type, - physical_block_number_t pbn, - enum block_mapping_state state, - struct pbn_lock *lock, - struct reference_operation *operation); - -/** - * Set up a reference_operation for which we will need to look up the lock - *later. - * - * @param type The type of operation - * @param pbn The PBN of the block on which to operate - * @param state The mapping state of the block on which to operate - * @param zone The physical_zone from which the pbn_lock can be retrieved - * when needed - * @param operation The reference_operation to set up - **/ -void -set_up_vdo_reference_operation_with_zone(enum journal_operation type, - physical_block_number_t pbn, - enum block_mapping_state state, - struct physical_zone *zone, - struct reference_operation *operation); - -#endif // REFERENCE_OPERATION_H diff --git a/vdo/releaseVersions.h b/vdo/release-versions.h similarity index 96% rename from vdo/releaseVersions.h rename to vdo/release-versions.h index 325108fa..1df8e8a2 100644 --- a/vdo/releaseVersions.h +++ b/vdo/release-versions.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ /* * Copyright Red Hat * diff --git a/uds/requestQueueKernel.c b/vdo/request-queue.c similarity index 74% rename from uds/requestQueueKernel.c rename to vdo/request-queue.c index 2c56754e..cac849ef 100644 --- a/uds/requestQueueKernel.c +++ b/vdo/request-queue.c @@ -1,35 +1,18 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * Copyright Red Hat - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/uds-releases/krusty/kernelLinux/uds/requestQueueKernel.c#29 $ */ -#include "requestQueue.h" +#include "request-queue.h" #include #include #include "compiler.h" +#include "funnel-queue.h" #include "logger.h" -#include "request.h" -#include "memoryAlloc.h" +#include "memory-alloc.h" #include "uds-threads.h" -#include "util/funnelQueue.h" /* * Ordering: @@ -76,8 +59,8 @@ enum { * that have been processed since the worker thread last woke up. **/ enum { - MINIMUM_BATCH = 32, // wait time increases if batch smaller than this - MAXIMUM_BATCH = 64 // wait time decreases if batch larger than this + MINIMUM_BATCH = 32, /* wait time increases if batch smaller than this */ + MAXIMUM_BATCH = 64 /* wait time decreases if batch larger than this */ }; struct uds_request_queue { @@ -99,7 +82,6 @@ struct uds_request_queue { atomic_t dormant; }; -/**********************************************************************/ /** * Poll the underlying lock-free queues for a request to process. 
Must only be * called by the worker thread. @@ -110,24 +92,23 @@ struct uds_request_queue { **/ static INLINE struct uds_request *poll_queues(struct uds_request_queue *queue) { - // The retry queue has higher priority. + /* The retry queue has higher priority. */ struct funnel_queue_entry *entry = funnel_queue_poll(queue->retry_queue); if (entry != NULL) { return container_of(entry, struct uds_request, request_queue_link); } - // The main queue has lower priority. + /* The main queue has lower priority. */ entry = funnel_queue_poll(queue->main_queue); if (entry != NULL) { return container_of(entry, struct uds_request, request_queue_link); } - // No entry found. + /* No entry found. */ return NULL; } -/**********************************************************************/ /** * Check if the underlying lock-free queues appear not just not to have any * requests available right now, but also not to be in the intermediate state @@ -143,7 +124,6 @@ static INLINE bool are_queues_idle(struct uds_request_queue *queue) is_funnel_queue_idle(queue->main_queue)); } -/**********************************************************************/ /** * Remove the next request to be processed from the queue. Must only be called * by the worker thread. @@ -160,27 +140,27 @@ static INLINE bool dequeue_request(struct uds_request_queue *queue, struct uds_request **request_ptr, bool *waited_ptr) { - // Because of batching, we expect this to be the most common code path. + /* Because of batching, we expect this to be the most common code path. */ struct uds_request *request = poll_queues(queue); + if (request != NULL) { - // Return because we found a request + /* Return because we found a request */ *request_ptr = request; return true; } if (!READ_ONCE(queue->alive)) { - // Return because we see that shutdown is happening + /* Return because we see that shutdown is happening */ *request_ptr = NULL; return true; } - // Return indicating that we need to wait. + /* Return indicating that we need to wait. */ *request_ptr = NULL; *waited_ptr = true; return false; } -/**********************************************************************/ static void request_queue_worker(void *arg) { struct uds_request_queue *queue = (struct uds_request_queue *) arg; @@ -191,6 +171,7 @@ static void request_queue_worker(void *arg) for (;;) { struct uds_request *request; bool waited = false; + if (dormant) { /* * Sleep/wakeup protocol: @@ -229,53 +210,67 @@ static void request_queue_worker(void *arg) } if (likely(request != NULL)) { - // We got a request. + /* We got a request. */ current_batch++; queue->process_one(request); } else if (!READ_ONCE(queue->alive)) { - // We got no request and we know we are shutting down. + /* We got no request and we know we are shutting down. */ break; } if (dormant) { - // We've been roused from dormancy. Clear the flag so - // enqueuers can stop broadcasting (no fence needed for - // this transition). + /* + * We've been roused from dormancy. Clear the flag so + * enqueuers can stop broadcasting (no fence needed for + * this transition). + */ atomic_set(&queue->dormant, false); dormant = false; - // Reset the timeout back to the default since we don't - // know how long we've been asleep and we also want to - // be responsive to a new burst. + /* + * Reset the timeout back to the default since we don't + * know how long we've been asleep and we also want to + * be responsive to a new burst. + */ time_batch = DEFAULT_WAIT_TIME; } else if (waited) { - // We waited for this request to show up. 
Adjust the - // wait time if the last batch of requests was too - // small or too large.. + /* + * We waited for this request to show up. Adjust the + * wait time if the last batch of requests was too + * small or too large.. + */ if (current_batch < MINIMUM_BATCH) { - // Adjust the wait time if the last batch of - // requests was too small. + /* + * Adjust the wait time if the last batch of + * requests was too small. + */ time_batch += time_batch / 4; if (time_batch >= MAXIMUM_WAIT_TIME) { - // The timeout is getting long enough - // that we need to switch into dormant - // mode. + /* + * The timeout is getting long enough + * that we need to switch into dormant + * mode. + */ atomic_set(&queue->dormant, true); dormant = true; } } else if (current_batch > MAXIMUM_BATCH) { - // Adjust the wait time if the last batch of - // requests was too large. + /* + * Adjust the wait time if the last batch of + * requests was too large. + */ time_batch -= time_batch / 4; if (time_batch < MINIMUM_WAIT_TIME) { - // But if the producer is very fast or - // the scheduler doesn't wake up up - // promptly, waiting for very short - // times won't make the batches - // smaller. + /* + * But if the producer is very fast or + * the scheduler doesn't wake up + * promptly, waiting for very short + * times won't make the batches + * smaller. + */ time_batch = MINIMUM_WAIT_TIME; } } - // And we must now start a new batch count + /* And we must now start a new batch count */ current_batch = 0; } } @@ -287,10 +282,13 @@ static void request_queue_worker(void *arg) */ smp_rmb(); - // Process every request that is still in the queue, and never wait for - // any new requests to show up. + /* + * Process every request that is still in the queue, and never wait for + * any new requests to show up. + */ for (;;) { struct uds_request *request = poll_queues(queue); + if (request == NULL) { break; } @@ -298,7 +296,6 @@ static void request_queue_worker(void *arg) } } -/**********************************************************************/ int make_uds_request_queue(const char *queue_name, uds_request_queue_processor_t *process_one, struct uds_request_queue **queue_ptr) @@ -339,21 +336,20 @@ int make_uds_request_queue(const char *queue_name, return UDS_SUCCESS; } -/**********************************************************************/ static INLINE void wake_up_worker(struct uds_request_queue *queue) { - // This is the code sequence recommended in + /* This is the code sequence recommended in */ smp_mb(); if (waitqueue_active(&queue->wqhead)) { wake_up(&queue->wqhead); } } -/**********************************************************************/ void uds_request_queue_enqueue(struct uds_request_queue *queue, struct uds_request *request) { bool unbatched = request->unbatched; + funnel_queue_put(request->requeued ? queue->retry_queue : queue->main_queue, &request->request_queue_link); @@ -368,7 +364,6 @@ void uds_request_queue_enqueue(struct uds_request_queue *queue, } } -/**********************************************************************/ void uds_request_queue_finish(struct uds_request_queue *queue) { if (queue == NULL) { @@ -384,16 +379,18 @@ void uds_request_queue_finish(struct uds_request_queue *queue) */ smp_wmb(); - // Mark the queue as dead. + /* Mark the queue as dead. */ WRITE_ONCE(queue->alive, false); if (queue->started) { int result; - // Wake the worker so it notices that it should exit. + /* Wake the worker so it notices that it should exit. 
*/ wake_up_worker(queue); - // Wait for the worker thread to finish processing any - // additional pending work and exit. + /* + * Wait for the worker thread to finish processing any + * additional pending work and exit. + */ result = uds_join_threads(queue->thread); if (result != UDS_SUCCESS) { uds_log_warning_strerror(result, diff --git a/uds/requestQueue.h b/vdo/request-queue.h similarity index 65% rename from uds/requestQueue.h rename to vdo/request-queue.h index fecf524c..8c4f4415 100644 --- a/uds/requestQueue.h +++ b/vdo/request-queue.h @@ -1,29 +1,13 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ /* * Copyright Red Hat - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/uds-releases/krusty/src/uds/requestQueue.h#10 $ */ #ifndef REQUEST_QUEUE_H #define REQUEST_QUEUE_H #include "compiler.h" -#include "typeDefs.h" +#include "type-defs.h" #include "uds.h" struct uds_request_queue; diff --git a/vdo/slabDepotFormat.c b/vdo/slab-depot-format.c similarity index 68% rename from vdo/slabDepotFormat.c rename to vdo/slab-depot-format.c index aeb5afb4..fbe0bd9d 100644 --- a/vdo/slabDepotFormat.c +++ b/vdo/slab-depot-format.c @@ -1,25 +1,9 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * Copyright Red Hat - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/sulfur/src/c++/vdo/base/slabDepotFormat.c#13 $ */ -#include "slabDepotFormat.h" +#include "slab-depot-format.h" #include "buffer.h" #include "logger.h" @@ -27,10 +11,10 @@ #include "constants.h" #include "header.h" -#include "numUtils.h" -#include "packedReferenceBlock.h" -#include "slabJournalFormat.h" -#include "statusCodes.h" +#include "num-utils.h" +#include "packed-reference-block.h" +#include "slab-journal-format.h" +#include "status-codes.h" #include "types.h" const struct header VDO_SLAB_DEPOT_HEADER_2_0 = { @@ -42,34 +26,47 @@ const struct header VDO_SLAB_DEPOT_HEADER_2_0 = { .size = sizeof(struct slab_depot_state_2_0), }; -/**********************************************************************/ +/** + * vdo_compute_slab_count() - Compute the number of slabs a depot with given + * parameters would have. 
+ * @first_block: PBN of the first data block. + * @last_block: PBN of the last data block. + * @slab_size_shift: Exponent for the number of blocks per slab. + * + * Return: The number of slabs. + */ slab_count_t __must_check -compute_vdo_slab_count(physical_block_number_t first_block, +vdo_compute_slab_count(physical_block_number_t first_block, physical_block_number_t last_block, unsigned int slab_size_shift) { block_count_t data_blocks = last_block - first_block; + return (slab_count_t) (data_blocks >> slab_size_shift); } -/**********************************************************************/ -size_t get_vdo_slab_depot_encoded_size(void) +/** + * vdo_get_slab_depot_encoded_size() - Get the size of the encoded state of a + * slab depot. + * Return: The encoded size of the depot's state. + */ +size_t vdo_get_slab_depot_encoded_size(void) { return VDO_ENCODED_HEADER_SIZE + sizeof(struct slab_depot_state_2_0); } /** - * Encode a slab config into a buffer. + * encode_slab_config() - Encode a slab config into a buffer. + * @config: The config structure to encode. + * @buffer: A buffer positioned at the start of the encoding. * - * @param config The config structure to encode - * @param buffer A buffer positioned at the start of the encoding - * - * @return UDS_SUCCESS or an error code - **/ + * Return: UDS_SUCCESS or an error code. + */ static int encode_slab_config(const struct slab_config *config, struct buffer *buffer) { int result = put_uint64_le_into_buffer(buffer, config->slab_blocks); + if (result != UDS_SUCCESS) { return result; } @@ -90,28 +87,36 @@ static int encode_slab_config(const struct slab_config *config, } result = put_uint64_le_into_buffer(buffer, - config->slab_journal_flushing_threshold); + config->slab_journal_flushing_threshold); if (result != UDS_SUCCESS) { return result; } result = put_uint64_le_into_buffer(buffer, - config->slab_journal_blocking_threshold); + config->slab_journal_blocking_threshold); if (result != UDS_SUCCESS) { return result; } return put_uint64_le_into_buffer(buffer, - config->slab_journal_scrubbing_threshold); + config->slab_journal_scrubbing_threshold); } -/**********************************************************************/ -int encode_vdo_slab_depot_state_2_0(struct slab_depot_state_2_0 state, +/** + * vdo_encode_slab_depot_state_2_0() - Encode the state of a slab depot into a + * buffer. + * @state: The state to encode. + * @buffer: The buffer to encode into. + * + * Return: UDS_SUCCESS or an error. + */ +int vdo_encode_slab_depot_state_2_0(struct slab_depot_state_2_0 state, struct buffer *buffer) { size_t initial_length, encoded_size; - int result = encode_vdo_header(&VDO_SLAB_DEPOT_HEADER_2_0, buffer); + int result = vdo_encode_header(&VDO_SLAB_DEPOT_HEADER_2_0, buffer); + if (result != UDS_SUCCESS) { return result; } @@ -144,18 +149,18 @@ int encode_vdo_slab_depot_state_2_0(struct slab_depot_state_2_0 state, } /** - * Decode a slab config from a buffer. - * - * @param buffer A buffer positioned at the start of the encoding - * @param config The config structure to receive the decoded values + * decode_slab_config() - Decode a slab config from a buffer. + * @buffer: A buffer positioned at the start of the encoding. + * @config: The config structure to receive the decoded values. * - * @return UDS_SUCCESS or an error code - **/ + * Return: UDS_SUCCESS or an error code. 
+ */ static int decode_slab_config(struct buffer *buffer, struct slab_config *config) { block_count_t count; int result = get_uint64_le_from_buffer(buffer, &count); + if (result != UDS_SUCCESS) { return result; } @@ -200,8 +205,15 @@ static int decode_slab_config(struct buffer *buffer, return UDS_SUCCESS; } -/**********************************************************************/ -int decode_vdo_slab_depot_state_2_0(struct buffer *buffer, +/** + * vdo_decode_slab_depot_state_2_0() - Decode slab depot component state + * version 2.0 from a buffer. + * @buffer: A buffer positioned at the start of the encoding. + * @state: The state structure to receive the decoded values. + * + * Return: UDS_SUCCESS or an error code. + */ +int vdo_decode_slab_depot_state_2_0(struct buffer *buffer, struct slab_depot_state_2_0 *state) { struct header header; @@ -211,12 +223,12 @@ int decode_vdo_slab_depot_state_2_0(struct buffer *buffer, physical_block_number_t first_block, last_block; zone_count_t zone_count; - result = decode_vdo_header(buffer, &header); + result = vdo_decode_header(buffer, &header); if (result != VDO_SUCCESS) { return result; } - result = validate_vdo_header(&VDO_SLAB_DEPOT_HEADER_2_0, &header, true, + result = vdo_validate_header(&VDO_SLAB_DEPOT_HEADER_2_0, &header, true, __func__); if (result != VDO_SUCCESS) { return result; @@ -261,8 +273,21 @@ int decode_vdo_slab_depot_state_2_0(struct buffer *buffer, return VDO_SUCCESS; } -/**********************************************************************/ -int configure_vdo_slab_depot(block_count_t block_count, +/** + * vdo_configure_slab_depot() - Configure the slab depot. + * @block_count: The number of blocks in the underlying storage. + * @first_block: The number of the first block that may be allocated. + * @slab_config: The configuration of a single slab. + * @zone_count: The number of zones the depot will use. + * @state: The state structure to be configured. + * + * Configures the slab_depot for the specified storage capacity, finding the + * number of data blocks that will fit and still leave room for the depot + * metadata, then return the saved state for that configuration. + * + * Return: VDO_SUCCESS or an error code. + */ +int vdo_configure_slab_depot(block_count_t block_count, physical_block_number_t first_block, struct slab_config slab_config, zone_count_t zone_count, @@ -273,13 +298,13 @@ int configure_vdo_slab_depot(block_count_t block_count, physical_block_number_t last_block; block_count_t slab_size = slab_config.slab_blocks; - uds_log_debug("slabDepot configure_vdo_slab_depot(block_count=%llu, first_block=%llu, slab_size=%llu, zone_count=%u)", + uds_log_debug("slabDepot vdo_configure_slab_depot(block_count=%llu, first_block=%llu, slab_size=%llu, zone_count=%u)", (unsigned long long) block_count, (unsigned long long) first_block, (unsigned long long) slab_size, zone_count); - // We do not allow runt slabs, so we waste up to a slab's worth. + /* We do not allow runt slabs, so we waste up to a slab's worth. */ slab_count = (block_count / slab_size); if (slab_count == 0) { return VDO_NO_SPACE; @@ -309,8 +334,16 @@ int configure_vdo_slab_depot(block_count_t block_count, return VDO_SUCCESS; } -/**********************************************************************/ -int configure_vdo_slab(block_count_t slab_size, +/** + * vdo_configure_slab() - Measure and initialize the configuration to use for + * each slab. + * @slab_size: The number of blocks per slab. + * @slab_journal_blocks: The number of blocks for the slab journal. 
+ * @slab_config: The slab configuration to initialize. + * + * Return: VDO_SUCCESS or an error code. + */ +int vdo_configure_slab(block_count_t slab_size, block_count_t slab_journal_blocks, struct slab_config *slab_config) { @@ -332,7 +365,7 @@ int configure_vdo_slab(block_count_t slab_size, vdo_get_saved_reference_count_size(slab_size - slab_journal_blocks); meta_blocks = (ref_blocks + slab_journal_blocks); - // Make sure test code hasn't configured slabs to be too small. + /* Make sure test code hasn't configured slabs to be too small. */ if (meta_blocks >= slab_size) { return VDO_BAD_CONFIGURATION; } @@ -352,7 +385,7 @@ int configure_vdo_slab(block_count_t slab_size, */ data_blocks = slab_size - meta_blocks; if ((slab_size < 1024) && !is_power_of_2(data_blocks)) { - data_blocks = ((block_count_t) 1 << log_base_two(data_blocks)); + data_blocks = ((block_count_t) 1 << ilog2(data_blocks)); } /* @@ -362,7 +395,7 @@ int configure_vdo_slab(block_count_t slab_size, */ flushing_threshold = ((slab_journal_blocks * 3) + 3) / 4; /* - * The blocking threshold should be far enough from the the flushing + * The blocking threshold should be far enough from the flushing * threshold to not produce delays, but far enough from the end of the * journal to allow multiple successive recovery failures. */ @@ -394,8 +427,17 @@ int configure_vdo_slab(block_count_t slab_size, return VDO_SUCCESS; } -/**********************************************************************/ +/** + * vdo_get_saved_reference_count_size() - Get the number of blocks required to + * save a reference counts state + * covering the specified number of + * data blocks. + * @block_count: The number of physical data blocks that can be referenced. + * + * Return: The number of blocks required to save reference counts with the + * given block count. 
+ */ block_count_t vdo_get_saved_reference_count_size(block_count_t block_count) { - return compute_bucket_count(block_count, COUNTS_PER_BLOCK); + return DIV_ROUND_UP(block_count, COUNTS_PER_BLOCK); } diff --git a/vdo/slab-depot-format.h b/vdo/slab-depot-format.h new file mode 100644 index 00000000..2d929ae2 --- /dev/null +++ b/vdo/slab-depot-format.h @@ -0,0 +1,51 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright Red Hat + */ + +#ifndef SLAB_DEPOT_FORMAT_H +#define SLAB_DEPOT_FORMAT_H + +#include "buffer.h" + +#include "header.h" +#include "types.h" + +struct slab_depot_state_2_0 { + struct slab_config slab_config; + physical_block_number_t first_block; + physical_block_number_t last_block; + zone_count_t zone_count; +} __packed; + +extern const struct header VDO_SLAB_DEPOT_HEADER_2_0; + +slab_count_t __must_check +vdo_compute_slab_count(physical_block_number_t first_block, + physical_block_number_t last_block, + unsigned int slab_size_shift); + +size_t __must_check vdo_get_slab_depot_encoded_size(void); + +int __must_check +vdo_encode_slab_depot_state_2_0(struct slab_depot_state_2_0 state, + struct buffer *buffer); + +int __must_check +vdo_decode_slab_depot_state_2_0(struct buffer *buffer, + struct slab_depot_state_2_0 *state); + +int __must_check vdo_configure_slab_depot(block_count_t block_count, + physical_block_number_t first_block, + struct slab_config slab_config, + zone_count_t zone_count, + struct slab_depot_state_2_0 *state); + +int __must_check vdo_configure_slab(block_count_t slab_size, + block_count_t slab_journal_blocks, + struct slab_config *slab_config); + +block_count_t __must_check +vdo_get_saved_reference_count_size(block_count_t block_count); + +#endif /* SLAB_DEPOT_FORMAT_H */ diff --git a/vdo/slab-depot.c b/vdo/slab-depot.c new file mode 100644 index 00000000..e0559d9c --- /dev/null +++ b/vdo/slab-depot.c @@ -0,0 +1,1076 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright Red Hat + */ + +#include "slab-depot.h" + +#include + +#include "logger.h" +#include "memory-alloc.h" +#include "permassert.h" + +#include "action-manager.h" +#include "admin-state.h" +#include "block-allocator.h" +#include "completion.h" +#include "constants.h" +#include "header.h" +#include "num-utils.h" +#include "read-only-notifier.h" +#include "ref-counts.h" +#include "slab.h" +#include "slab-depot-format.h" +#include "slab-depot.h" +#include "slab-journal.h" +#include "slab-iterator.h" +#include "slab-summary.h" +#include "status-codes.h" +#include "thread-config.h" +#include "types.h" +#include "vdo.h" + +/** + * vdo_calculate_slab_count() - Calculate the number of slabs a depot + * would have. + * @depot: The depot. + * + * Return: The number of slabs. + */ +static +slab_count_t vdo_calculate_slab_count(struct slab_depot *depot) +{ + return vdo_compute_slab_count(depot->first_block, depot->last_block, + depot->slab_size_shift); +} + +/** + * get_slab_iterator() - Get an iterator over all the slabs in the depot. + * @depot: The depot. + * + * Return: An iterator over the depot's slabs. + */ +static struct slab_iterator get_slab_iterator(struct slab_depot *depot) +{ + return vdo_iterate_slabs(depot->slabs, depot->slab_count - 1, 0, 1); +} + +/** + * allocate_slabs() - Allocate a new slab pointer array. + * @depot: The depot. + * @slab_count: The number of slabs the depot should have in the new array. + * + * Any existing slab pointers will be copied into the new array, and slabs + * will be allocated as needed. 
The newly allocated slabs will not be + * distributed for use by the block allocators. + * + * Return: VDO_SUCCESS or an error code. + */ +static int allocate_slabs(struct slab_depot *depot, slab_count_t slab_count) +{ + block_count_t slab_size; + bool resizing = false; + physical_block_number_t slab_origin; + block_count_t translation; + + int result = UDS_ALLOCATE(slab_count, + struct vdo_slab *, + "slab pointer array", + &depot->new_slabs); + if (result != VDO_SUCCESS) { + return result; + } + + if (depot->slabs != NULL) { + memcpy(depot->new_slabs, + depot->slabs, + depot->slab_count * sizeof(struct vdo_slab *)); + resizing = true; + } + + slab_size = vdo_get_slab_config(depot)->slab_blocks; + slab_origin = depot->first_block + (depot->slab_count * slab_size); + + /* The translation between allocator partition PBNs and layer PBNs. */ + translation = depot->origin - depot->first_block; + depot->new_slab_count = depot->slab_count; + while (depot->new_slab_count < slab_count) { + struct block_allocator *allocator = + depot->allocators[depot->new_slab_count % + depot->zone_count]; + struct vdo_slab **slab_ptr = + &depot->new_slabs[depot->new_slab_count]; + result = vdo_make_slab(slab_origin, + allocator, + translation, + depot->vdo->recovery_journal, + depot->new_slab_count, + resizing, + slab_ptr); + if (result != VDO_SUCCESS) { + return result; + } + /* + * Increment here to ensure that vdo_abandon_new_slabs will + * clean up correctly. + */ + depot->new_slab_count++; + + slab_origin += slab_size; + } + + return VDO_SUCCESS; +} + +/** + * vdo_abandon_new_slabs() - Abandon any new slabs in this depot, freeing them + * as needed. + * @depot: The depot. + */ +void vdo_abandon_new_slabs(struct slab_depot *depot) +{ + slab_count_t i; + + if (depot->new_slabs == NULL) { + return; + } + + for (i = depot->slab_count; i < depot->new_slab_count; i++) { + vdo_free_slab(UDS_FORGET(depot->new_slabs[i])); + } + depot->new_slab_count = 0; + depot->new_size = 0; + UDS_FREE(UDS_FORGET(depot->new_slabs)); +} + +/** + * get_allocator_thread_id() - Get the ID of the thread on which a given + * allocator operates. + * + * Implements vdo_zone_thread_getter. + */ +static thread_id_t get_allocator_thread_id(void *context, + zone_count_t zone_number) +{ + return vdo_get_block_allocator_for_zone(context, zone_number)->thread_id; +} + +/** + * prepare_for_tail_block_commit() - Prepare to commit oldest tail blocks. + * + * Implements vdo_action_preamble. + */ +static void prepare_for_tail_block_commit(void *context, + struct vdo_completion *parent) +{ + struct slab_depot *depot = context; + + depot->active_release_request = depot->new_release_request; + vdo_complete_completion(parent); +} + +/** + * schedule_tail_block_commit() - Schedule a tail block commit if necessary. + * + * This method should not be called directly. Rather, call + * vdo_schedule_default_action() on the depot's action manager. + * + * Implements vdo_action_scheduler. + */ +static bool schedule_tail_block_commit(void *context) +{ + struct slab_depot *depot = context; + + if (depot->new_release_request == depot->active_release_request) { + return false; + } + + return vdo_schedule_action(depot->action_manager, + prepare_for_tail_block_commit, + vdo_release_tail_block_locks, + NULL, + NULL); +} + +/** + * allocate_components() - Allocate those components of the slab depot which + * are needed only at load time, not at format time. + * @depot: The depot. + * @summary_partition: The partition which holds the slab summary. 
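+ *
+ * This allocates the depot's action manager, the slab summary, one block
+ * allocator per physical zone, and the slab objects themselves, registering
+ * each slab with its allocator.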
+ * + * Return: VDO_SUCCESS or an error. + */ +static int allocate_components(struct slab_depot *depot, + struct partition *summary_partition) +{ + zone_count_t zone; + slab_count_t slab_count, i; + const struct thread_config *thread_config = depot->vdo->thread_config; + int result = vdo_make_action_manager(depot->zone_count, + get_allocator_thread_id, + thread_config->journal_thread, + depot, + schedule_tail_block_commit, + depot->vdo, + &depot->action_manager); + if (result != VDO_SUCCESS) { + return result; + } + + depot->origin = depot->first_block; + + result = vdo_make_slab_summary(depot->vdo, + summary_partition, + thread_config, + depot->slab_size_shift, + depot->slab_config.data_blocks, + depot->vdo->read_only_notifier, + &depot->slab_summary); + if (result != VDO_SUCCESS) { + return result; + } + + slab_count = vdo_calculate_slab_count(depot); + if (thread_config->physical_zone_count > slab_count) { + return uds_log_error_strerror(VDO_BAD_CONFIGURATION, + "%u physical zones exceeds slab count %u", + thread_config->physical_zone_count, + slab_count); + } + + /* Allocate the block allocators. */ + for (zone = 0; zone < depot->zone_count; zone++) { + thread_id_t thread_id = + vdo_get_physical_zone_thread(thread_config, zone); + result = vdo_make_block_allocator(depot, + zone, + thread_id, + depot->vdo->states.vdo.nonce, + VIO_POOL_SIZE, + depot->vdo, + depot->vdo->read_only_notifier, + &depot->allocators[zone]); + if (result != VDO_SUCCESS) { + return result; + } + } + + /* Allocate slabs. */ + result = allocate_slabs(depot, slab_count); + if (result != VDO_SUCCESS) { + return result; + } + + /* Use the new slabs. */ + for (i = depot->slab_count; i < depot->new_slab_count; i++) { + struct vdo_slab *slab = depot->new_slabs[i]; + + vdo_register_slab_with_allocator(slab->allocator, slab); + WRITE_ONCE(depot->slab_count, depot->slab_count + 1); + } + + depot->slabs = depot->new_slabs; + depot->new_slabs = NULL; + depot->new_slab_count = 0; + + return VDO_SUCCESS; +} + +/** + * vdo_decode_slab_depot() - Make a slab depot and configure it with the state + * read from the super block. + * @state: The slab depot state from the super block. + * @vdo: The VDO which will own the depot. + * @summary_partition: The partition which holds the slab summary. + * @depot_ptr: A pointer to hold the depot. + * + * Return: A success or error code. + */ +int vdo_decode_slab_depot(struct slab_depot_state_2_0 state, + struct vdo *vdo, + struct partition *summary_partition, + struct slab_depot **depot_ptr) +{ + unsigned int slab_size_shift; + struct slab_depot *depot; + int result; + + /* + * Calculate the bit shift for efficiently mapping block numbers to + * slabs. Using a shift requires that the slab size be a power of two. 
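+ *
+ * For example, a 2 GB slab of 4 KB blocks contains 2^19 blocks, so
+ * slab_size_shift is 19 and a block's slab is found by shifting its
+ * offset from first_block right by 19.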
+ */ + block_count_t slab_size = state.slab_config.slab_blocks; + + if (!is_power_of_2(slab_size)) { + return uds_log_error_strerror(UDS_INVALID_ARGUMENT, + "slab size must be a power of two"); + } + slab_size_shift = ilog2(slab_size); + + result = UDS_ALLOCATE_EXTENDED(struct slab_depot, + vdo->thread_config->physical_zone_count, + struct block_allocator *, + __func__, + &depot); + if (result != VDO_SUCCESS) { + return result; + } + + depot->vdo = vdo; + depot->old_zone_count = state.zone_count; + depot->zone_count = vdo->thread_config->physical_zone_count; + depot->slab_config = state.slab_config; + depot->first_block = state.first_block; + depot->last_block = state.last_block; + depot->slab_size_shift = slab_size_shift; + + result = allocate_components(depot, summary_partition); + if (result != VDO_SUCCESS) { + vdo_free_slab_depot(depot); + return result; + } + + *depot_ptr = depot; + return VDO_SUCCESS; +} + +/** + * vdo_free_slab_depot() - Destroy a slab depot. + * @depot: The depot to destroy. + */ +void vdo_free_slab_depot(struct slab_depot *depot) +{ + zone_count_t zone = 0; + + if (depot == NULL) { + return; + } + + vdo_abandon_new_slabs(depot); + + for (zone = 0; zone < depot->zone_count; zone++) { + vdo_free_block_allocator(UDS_FORGET(depot->allocators[zone])); + } + + if (depot->slabs != NULL) { + slab_count_t i; + + for (i = 0; i < depot->slab_count; i++) { + vdo_free_slab(UDS_FORGET(depot->slabs[i])); + } + } + + UDS_FREE(UDS_FORGET(depot->slabs)); + UDS_FREE(UDS_FORGET(depot->action_manager)); + vdo_free_slab_summary(UDS_FORGET(depot->slab_summary)); + UDS_FREE(depot); +} + +/** + * vdo_record_slab_depot() - Record the state of a slab depot for encoding + * into the super block. + * @depot: The depot to encode. + * + * Return: The depot state. + */ +struct slab_depot_state_2_0 vdo_record_slab_depot(const struct slab_depot *depot) +{ + /* + * If this depot is currently using 0 zones, it must have been + * synchronously loaded by a tool and is now being saved. We + * did not load and combine the slab summary, so we still need + * to do that next time we load with the old zone count rather + * than 0. + */ + struct slab_depot_state_2_0 state; + zone_count_t zones_to_record = depot->zone_count; + + if (depot->zone_count == 0) { + zones_to_record = depot->old_zone_count; + } + + state = (struct slab_depot_state_2_0) { + .slab_config = depot->slab_config, + .first_block = depot->first_block, + .last_block = depot->last_block, + .zone_count = zones_to_record, + }; + + return state; +} + +/** + * vdo_allocate_slab_ref_counts() - Allocate the ref_counts for all slabs in + * the depot. + * @depot: The depot whose ref_counts need allocation. + * + * Context: This method may be called only before entering normal operation + * from the load thread. + * + * Return: VDO_SUCCESS or an error. + */ +int vdo_allocate_slab_ref_counts(struct slab_depot *depot) +{ + struct slab_iterator iterator = get_slab_iterator(depot); + + while (vdo_has_next_slab(&iterator)) { + int result = + vdo_allocate_ref_counts_for_slab(vdo_next_slab(&iterator)); + if (result != VDO_SUCCESS) { + return result; + } + } + + return VDO_SUCCESS; +} + +/** + * vdo_get_block_allocator_for_zone() - Get the block allocator for a + * specified physical zone from a depot. + * @depot: The depot. + * @zone_number: The physical zone. + * + * Return: The block allocator for the specified zone. 
+ */ +struct block_allocator *vdo_get_block_allocator_for_zone(struct slab_depot *depot, + zone_count_t zone_number) +{ + return depot->allocators[zone_number]; +} + +/** + * vdo_get_slab_number() - Get the number of the slab that contains a + * specified block. + * @depot: The slab depot. + * @pbn: The physical block number. + * @slab_number_ptr: A pointer to hold the slab number. + * + * Return: VDO_SUCCESS or an error. + */ +static +int vdo_get_slab_number(const struct slab_depot *depot, + physical_block_number_t pbn, + slab_count_t *slab_number_ptr) +{ + slab_count_t slab_number; + + if (pbn < depot->first_block) { + return VDO_OUT_OF_RANGE; + } + + slab_number = (pbn - depot->first_block) >> depot->slab_size_shift; + if (slab_number >= depot->slab_count) { + return VDO_OUT_OF_RANGE; + } + + *slab_number_ptr = slab_number; + return VDO_SUCCESS; +} + +/** + * vdo_get_slab() - Get the slab object for the slab that contains a specified + * block. + * @depot: The slab depot. + * @pbn: The physical block number. + * + * Will put the VDO in read-only mode if the PBN is not a valid data block nor + * the zero block. + * + * Return: The slab containing the block, or NULL if the block number is the + * zero block or otherwise out of range. + */ +struct vdo_slab *vdo_get_slab(const struct slab_depot *depot, + physical_block_number_t pbn) +{ + slab_count_t slab_number; + int result; + + if (pbn == VDO_ZERO_BLOCK) { + return NULL; + } + + result = vdo_get_slab_number(depot, pbn, &slab_number); + if (result != VDO_SUCCESS) { + vdo_enter_read_only_mode(depot->vdo->read_only_notifier, result); + return NULL; + } + + return depot->slabs[slab_number]; +} + +/** + * vdo_get_slab_journal() - Get the slab journal for the slab that contains a + * specified block. + * @depot: The slab depot. + * @pbn: The physical block number within the block depot partition of any + * block in the slab. + * + * Return: The slab journal of the slab containing the block, or NULL if the + * block number is for the zero block or otherwise out of range. + */ +struct slab_journal *vdo_get_slab_journal(const struct slab_depot *depot, + physical_block_number_t pbn) +{ + struct vdo_slab *slab = vdo_get_slab(depot, pbn); + + return ((slab != NULL) ? slab->journal : NULL); +} + +/** + * vdo_get_increment_limit() - Determine how many new references a block can + * acquire. + * @depot: The slab depot. + * @pbn: The physical block number that is being queried. + * + * Context: This method must be called from the physical zone thread of the + * PBN. + * + * Return: The number of available references. + */ +uint8_t vdo_get_increment_limit(struct slab_depot *depot, + physical_block_number_t pbn) +{ + struct vdo_slab *slab = vdo_get_slab(depot, pbn); + + if ((slab == NULL) || vdo_is_unrecovered_slab(slab)) { + return 0; + } + + return vdo_get_available_references(slab->reference_counts, pbn); +} + +/** + * vdo_is_physical_data_block() - Determine whether the given PBN refers to a + * data block. + * @depot: The depot. + * @pbn: The physical block number to ask about. + * + * Return: True if the PBN corresponds to a data block. 
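+ *
+ * The zero block always counts as a valid address; any other PBN must fall
+ * within the data blocks of one of the depot's slabs.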
+ */ +bool vdo_is_physical_data_block(const struct slab_depot *depot, + physical_block_number_t pbn) +{ + slab_count_t slab_number; + slab_block_number sbn; + int result; + + if (pbn == VDO_ZERO_BLOCK) { + return true; + } + + if (vdo_get_slab_number(depot, pbn, &slab_number) != VDO_SUCCESS) { + return false; + } + + result = vdo_slab_block_number_from_pbn(depot->slabs[slab_number], + pbn, &sbn); + return (result == VDO_SUCCESS); +} + +/** + * vdo_get_slab_depot_allocated_blocks() - Get the total number of data blocks + * allocated across all the slabs in + * the depot. + * @depot: The slab depot. + * + * This is the total number of blocks with a non-zero reference count. + * + * Context: This may be called from any thread. + * + * Return: The total number of blocks with a non-zero reference count. + */ +block_count_t vdo_get_slab_depot_allocated_blocks(const struct slab_depot *depot) +{ + block_count_t total = 0; + zone_count_t zone; + + for (zone = 0; zone < depot->zone_count; zone++) { + /* The allocators are responsible for thread safety. */ + total += READ_ONCE(depot->allocators[zone]->allocated_blocks); + } + return total; +} + +/** + * vdo_get_slab_depot_data_blocks() - Get the total number of data blocks in + * all the slabs in the depot. + * @depot: The slab depot. + * + * Context: This may be called from any thread. + * + * Return: The total number of data blocks in all slabs. + */ +block_count_t vdo_get_slab_depot_data_blocks(const struct slab_depot *depot) +{ + return (READ_ONCE(depot->slab_count) * depot->slab_config.data_blocks); +} + +/** + * vdo_get_slab_depot_unrecovered_slab_count() - Get the total number of + * unrecovered slabs in the + * depot. + * @depot: The slab depot. + * + * This is the total number of unrecovered slabs from all zones. + * + * Context: This may be called from any thread. + * + * Return: The total number of slabs that are unrecovered. + */ +static +slab_count_t vdo_get_slab_depot_unrecovered_slab_count(const struct slab_depot *depot) +{ + slab_count_t total = 0; + zone_count_t zone; + + for (zone = 0; zone < depot->zone_count; zone++) { + struct block_allocator *allocator = depot->allocators[zone]; + /* The allocators are responsible for thread safety. */ + total += vdo_get_scrubber_slab_count(allocator->slab_scrubber); + } + return total; +} + +/** + * start_depot_load() - The preamble of a load operation which loads the slab + * summary. + * + * Implements vdo_action_preamble. + */ +static void start_depot_load(void *context, struct vdo_completion *parent) +{ + struct slab_depot *depot = context; + + vdo_load_slab_summary(depot->slab_summary, + vdo_get_current_manager_operation(depot->action_manager), + depot->old_zone_count, + parent); +} + +/** + * vdo_load_slab_depot() - Asynchronously load any slab depot state that isn't + * included in the super_block component. + * @depot: The depot to load. + * @operation: The type of load to perform. + * @parent: The completion to finish when the load is complete. + * @context: Additional context for the load operation; may be NULL. + * + * This method may be called only before entering normal operation from the + * load thread. 
+ */ +void vdo_load_slab_depot(struct slab_depot *depot, + const struct admin_state_code *operation, + struct vdo_completion *parent, + void *context) +{ + if (vdo_assert_load_operation(operation, parent)) { + vdo_schedule_operation_with_context(depot->action_manager, + operation, + start_depot_load, + vdo_load_block_allocator, + NULL, + context, + parent); + } +} + +/** + * vdo_prepare_slab_depot_to_allocate() - Prepare the slab depot to come + * online and start allocating blocks. + * @depot: The depot to prepare. + * @load_type: The load type. + * @parent: The completion to finish when the operation is complete. + * + * This method may be called only before entering normal operation from the + * load thread. It must be called before allocation may proceed. + */ +void vdo_prepare_slab_depot_to_allocate(struct slab_depot *depot, + enum slab_depot_load_type load_type, + struct vdo_completion *parent) +{ + depot->load_type = load_type; + atomic_set(&depot->zones_to_scrub, depot->zone_count); + vdo_schedule_action(depot->action_manager, + NULL, + vdo_prepare_block_allocator_to_allocate, + NULL, + parent); +} + +/** + * vdo_update_slab_depot_size() - Update the slab depot to reflect its new + * size in memory. + * @depot: The depot to update. + * + * This size is saved to disk as part of the super block. + */ +void vdo_update_slab_depot_size(struct slab_depot *depot) +{ + depot->last_block = depot->new_last_block; +} + +/** + * vdo_prepare_to_grow_slab_depot() - Allocate new memory needed for a resize + * of a slab depot to the given size. + * @depot: The depot to prepare to resize. + * @new_size: The number of blocks in the new depot. + * + * Return: VDO_SUCCESS or an error. + */ +int vdo_prepare_to_grow_slab_depot(struct slab_depot *depot, block_count_t new_size) +{ + struct slab_depot_state_2_0 new_state; + int result; + slab_count_t new_slab_count; + + if ((new_size >> depot->slab_size_shift) <= depot->slab_count) { + return VDO_INCREMENT_TOO_SMALL; + } + + /* Generate the depot configuration for the new block count. */ + result = vdo_configure_slab_depot(new_size, + depot->first_block, + depot->slab_config, + depot->zone_count, + &new_state); + if (result != VDO_SUCCESS) { + return result; + } + + new_slab_count = vdo_compute_slab_count(depot->first_block, + new_state.last_block, + depot->slab_size_shift); + if (new_slab_count <= depot->slab_count) { + return uds_log_error_strerror(VDO_INCREMENT_TOO_SMALL, + "Depot can only grow"); + } + if (new_slab_count == depot->new_slab_count) { + /* Check it out, we've already got all the new slabs allocated! */ + return VDO_SUCCESS; + } + + vdo_abandon_new_slabs(depot); + result = allocate_slabs(depot, new_slab_count); + if (result != VDO_SUCCESS) { + vdo_abandon_new_slabs(depot); + return result; + } + + depot->new_size = new_size; + depot->old_last_block = depot->last_block; + depot->new_last_block = new_state.last_block; + + return VDO_SUCCESS; +} + +/** + * finish_registration() - Finish registering new slabs now that all of the + * allocators have received their new slabs. + * + * Implements vdo_action_conclusion. + */ +static int finish_registration(void *context) +{ + struct slab_depot *depot = context; + + WRITE_ONCE(depot->slab_count, depot->new_slab_count); + UDS_FREE(depot->slabs); + depot->slabs = depot->new_slabs; + depot->new_slabs = NULL; + depot->new_slab_count = 0; + return VDO_SUCCESS; +} + +/** + * vdo_use_new_slabs() - Use the new slabs allocated for resize. + * @depot: The depot. 
+ * @parent: The object to notify when complete. + */ +void vdo_use_new_slabs(struct slab_depot *depot, struct vdo_completion *parent) +{ + ASSERT_LOG_ONLY(depot->new_slabs != NULL, "Must have new slabs to use"); + vdo_schedule_operation(depot->action_manager, + VDO_ADMIN_STATE_SUSPENDED_OPERATION, + NULL, + vdo_register_new_slabs_for_allocator, + finish_registration, + parent); +} + +/** + * vdo_drain_slab_depot() - Drain all slab depot I/O. + * @depot: The depot to drain. + * @operation: The drain operation (flush, rebuild, suspend, or save). + * @parent: The completion to finish when the drain is complete. + * + * If saving, or flushing, all dirty depot metadata will be written out. If + * saving or suspending, the depot will be left in a suspended state. + */ +void vdo_drain_slab_depot(struct slab_depot *depot, + const struct admin_state_code *operation, + struct vdo_completion *parent) +{ + vdo_schedule_operation(depot->action_manager, + operation, + NULL, + vdo_drain_block_allocator, + NULL, + parent); +} + +/** + * vdo_resume_slab_depot() - Resume a suspended slab depot. + * @depot: The depot to resume. + * @parent: The completion to finish when the depot has resumed. + */ +void vdo_resume_slab_depot(struct slab_depot *depot, struct vdo_completion *parent) +{ + if (vdo_is_read_only(depot->vdo->read_only_notifier)) { + vdo_finish_completion(parent, VDO_READ_ONLY); + return; + } + + vdo_schedule_operation(depot->action_manager, + VDO_ADMIN_STATE_RESUMING, + NULL, + vdo_resume_block_allocator, + NULL, + parent); +} + +/** + * vdo_commit_oldest_slab_journal_tail_blocks() - Commit all dirty tail blocks + * which are locking a given + * recovery journal block. + * @depot: The depot. + * @recovery_block_number: The sequence number of the recovery journal block + * whose locks should be released. + * + * Context: This method must be called from the journal zone thread. + */ +void +vdo_commit_oldest_slab_journal_tail_blocks(struct slab_depot *depot, + sequence_number_t recovery_block_number) +{ + if (depot == NULL) { + return; + } + + depot->new_release_request = recovery_block_number; + vdo_schedule_default_action(depot->action_manager); +} + +/** + * vdo_get_slab_config() - Get the slab_config of a depot. + * @depot: The slab depot. + * + * Return: The slab configuration of the specified depot. + */ +const struct slab_config *vdo_get_slab_config(const struct slab_depot *depot) +{ + return &depot->slab_config; +} + +/** + * vdo_get_slab_summary() - Get the slab summary. + * @depot: The slab depot. + * + * Return: The slab summary. + */ +struct slab_summary *vdo_get_slab_summary(const struct slab_depot *depot) +{ + return depot->slab_summary; +} + +/** + * vdo_scrub_all_unrecovered_slabs() - Scrub all unrecovered slabs. + * @depot: The depot to scrub. + * @parent: The object to notify when scrubbing has been launched for all + * zones. + */ +void vdo_scrub_all_unrecovered_slabs(struct slab_depot *depot, + struct vdo_completion *parent) +{ + vdo_schedule_action(depot->action_manager, + NULL, + vdo_scrub_all_unrecovered_slabs_in_zone, + NULL, + parent); +} + +/** + * vdo_notify_zone_finished_scrubbing() - Notify a slab depot that one of its + * allocators has finished scrubbing + * slabs. + * @completion: A completion whose parent must be a slab depot. + * + * This method should only be called if the scrubbing was successful. This + * callback is registered by each block allocator in + * vdo_scrub_all_unrecovered_slabs_in_zone(). 
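+ *
+ * Only the last zone to finish scrubbing proceeds past the counter check
+ * below; it attempts a single compare-and-swap of the VDO state from
+ * VDO_RECOVERING to VDO_DIRTY, so the exit from recovery mode is logged
+ * exactly once.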
+ * + */ +void vdo_notify_zone_finished_scrubbing(struct vdo_completion *completion) +{ + enum vdo_state prior_state; + + struct slab_depot *depot = completion->parent; + + if (atomic_add_return(-1, &depot->zones_to_scrub) > 0) { + return; + } + + /* We're the last! */ + prior_state = atomic_cmpxchg(&depot->vdo->state, + VDO_RECOVERING, VDO_DIRTY); + /* + * To be safe, even if the CAS failed, ensure anything that follows is + * ordered with respect to whatever state change did happen. + */ + smp_mb__after_atomic(); + + /* + * We must check the VDO state here and not the depot's + * read_only_notifier since the compare-swap-above could have + * failed due to a read-only entry which our own thread does not + * yet know about. + */ + if (prior_state == VDO_DIRTY) { + uds_log_info("VDO commencing normal operation"); + } else if (prior_state == VDO_RECOVERING) { + uds_log_info("Exiting recovery mode"); + } +} + +/** + * vdo_get_slab_depot_new_size() - Get the physical size to which this depot + * is prepared to grow. + * @depot: The slab depot. + * + * Return: The new number of blocks the depot will be grown to, or 0 if the + * depot is not prepared to grow. + */ +block_count_t vdo_get_slab_depot_new_size(const struct slab_depot *depot) +{ + return (depot->new_slabs == NULL) ? 0 : depot->new_size; +} + + +/** + * get_depot_block_allocator_statistics() - Get the total of the statistics + * from all the block allocators in + * the depot. + * @depot: The slab depot. + * + * Return: The statistics from all block allocators in the depot. + */ +static struct block_allocator_statistics __must_check +get_depot_block_allocator_statistics(const struct slab_depot *depot) +{ + struct block_allocator_statistics totals; + zone_count_t zone; + + memset(&totals, 0, sizeof(totals)); + + for (zone = 0; zone < depot->zone_count; zone++) { + struct block_allocator *allocator = depot->allocators[zone]; + struct block_allocator_statistics stats = + vdo_get_block_allocator_statistics(allocator); + totals.slab_count += stats.slab_count; + totals.slabs_opened += stats.slabs_opened; + totals.slabs_reopened += stats.slabs_reopened; + } + + return totals; +} + +/** + * get_depot_ref_counts_statistics() - Get the cumulative ref_counts + * statistics for the depot. + * @depot: The slab depot. + * + * Return: The cumulative statistics for all ref_counts in the depot. + */ +static struct ref_counts_statistics __must_check +get_depot_ref_counts_statistics(const struct slab_depot *depot) +{ + struct ref_counts_statistics depot_stats; + zone_count_t zone; + + memset(&depot_stats, 0, sizeof(depot_stats)); + + for (zone = 0; zone < depot->zone_count; zone++) { + struct block_allocator *allocator = depot->allocators[zone]; + struct ref_counts_statistics stats = + vdo_get_ref_counts_statistics(allocator); + depot_stats.blocks_written += stats.blocks_written; + } + + return depot_stats; +} + +/** + * get_depot_slab_journal_statistics() - Get the aggregated slab journal + * statistics for the depot. + * @depot: The slab depot. + * + * Return: The aggregated statistics for all slab journals in the depot. 
+ */ +static struct slab_journal_statistics __must_check +get_depot_slab_journal_statistics(const struct slab_depot *depot) +{ + struct slab_journal_statistics depot_stats; + zone_count_t zone; + + memset(&depot_stats, 0, sizeof(depot_stats)); + + for (zone = 0; zone < depot->zone_count; zone++) { + struct block_allocator *allocator = depot->allocators[zone]; + struct slab_journal_statistics stats = + vdo_get_slab_journal_statistics(allocator); + depot_stats.disk_full_count += stats.disk_full_count; + depot_stats.flush_count += stats.flush_count; + depot_stats.blocked_count += stats.blocked_count; + depot_stats.blocks_written += stats.blocks_written; + depot_stats.tail_busy_count += stats.tail_busy_count; + } + + return depot_stats; +} + +/** + * vdo_get_slab_depot_statistics() - Get all the vdo_statistics fields that + * are properties of the slab depot. + * @depot: The slab depot. + * @stats: The vdo statistics structure to partially fill. + */ +void vdo_get_slab_depot_statistics(const struct slab_depot *depot, + struct vdo_statistics *stats) +{ + slab_count_t slab_count = READ_ONCE(depot->slab_count); + slab_count_t unrecovered = + vdo_get_slab_depot_unrecovered_slab_count(depot); + + stats->recovery_percentage = + (slab_count - unrecovered) * 100 / slab_count; + stats->allocator = get_depot_block_allocator_statistics(depot); + stats->ref_counts = get_depot_ref_counts_statistics(depot); + stats->slab_journal = get_depot_slab_journal_statistics(depot); + stats->slab_summary = + vdo_get_slab_summary_statistics(depot->slab_summary); +} + +/** + * vdo_dump_slab_depot() - Dump the slab depot, in a thread-unsafe fashion. + * @depot: The slab depot. + */ +void vdo_dump_slab_depot(const struct slab_depot *depot) +{ + uds_log_info("vdo slab depot"); + uds_log_info(" zone_count=%u old_zone_count=%u slabCount=%u active_release_request=%llu new_release_request=%llu", + (unsigned int) depot->zone_count, + (unsigned int) depot->old_zone_count, + READ_ONCE(depot->slab_count), + (unsigned long long) depot->active_release_request, + (unsigned long long) depot->new_release_request); +} diff --git a/vdo/slab-depot.h b/vdo/slab-depot.h new file mode 100644 index 00000000..507b593a --- /dev/null +++ b/vdo/slab-depot.h @@ -0,0 +1,164 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright Red Hat + */ + +#ifndef SLAB_DEPOT_H +#define SLAB_DEPOT_H + +#include +#include "admin-state.h" +#include "slab-depot-format.h" +#include "statistics.h" +#include "types.h" +#include "vdo-layout.h" + +/* + * A slab_depot is responsible for managing all of the slabs and block + * allocators of a VDO. It has a single array of slabs in order to eliminate + * the need for additional math in order to compute which physical zone a PBN + * is in. It also has a block_allocator per zone. + * + * Load operations are required to be performed on a single thread. Normal + * operations are assumed to be performed in the appropriate zone. Allocations + * and reference count updates must be done from the thread of their physical + * zone. Requests to commit slab journal tail blocks from the recovery journal + * must be done on the journal zone thread. Save operations are required to be + * launched from the same thread as the original load operation. 
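+ *
+ * Slabs are assigned to the block allocators round-robin, so slab i is
+ * managed by physical zone (i % zone_count).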
+ */ + +enum slab_depot_load_type { + VDO_SLAB_DEPOT_NORMAL_LOAD, + VDO_SLAB_DEPOT_RECOVERY_LOAD, + VDO_SLAB_DEPOT_REBUILD_LOAD +}; + +struct slab_depot { + zone_count_t zone_count; + zone_count_t old_zone_count; + struct vdo *vdo; + struct slab_config slab_config; + struct slab_summary *slab_summary; + struct action_manager *action_manager; + + physical_block_number_t first_block; + physical_block_number_t last_block; + physical_block_number_t origin; + + /* slab_size == (1 << slab_size_shift) */ + unsigned int slab_size_shift; + + /* Determines how slabs should be queued during load */ + enum slab_depot_load_type load_type; + + /* The state for notifying slab journals to release recovery journal */ + sequence_number_t active_release_request; + sequence_number_t new_release_request; + + /* State variables for scrubbing complete handling */ + atomic_t zones_to_scrub; + + /* Array of pointers to individually allocated slabs */ + struct vdo_slab **slabs; + /* The number of slabs currently allocated and stored in 'slabs' */ + slab_count_t slab_count; + + /* Array of pointers to a larger set of slabs (used during resize) */ + struct vdo_slab **new_slabs; + /* The number of slabs currently allocated and stored in 'new_slabs' */ + slab_count_t new_slab_count; + /* The size that 'new_slabs' was allocated for */ + block_count_t new_size; + + /* The last block before resize, for rollback */ + physical_block_number_t old_last_block; + /* The last block after resize, for resize */ + physical_block_number_t new_last_block; + + /* The block allocators for this depot */ + struct block_allocator *allocators[]; +}; + +int __must_check +vdo_decode_slab_depot(struct slab_depot_state_2_0 state, + struct vdo *vdo, + struct partition *summary_partition, + struct slab_depot **depot_ptr); + +void vdo_free_slab_depot(struct slab_depot *depot); + +struct slab_depot_state_2_0 __must_check +vdo_record_slab_depot(const struct slab_depot *depot); + +int __must_check vdo_allocate_slab_ref_counts(struct slab_depot *depot); + +struct block_allocator * __must_check +vdo_get_block_allocator_for_zone(struct slab_depot *depot, + zone_count_t zone_number); + +struct vdo_slab * __must_check +vdo_get_slab(const struct slab_depot *depot, physical_block_number_t pbn); + +struct slab_journal * __must_check +vdo_get_slab_journal(const struct slab_depot *depot, physical_block_number_t pbn); + +uint8_t __must_check +vdo_get_increment_limit(struct slab_depot *depot, physical_block_number_t pbn); + +bool __must_check +vdo_is_physical_data_block(const struct slab_depot *depot, + physical_block_number_t pbn); + +block_count_t __must_check +vdo_get_slab_depot_allocated_blocks(const struct slab_depot *depot); + +block_count_t __must_check +vdo_get_slab_depot_data_blocks(const struct slab_depot *depot); + +void vdo_get_slab_depot_statistics(const struct slab_depot *depot, + struct vdo_statistics *stats); + +void vdo_load_slab_depot(struct slab_depot *depot, + const struct admin_state_code *operation, + struct vdo_completion *parent, + void *context); + +void vdo_prepare_slab_depot_to_allocate(struct slab_depot *depot, + enum slab_depot_load_type load_type, + struct vdo_completion *parent); + +void vdo_update_slab_depot_size(struct slab_depot *depot); + +int __must_check +vdo_prepare_to_grow_slab_depot(struct slab_depot *depot, block_count_t new_size); + +void vdo_use_new_slabs(struct slab_depot *depot, struct vdo_completion *parent); + +void vdo_abandon_new_slabs(struct slab_depot *depot); + +void vdo_drain_slab_depot(struct slab_depot 
*depot, + const struct admin_state_code *operation, + struct vdo_completion *parent); + +void vdo_resume_slab_depot(struct slab_depot *depot, struct vdo_completion *parent); + +void +vdo_commit_oldest_slab_journal_tail_blocks(struct slab_depot *depot, + sequence_number_t recovery_block_number); + +const struct slab_config * __must_check +vdo_get_slab_config(const struct slab_depot *depot); + +struct slab_summary * __must_check +vdo_get_slab_summary(const struct slab_depot *depot); + +void vdo_scrub_all_unrecovered_slabs(struct slab_depot *depot, + struct vdo_completion *parent); + +block_count_t __must_check vdo_get_slab_depot_new_size(const struct slab_depot *depot); + +void vdo_dump_slab_depot(const struct slab_depot *depot); + +void vdo_notify_zone_finished_scrubbing(struct vdo_completion *completion); + +#endif /* SLAB_DEPOT_H */ diff --git a/vdo/slab-iterator.h b/vdo/slab-iterator.h new file mode 100644 index 00000000..c80f607a --- /dev/null +++ b/vdo/slab-iterator.h @@ -0,0 +1,81 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright Red Hat + */ + +#ifndef SLAB_ITERATOR_H +#define SLAB_ITERATOR_H + +#include "slab.h" +#include "types.h" + +/* + * A slab_iterator is a structure for iterating over a set of slabs. + */ +struct slab_iterator { + struct vdo_slab **slabs; + struct vdo_slab *next; + slab_count_t end; + slab_count_t stride; +}; + +/** + * vdo_iterate_slabs() - Return a slab_iterator initialized to iterate + * over an array of slabs with a given stride. + * @slabs: The array of slabs. + * @start: The number of the slab to start iterating from. + * @end: The number of the last slab which may be returned. + * @stride: The difference in slab number between successive slabs. + * + * Iteration always occurs from higher to lower numbered slabs. + * + * Return: An initialized iterator structure. + */ +static inline struct slab_iterator vdo_iterate_slabs(struct vdo_slab **slabs, + slab_count_t start, + slab_count_t end, + slab_count_t stride) +{ + return (struct slab_iterator) { + .slabs = slabs, + .next = (((slabs == NULL) || (start < end)) ? NULL + : slabs[start]), + .end = end, + .stride = stride, + }; +} + +/** + * vdo_has_next_slab() - Check whether another vdo_slab would be returned by + * the iterator. + * @iterator: The iterator to poll. + * + * Return: true if the next call to vdo_next_slab will return a vdo_slab. + */ +static inline bool vdo_has_next_slab(const struct slab_iterator *iterator) +{ + return (iterator->next != NULL); +} + +/** + * vdo_next_slab() - Get the next vdo_slab, advancing the iterator. + * @iterator: The iterator over the vdo_slab chain. + * + * Return: The next vdo_slab or NULL if the array of slabs is empty or if all + * the appropriate Slabs have been returned. + */ +static inline struct vdo_slab *vdo_next_slab(struct slab_iterator *iterator) +{ + struct vdo_slab *slab = iterator->next; + + if ((slab == NULL) + || (slab->slab_number < iterator->end + iterator->stride)) { + iterator->next = NULL; + } else { + iterator->next = + iterator->slabs[slab->slab_number - iterator->stride]; + } + return slab; +} + +#endif /* SLAB_ITERATOR_H */ diff --git a/vdo/slab-journal-format.c b/vdo/slab-journal-format.c new file mode 100644 index 00000000..e075c85c --- /dev/null +++ b/vdo/slab-journal-format.c @@ -0,0 +1,31 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright Red Hat + */ + +#include "slab-journal-format.h" + +#include "journal-point.h" +#include "types.h" + +/** + * vdo_decode_slab_journal_entry() - Decode a slab journal entry. 
+ * @block: The journal block holding the entry. + * @entry_count: The number of the entry. + * + * Return: The decoded entry. + */ +struct slab_journal_entry +vdo_decode_slab_journal_entry(struct packed_slab_journal_block *block, + journal_entry_count_t entry_count) +{ + struct slab_journal_entry entry = + vdo_unpack_slab_journal_entry(&block->payload.entries[entry_count]); + if (block->header.has_block_map_increments && + ((block->payload.full_entries.entry_types[entry_count / 8] & + ((byte)1 << (entry_count % 8))) != 0)) { + entry.operation = VDO_JOURNAL_BLOCK_MAP_INCREMENT; + } + return entry; +} + diff --git a/vdo/slabJournalFormat.h b/vdo/slab-journal-format.h similarity index 55% rename from vdo/slabJournalFormat.h rename to vdo/slab-journal-format.h index 9c7f0a0d..7fd4cf47 100644 --- a/vdo/slabJournalFormat.h +++ b/vdo/slab-journal-format.h @@ -1,22 +1,6 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ /* * Copyright Red Hat - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/sulfur/src/c++/vdo/base/slabJournalFormat.h#11 $ */ #ifndef SLAB_JOURNAL_FORMAT_H @@ -25,24 +9,24 @@ #include "numeric.h" #include "constants.h" -#include "journalPoint.h" +#include "journal-point.h" #include "types.h" -/** +/* * vdo_slab journal blocks may have one of two formats, depending upon whether * or not any of the entries in the block are block map increments. Since the * steady state for a VDO is that all of the necessary block map pages will be * allocated, most slab journal blocks will have only data entries. Such * blocks can hold more entries, hence the two formats. 
- **/ + */ -/** A single slab journal entry */ +/* A single slab journal entry */ struct slab_journal_entry { slab_block_number sbn; enum journal_operation operation; }; -/** A single slab journal entry in its on-disk form */ +/* A single slab journal entry in its on-disk form */ typedef struct { uint8_t offset_low8; uint8_t offset_mid8; @@ -56,42 +40,42 @@ typedef struct { #endif } __packed packed_slab_journal_entry; -/** The unpacked representation of the header of a slab journal block */ +/* The unpacked representation of the header of a slab journal block */ struct slab_journal_block_header { - /** Sequence number for head of journal */ + /* Sequence number for head of journal */ sequence_number_t head; - /** Sequence number for this block */ + /* Sequence number for this block */ sequence_number_t sequence_number; - /** The nonce for a given VDO instance */ + /* The nonce for a given VDO instance */ nonce_t nonce; - /** Recovery journal point for last entry */ + /* Recovery journal point for last entry */ struct journal_point recovery_point; - /** Metadata type */ + /* Metadata type */ enum vdo_metadata_type metadata_type; - /** Whether this block contains block map increments */ + /* Whether this block contains block map increments */ bool has_block_map_increments; - /** The number of entries in the block */ + /* The number of entries in the block */ journal_entry_count_t entry_count; }; -/** +/* * The packed, on-disk representation of a slab journal block header. * All fields are kept in little-endian byte order. - **/ + */ struct packed_slab_journal_block_header { - /** 64-bit sequence number for head of journal */ + /* 64-bit sequence number for head of journal */ __le64 head; - /** 64-bit sequence number for this block */ + /* 64-bit sequence number for this block */ __le64 sequence_number; - /** Recovery journal point for the last entry, packed into 64 bits */ + /* Recovery journal point for the last entry, packed into 64 bits */ struct packed_journal_point recovery_point; - /** The 64-bit nonce for a given VDO instance */ + /* The 64-bit nonce for a given VDO instance */ __le64 nonce; - /** 8-bit metadata type (should always be two, for the slab journal) */ + /* 8-bit metadata type (should always be two, for the slab journal) */ uint8_t metadata_type; - /** Whether this block contains block map increments */ + /* Whether this block contains block map increments */ bool has_block_map_increments; - /** 16-bit count of the entries encoded in the block */ + /* 16-bit count of the entries encoded in the block */ __le16 entry_count; } __packed; @@ -106,7 +90,7 @@ enum { (VDO_SLAB_JOURNAL_PAYLOAD_SIZE / sizeof(packed_slab_journal_entry)), }; -/** The payload of a slab journal block which has block map increments */ +/* The payload of a slab journal block which has block map increments */ struct full_slab_journal_entries { /* The entries themselves */ packed_slab_journal_entry entries[VDO_SLAB_JOURNAL_FULL_ENTRIES_PER_BLOCK]; @@ -129,14 +113,14 @@ struct packed_slab_journal_block { } __packed; /** - * Get the physical block number of the start of the slab journal - * relative to the start block allocator partition. - * - * @param slab_config The slab configuration of the VDO - * @param origin The first block of the slab - **/ + * vdo_get_slab_journal_start_block() - Get the physical block number of the + * start of the slab journal relative to + * the start block allocator partition. + * @slab_config: The slab configuration of the VDO. + * @origin: The first block of the slab. 
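+ *
+ * The slab journal is located immediately after the slab's data blocks and
+ * reference count blocks.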
+ */ static inline physical_block_number_t __must_check -get_vdo_slab_journal_start_block(const struct slab_config *slab_config, +vdo_get_slab_journal_start_block(const struct slab_config *slab_config, physical_block_number_t origin) { return origin + slab_config->data_blocks @@ -144,13 +128,13 @@ get_vdo_slab_journal_start_block(const struct slab_config *slab_config, } /** - * Generate the packed representation of a slab block header. - * - * @param header The header containing the values to encode - * @param packed The header into which to pack the values - **/ + * vdo_pack_slab_journal_block_header() - Generate the packed representation + * of a slab block header. + * @header: The header containing the values to encode. + * @packed: The header into which to pack the values. + */ static inline void -pack_vdo_slab_journal_block_header(const struct slab_journal_block_header *header, +vdo_pack_slab_journal_block_header(const struct slab_journal_block_header *header, struct packed_slab_journal_block_header *packed) { packed->head = __cpu_to_le64(header->head); @@ -160,42 +144,35 @@ pack_vdo_slab_journal_block_header(const struct slab_journal_block_header *heade packed->metadata_type = header->metadata_type; packed->has_block_map_increments = header->has_block_map_increments; - pack_vdo_journal_point(&header->recovery_point, + vdo_pack_journal_point(&header->recovery_point, &packed->recovery_point); } /** - * Decode the packed representation of a slab journal entry. + * vdo_unpack_slab_journal_entry() - Decode the packed representation of a + * slab journal entry. + * @packed: The packed entry to decode. * - * @param packed The packed entry to decode - * - * @return The decoded slab journal entry - **/ + * Return: The decoded slab journal entry. + */ static inline struct slab_journal_entry __must_check -unpack_vdo_slab_journal_entry(const packed_slab_journal_entry *packed) +vdo_unpack_slab_journal_entry(const packed_slab_journal_entry *packed) { struct slab_journal_entry entry; + entry.sbn = packed->offset_high7; entry.sbn <<= 8; entry.sbn |= packed->offset_mid8; entry.sbn <<= 8; entry.sbn |= packed->offset_low8; - entry.operation = - (packed->increment ? DATA_INCREMENT : DATA_DECREMENT); + entry.operation = (packed->increment ? VDO_JOURNAL_DATA_INCREMENT + : VDO_JOURNAL_DATA_DECREMENT); return entry; } -/** - * Decode a slab journal entry. - * - * @param block The journal block holding the entry - * @param entry_count The number of the entry - * - * @return The decoded entry - **/ struct slab_journal_entry __must_check -decode_vdo_slab_journal_entry(struct packed_slab_journal_block *block, +vdo_decode_slab_journal_entry(struct packed_slab_journal_block *block, journal_entry_count_t entry_count); -#endif // SLAB_JOURNAL_FORMAT_H +#endif /* SLAB_JOURNAL_FORMAT_H */ diff --git a/vdo/slabJournal.c b/vdo/slab-journal.c similarity index 55% rename from vdo/slabJournal.c rename to vdo/slab-journal.c index 3a90789f..108dd686 100644 --- a/vdo/slabJournal.c +++ b/vdo/slab-journal.c @@ -1,42 +1,36 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * Copyright Red Hat - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. 
- * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/sulfur/src/c++/vdo/base/slabJournal.c#37 $ */ -#include "slabJournalInternals.h" +#include "slab-journal.h" + +#include #include "logger.h" -#include "memoryAlloc.h" +#include "memory-alloc.h" #include "permassert.h" -#include "stringUtils.h" - -#include "adminState.h" -#include "blockAllocatorInternals.h" -#include "dataVIO.h" -#include "recoveryJournal.h" -#include "refCounts.h" -#include "slabDepot.h" -#include "slabDepotInternals.h" -#include "slabSummary.h" +#include "string-utils.h" + +#include "admin-state.h" +#include "block-allocator.h" +#include "data-vio.h" +#include "io-submitter.h" +#include "recovery-journal.h" +#include "ref-counts.h" +#include "slab-depot.h" +#include "slab-summary.h" #include "vdo.h" +#include "vio.h" -/**********************************************************************/ +/** + * vdo_slab_journal_from_dirty_entry() - Obtain a pointer to a slab_journal + * structure from a pointer to the dirty + * list entry field within it. + * @entry: The list entry to convert. + * + * Return: The entry as a slab_journal. + */ struct slab_journal *vdo_slab_journal_from_dirty_entry(struct list_head *entry) { if (entry == NULL) { @@ -46,103 +40,106 @@ struct slab_journal *vdo_slab_journal_from_dirty_entry(struct list_head *entry) } /** - * Get the physical block number for a given sequence number. + * get_block_number() - Get the physical block number for a given sequence + * number. + * @journal: The journal. + * @sequence: The sequence number of the desired block. * - * @param journal The journal - * @param sequence The sequence number of the desired block - * - * @return the block number corresponding to the sequence number - **/ + * Return: The block number corresponding to the sequence number. + */ static inline physical_block_number_t __must_check get_block_number(struct slab_journal *journal, sequence_number_t sequence) { - tail_block_offset_t offset = get_vdo_slab_journal_block_offset(journal, + tail_block_offset_t offset = vdo_get_slab_journal_block_offset(journal, sequence); return (journal->slab->journal_origin + offset); } /** - * Get the lock object for a slab journal block by sequence number. - * - * @param journal vdo_slab journal to retrieve from - * @param sequence_number Sequence number of the block + * get_lock() - Get the lock object for a slab journal block by sequence + * number. + * @journal: vdo_slab journal to retrieve from. + * @sequence_number: Sequence number of the block. * - * @return the lock object for the given sequence number - **/ + * Return: The lock object for the given sequence number. + */ static inline struct journal_lock * __must_check get_lock(struct slab_journal *journal, sequence_number_t sequence_number) { tail_block_offset_t offset = - get_vdo_slab_journal_block_offset(journal, sequence_number); + vdo_get_slab_journal_block_offset(journal, sequence_number); return &journal->locks[offset]; } /** - * Check whether the VDO is in read-only mode. 
- * - * @param journal The journal whose owning VDO should be checked + * is_vdo_read_only() - Check whether the VDO is in read-only mode. + * @journal: The journal whose owning VDO should be checked. * - * @return true if the VDO is in read-only mode - **/ + * Return: true if the VDO is in read-only mode. + */ static inline bool __must_check is_vdo_read_only(struct slab_journal *journal) { return vdo_is_read_only(journal->slab->allocator->read_only_notifier); } /** - * Check whether there are entry waiters which should delay a flush. + * must_make_entries_to_flush() - Check whether there are entry waiters which + * should delay a flush. + * @journal: The journal to check. * - * @param journal The journal to check - * - * @return true if there are no entry waiters, or if the slab - * is unrecovered - **/ + * Return: true if there are no entry waiters, or if the slab is unrecovered. + */ static inline bool __must_check must_make_entries_to_flush(struct slab_journal *journal) { - return (!is_vdo_slab_rebuilding(journal->slab) && + return (!vdo_is_slab_rebuilding(journal->slab) && has_waiters(&journal->entry_waiters)); } /** - * Check whether a reap is currently in progress. - * - * @param journal The journal which may be reaping + * is_reaping() - Check whether a reap is currently in progress. + * @journal: The journal which may be reaping. * - * @return true if the journal is reaping - **/ + * Return: true if the journal is reaping. + */ static inline bool __must_check is_reaping(struct slab_journal *journal) { return (journal->head != journal->unreapable); } -/**********************************************************************/ -bool is_vdo_slab_journal_active(struct slab_journal *journal) +/** + * vdo_is_slab_journal_active() - Check whether a slab journal is active. + * @journal: The slab journal to check. + * + * Return: true if the journal is active. + */ +bool vdo_is_slab_journal_active(struct slab_journal *journal) { - return (must_make_entries_to_flush(journal) || is_reaping(journal) || + return (must_make_entries_to_flush(journal) || + is_reaping(journal) || journal->waiting_to_commit || !list_empty(&journal->uncommitted_blocks) || journal->updating_slab_summary); } /** - * Initialize tail block as a new block. - * - * @param journal The journal whose tail block is being initialized - **/ + * initialize_tail_block() - Initialize tail block as a new block. + * @journal: The journal whose tail block is being initialized. + */ static void initialize_tail_block(struct slab_journal *journal) { struct slab_journal_block_header *header = &journal->tail_header; + header->sequence_number = journal->tail; header->entry_count = 0; header->has_block_map_increments = false; } /** - * Set all journal fields appropriately to start journaling. - * - * @param journal The journal to be reset, based on its tail sequence number - **/ + * initialize_journal_state() - Set all journal fields appropriately to start + * journaling. + * @journal: The journal to be reset, based on its tail sequence number. + */ static void initialize_journal_state(struct slab_journal *journal) { journal->unreapable = journal->head; @@ -153,34 +150,41 @@ static void initialize_journal_state(struct slab_journal *journal) } /** - * Check whether a journal block is full. + * block_is_full() - Check whether a journal block is full. + * @journal: The slab journal for the block. * - * @param journal The slab journal for the block - * - * @return true if the tail block is full - **/ + * Return: true if the tail block is full. 
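+ * (A block holding block map increments has a smaller capacity, so the
+ * matching per-block entry limit is checked.)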
+ */ static bool __must_check block_is_full(struct slab_journal *journal) { journal_entry_count_t count = journal->tail_header.entry_count; + return (journal->tail_header.has_block_map_increments ? (journal->full_entries_per_block == count) : (journal->entries_per_block == count)); } -/**********************************************************************/ static void add_entries(struct slab_journal *journal); static void update_tail_block_location(struct slab_journal *journal); static void release_journal_locks(struct waiter *waiter, void *context); -/**********************************************************************/ -int make_vdo_slab_journal(struct block_allocator *allocator, +/** + * vdo_make_slab_journal() - Create a slab journal. + * @allocator: The block allocator which owns this journal. + * @slab: The parent slab of the journal. + * @recovery_journal: The recovery journal of the VDO. + * @journal_ptr: The pointer to hold the new slab journal. + * + * Return: VDO_SUCCESS or error code. + */ +int vdo_make_slab_journal(struct block_allocator *allocator, struct vdo_slab *slab, struct recovery_journal *recovery_journal, struct slab_journal **journal_ptr) { struct slab_journal *journal; const struct slab_config *slab_config = - get_vdo_slab_config(allocator->depot); + vdo_get_slab_config(allocator->depot); int result = UDS_ALLOCATE_EXTENDED(struct slab_journal, slab_config->slab_journal_blocks, struct journal_lock, @@ -202,13 +206,15 @@ int make_vdo_slab_journal(struct block_allocator *allocator, journal->full_entries_per_block = VDO_SLAB_JOURNAL_FULL_ENTRIES_PER_BLOCK; journal->events = &allocator->slab_journal_statistics; journal->recovery_journal = recovery_journal; - journal->summary = get_vdo_slab_summary_zone(allocator); + journal->summary = allocator->summary; journal->tail = 1; journal->head = 1; journal->flushing_deadline = journal->flushing_threshold; - // Set there to be some time between the deadline and the blocking - // threshold, so that hopefully all are done before blocking. + /* + * Set there to be some time between the deadline and the blocking + * threshold, so that hopefully all are done before blocking. + */ if ((journal->blocking_threshold - journal->flushing_threshold) > 5) { journal->flushing_deadline = journal->blocking_threshold - 5; } @@ -220,7 +226,7 @@ int make_vdo_slab_journal(struct block_allocator *allocator, "struct packed_slab_journal_block", (char **)&journal->block); if (result != VDO_SUCCESS) { - free_vdo_slab_journal(journal); + vdo_free_slab_journal(journal); return result; } @@ -235,8 +241,11 @@ int make_vdo_slab_journal(struct block_allocator *allocator, return VDO_SUCCESS; } -/**********************************************************************/ -void free_vdo_slab_journal(struct slab_journal *journal) +/** + * vdo_free_slab_journal() - Free a slab journal. + * @journal: The slab journal to free. + */ +void vdo_free_slab_journal(struct slab_journal *journal) { if (journal == NULL) { return; @@ -246,32 +255,46 @@ void free_vdo_slab_journal(struct slab_journal *journal) UDS_FREE(journal); } -/**********************************************************************/ -bool is_vdo_slab_journal_blank(const struct slab_journal *journal) +/** + * vdo_is_slab_journal_blank() - Check whether a slab journal is blank. + * @journal: The journal to query. + * + * A slab journal is blank if it has never had any entries recorded in it. + * + * Return: true if the slab journal has never been modified. 
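+ * (Such a journal still has its initial tail of 1 and an empty tail block.)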
+ */ +bool vdo_is_slab_journal_blank(const struct slab_journal *journal) { return ((journal != NULL) && (journal->tail == 1) && (journal->tail_header.entry_count == 0)); } -/**********************************************************************/ -bool is_vdo_slab_journal_dirty(const struct slab_journal *journal) +/** + * vdo_is_slab_journal_dirty() - Check whether the slab journal is on the + * block allocator's list of dirty journals. + * @journal: The journal to query. + * + * Return: true if the journal has been added to the dirty list. + */ +static bool +vdo_is_slab_journal_dirty(const struct slab_journal *journal) { return (journal->recovery_lock != 0); } /** - * Put a slab journal on the dirty ring of its allocator in the correct order. - * - * @param journal The journal to be marked dirty - * @param lock The recovery journal lock held by the slab journal - **/ + * mark_slab_journal_dirty() - Put a slab journal on the dirty ring of its + * allocator in the correct order. + * @journal: The journal to be marked dirty. + * @lock: The recovery journal lock held by the slab journal. + */ static void mark_slab_journal_dirty(struct slab_journal *journal, sequence_number_t lock) { struct list_head *entry; struct list_head *dirty_list = &journal->slab->allocator->dirty_slab_journals; - ASSERT_LOG_ONLY(!is_vdo_slab_journal_dirty(journal), + ASSERT_LOG_ONLY(!vdo_is_slab_journal_dirty(journal), "slab journal was clean"); journal->recovery_lock = lock; @@ -286,7 +309,6 @@ static void mark_slab_journal_dirty(struct slab_journal *journal, list_move_tail(&journal->dirty_entry, entry->next); } -/**********************************************************************/ static void mark_slab_journal_clean(struct slab_journal *journal) { journal->recovery_lock = 0; @@ -294,92 +316,108 @@ static void mark_slab_journal_clean(struct slab_journal *journal) } /** - * Implements waiter_callback. This callback is invoked on all vios waiting - * to make slab journal entries after the VDO has gone into read-only mode. - **/ + * abort_waiter() - Abort vios waiting to make journal entries when read-only. + * + * This callback is invoked on all vios waiting to make slab journal entries + * after the VDO has gone into read-only mode. Implements waiter_callback. + */ static void abort_waiter(struct waiter *waiter, void *context __always_unused) { continue_data_vio(waiter_as_data_vio(waiter), VDO_READ_ONLY); } -/**********************************************************************/ -void abort_vdo_slab_journal_waiters(struct slab_journal *journal) +/** + * vdo_abort_slab_journal_waiters() - Abort any VIOs waiting to make slab + * journal entries. + * @journal: The journal to abort. + */ +void vdo_abort_slab_journal_waiters(struct slab_journal *journal) { ASSERT_LOG_ONLY((vdo_get_callback_thread_id() == journal->slab->allocator->thread_id), - "abort_vdo_slab_journal_waiters() called on correct thread"); + "vdo_abort_slab_journal_waiters() called on correct thread"); notify_all_waiters(&journal->entry_waiters, abort_waiter, journal); - check_if_vdo_slab_drained(journal->slab); + vdo_check_if_slab_drained(journal->slab); } /** - * Put the journal in read-only mode. All attempts to add entries after - * this function is called will fail. All vios waiting for to make entries - * will be awakened with an error. All flushes will complete as soon as all - * pending IO is done. + * enter_journal_read_only_mode() - Put the journal in read-only mode. + * @journal: The journal which has failed. 
+ * @error_code: The error result triggering this call. * - * @param journal The journal which has failed - * @param error_code The error result triggering this call - **/ + * All attempts to add entries after this function is called will fail. All + * vios waiting for to make entries will be awakened with an error. All + * flushes will complete as soon as all pending IO is done. + */ static void enter_journal_read_only_mode(struct slab_journal *journal, int error_code) { vdo_enter_read_only_mode(journal->slab->allocator->read_only_notifier, error_code); - abort_vdo_slab_journal_waiters(journal); + vdo_abort_slab_journal_waiters(journal); } /** - * Actually advance the head of the journal now that any necessary flushes - * are complete. - * - * @param journal The journal to be reaped - **/ + * finish_reaping() - Actually advance the head of the journal now that any + * necessary flushes are complete. + * @journal: The journal to be reaped. + */ static void finish_reaping(struct slab_journal *journal) { journal->head = journal->unreapable; add_entries(journal); - check_if_vdo_slab_drained(journal->slab); + vdo_check_if_slab_drained(journal->slab); } -/**********************************************************************/ static void reap_slab_journal(struct slab_journal *journal); /** - * Finish reaping now that we have flushed the lower layer and then try - * reaping again in case we deferred reaping due to an outstanding vio. - * - * @param completion The flush vio - **/ + * complete_reaping() - Finish reaping now that we have flushed the lower + * layer and then try reaping again in case we deferred + * reaping due to an outstanding vio. + * @completion: The flush vio. + */ static void complete_reaping(struct vdo_completion *completion) { struct vio_pool_entry *entry = completion->parent; struct slab_journal *journal = entry->parent; - return_vdo_block_allocator_vio(journal->slab->allocator, entry); + + vdo_return_block_allocator_vio(journal->slab->allocator, entry); finish_reaping(journal); reap_slab_journal(journal); } /** - * Handle an error flushing the lower layer. - * - * @param completion The flush vio - **/ + * handle_flush_error() - Handle an error flushing the lower layer. + * @completion: The flush vio. + */ static void handle_flush_error(struct vdo_completion *completion) { struct slab_journal *journal = ((struct vio_pool_entry *)completion->parent)->parent; + + record_metadata_io_error(as_vio(completion)); enter_journal_read_only_mode(journal, completion->result); complete_reaping(completion); } +static void flush_endio(struct bio *bio) +{ + struct vio *vio = bio->bi_private; + struct vio_pool_entry *entry = vio->completion.parent; + struct slab_journal *journal = entry->parent; + + continue_vio_after_io(vio, + complete_reaping, + journal->slab->allocator->thread_id); +} + /** - * A waiter callback for getting a vio with which to flush the lower - * layer prior to reaping. - * - * @param waiter The journal as a flush waiter - * @param vio_context The newly acquired flush vio - **/ + * flush_for_reaping() - A waiter callback for getting a vio with which to + * flush the lower layer prior to reaping. + * @waiter: The journal as a flush waiter. + * @vio_context: The newly acquired flush vio. 
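+ *
+ * The flush completes on the allocator's thread via flush_endio(), after
+ * which complete_reaping() returns the vio to the pool and advances the
+ * journal head.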
+ */ static void flush_for_reaping(struct waiter *waiter, void *vio_context) { struct slab_journal *journal = @@ -388,31 +426,31 @@ static void flush_for_reaping(struct waiter *waiter, void *vio_context) struct vio *vio = entry->vio; entry->parent = journal; - vio->completion.callback_thread_id = - journal->slab->allocator->thread_id; - launch_flush_vio(vio, complete_reaping, handle_flush_error); + submit_flush_vio(vio, flush_endio, handle_flush_error); } /** - * Conduct a reap on a slab journal to reclaim unreferenced blocks. - * - * @param journal The slab journal - **/ + * reap_slab_journal() - Conduct a reap on a slab journal to reclaim + * unreferenced blocks. + * @journal: The slab journal. + */ static void reap_slab_journal(struct slab_journal *journal) { bool reaped = false; int result; if (is_reaping(journal)) { - // We already have a reap in progress so wait for it to finish. + /* We already have a reap in progress so wait for it to finish. */ return; } - if (is_unrecovered_vdo_slab(journal->slab) || - !is_vdo_state_normal(&journal->slab->state) || + if (vdo_is_unrecovered_slab(journal->slab) || + !vdo_is_state_normal(&journal->slab->state) || is_vdo_read_only(journal)) { - // We must not reap in the first two cases, and there's no - // point in read-only mode. + /* + * We must not reap in the first two cases, and there's no + * point in read-only mode. + */ return; } @@ -450,7 +488,7 @@ static void reap_slab_journal(struct slab_journal *journal) * (VDO-2912). */ journal->flush_waiter.callback = flush_for_reaping; - result = acquire_vdo_block_allocator_vio(journal->slab->allocator, + result = vdo_acquire_block_allocator_vio(journal->slab->allocator, &journal->flush_waiter); if (result != VDO_SUCCESS) { enter_journal_read_only_mode(journal, result); @@ -459,24 +497,28 @@ static void reap_slab_journal(struct slab_journal *journal) } /** - * This is the callback invoked after a slab summary update completes. It - * is registered in the constructor on behalf of update_tail_block_location(). + * release_journal_locks() - Callback invoked after a slab summary update + * completes. + * @waiter: The slab summary waiter that has just been notified. + * @context: The result code of the update. * - * Implements waiter_callback. + * Registered in the constructor on behalf of update_tail_block_location(). * - * @param waiter The slab summary waiter that has just been notified - * @param context The result code of the update - **/ + * Implements waiter_callback. + */ static void release_journal_locks(struct waiter *waiter, void *context) { sequence_number_t first, i; struct slab_journal *journal = container_of(waiter, struct slab_journal, slab_summary_waiter); int result = *((int *)context); + if (result != VDO_SUCCESS) { if (result != VDO_READ_ONLY) { - // Don't bother logging what might be lots of errors if - // we are already in read-only mode. + /* + * Don't bother logging what might be lots of errors if + * we are already in read-only mode. + */ uds_log_error_strerror(result, "failed slab summary update %llu", (unsigned long long) journal->summarized); @@ -496,36 +538,40 @@ static void release_journal_locks(struct waiter *waiter, void *context) first = journal->last_summarized; journal->last_summarized = journal->summarized; for (i = journal->summarized - 1; i >= first; i--) { - // Release the lock the summarized block held on the recovery - // journal. (During replay, recovery_start will always be 0.) + /* + * Release the lock the summarized block held on the recovery + * journal. 
(During replay, recovery_start will always be 0.) + */ if (journal->recovery_journal != NULL) { zone_count_t zone_number = journal->slab->allocator->zone_number; - release_vdo_recovery_journal_block_reference(journal->recovery_journal, + vdo_release_recovery_journal_block_reference(journal->recovery_journal, get_lock(journal, i)->recovery_start, - ZONE_TYPE_PHYSICAL, - zone_number); + VDO_ZONE_TYPE_PHYSICAL, + zone_number); } - // Release our own lock against reaping for blocks that are - // committed. (This function will not change locks during - // replay.) - adjust_vdo_slab_journal_block_reference(journal, i, -1); + /* + * Release our own lock against reaping for blocks that are + * committed. (This function will not change locks during + * replay.) + */ + vdo_adjust_slab_journal_block_reference(journal, i, -1); } journal->updating_slab_summary = false; reap_slab_journal(journal); - // Check if the slab summary needs to be updated again. + /* Check if the slab summary needs to be updated again. */ update_tail_block_location(journal); } /** - * Update the tail block location in the slab summary, if necessary. - * - * @param journal The slab journal that is updating its tail block location - **/ + * update_tail_block_location() - Update the tail block location in the slab + * summary, if necessary. + * @journal: The slab journal that is updating its tail block location. + */ static void update_tail_block_location(struct slab_journal *journal) { block_count_t free_block_count; @@ -533,14 +579,14 @@ static void update_tail_block_location(struct slab_journal *journal) if (journal->updating_slab_summary || is_vdo_read_only(journal) || (journal->last_summarized >= journal->next_commit)) { - check_if_vdo_slab_drained(journal->slab); + vdo_check_if_slab_drained(journal->slab); return; } - if (is_unrecovered_vdo_slab(journal->slab)) { + if (vdo_is_unrecovered_slab(journal->slab)) { free_block_count = - get_summarized_free_block_count(journal->summary, - journal->slab->slab_number); + vdo_get_summarized_free_block_count(journal->summary, + journal->slab->slab_number); } else { free_block_count = get_slab_free_block_count(journal->slab); } @@ -556,7 +602,7 @@ static void update_tail_block_location(struct slab_journal *journal) * has reaped past sequence number 1. */ block_offset = - get_vdo_slab_journal_block_offset(journal, journal->summarized); + vdo_get_slab_journal_block_offset(journal, journal->summarized); vdo_update_slab_summary_entry(journal->summary, &journal->slab_summary_waiter, journal->slab->slab_number, @@ -566,8 +612,12 @@ static void update_tail_block_location(struct slab_journal *journal) free_block_count); } -/**********************************************************************/ -void reopen_vdo_slab_journal(struct slab_journal *journal) +/** + * vdo_reopen_slab_journal() - Reopen a slab journal by emptying it and then + * adding any pending entries. + * @journal: The journal to reopen. + */ +void vdo_reopen_slab_journal(struct slab_journal *journal) { sequence_number_t block; @@ -576,7 +626,7 @@ void reopen_vdo_slab_journal(struct slab_journal *journal) journal->head = journal->tail; initialize_journal_state(journal); - // Ensure no locks are spuriously held on an empty journal. + /* Ensure no locks are spuriously held on an empty journal. 
*/ for (block = 1; block <= journal->size; block++) { ASSERT_LOG_ONLY((get_lock(journal, block)->count == 0), "Scrubbed journal's block %llu is not locked", @@ -586,31 +636,32 @@ void reopen_vdo_slab_journal(struct slab_journal *journal) add_entries(journal); } -/**********************************************************************/ static sequence_number_t get_committing_sequence_number(const struct vio_pool_entry *entry) { const struct packed_slab_journal_block *block = entry->buffer; + return __le64_to_cpu(block->header.sequence_number); } /** - * Handle post-commit processing. This is the callback registered by - * write_slab_journal_block(). + * complete_write() - Handle post-commit processing. + * @completion: The write vio as a completion. * - * @param completion The write vio as a completion - **/ + * This is the callback registered by write_slab_journal_block(). + */ static void complete_write(struct vdo_completion *completion) { int write_result = completion->result; struct vio_pool_entry *entry = completion->parent; struct slab_journal *journal = entry->parent; - sequence_number_t committed = get_committing_sequence_number(entry); + list_del_init(&entry->available_entry); - return_vdo_block_allocator_vio(journal->slab->allocator, entry); + vdo_return_block_allocator_vio(journal->slab->allocator, entry); if (write_result != VDO_SUCCESS) { + record_metadata_io_error(as_vio(completion)); uds_log_error_strerror(write_result, "cannot write slab journal block %llu", (unsigned long long) committed); @@ -622,12 +673,16 @@ static void complete_write(struct vdo_completion *completion) journal->events->blocks_written + 1); if (list_empty(&journal->uncommitted_blocks)) { - // If no blocks are outstanding, then the commit point is at - // the tail. + /* + * If no blocks are outstanding, then the commit point is at + * the tail. + */ journal->next_commit = journal->tail; } else { - // The commit point is always the beginning of the oldest - // incomplete block. + /* + * The commit point is always the beginning of the oldest + * incomplete block. + */ struct vio_pool_entry *oldest = as_vio_pool_entry(journal->uncommitted_blocks.next); journal->next_commit = get_committing_sequence_number(oldest); @@ -636,13 +691,25 @@ static void complete_write(struct vdo_completion *completion) update_tail_block_location(journal); } +static void write_slab_journal_endio(struct bio *bio) +{ + struct vio *vio = bio->bi_private; + struct vio_pool_entry *entry = vio->completion.parent; + struct slab_journal *journal = entry->parent; + + continue_vio_after_io(vio, + complete_write, + journal->slab->allocator->thread_id); +} + /** - * Callback from acquire_vdo_block_allocator_vio() registered in - * commit_vdo_slab_journal_tail(). + * write_slab_journal_block() - Write a slab journal block. + * @waiter: The vio pool waiter which was just notified. + * @vio_context: The vio pool entry for the write. * - * @param waiter The vio pool waiter which was just notified - * @param vio_context The vio pool entry for the write - **/ + * Callback from vdo_acquire_block_allocator_vio() registered in + * commit_tail(). 
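+ *
+ * The tail block header is packed, the block is copied into the pooled
+ * vio's buffer, and per-entry locks for any unused entries are released
+ * before the write is submitted.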
+ */ static void write_slab_journal_block(struct waiter *waiter, void *vio_context) { struct slab_journal *journal = @@ -655,43 +722,49 @@ static void write_slab_journal_block(struct waiter *waiter, void *vio_context) header->head = journal->head; list_move_tail(&entry->available_entry, &journal->uncommitted_blocks); - pack_vdo_slab_journal_block_header(header, &journal->block->header); + vdo_pack_slab_journal_block_header(header, &journal->block->header); - // Copy the tail block into the vio. + /* Copy the tail block into the vio. */ memcpy(entry->buffer, journal->block, VDO_BLOCK_SIZE); ASSERT_LOG_ONLY(unused_entries >= 0, "vdo_slab journal block is not overfull"); if (unused_entries > 0) { - // Release the per-entry locks for any unused entries in the - // block we are about to write. - adjust_vdo_slab_journal_block_reference(journal, + /* + * Release the per-entry locks for any unused entries in the + * block we are about to write. + */ + vdo_adjust_slab_journal_block_reference(journal, header->sequence_number, -unused_entries); journal->partial_write_in_progress = !block_is_full(journal); } block_number = get_block_number(journal, header->sequence_number); - entry->parent = journal; - entry->vio->completion.callback_thread_id = - journal->slab->allocator->thread_id; + /* * This block won't be read in recovery until the slab summary is * updated to refer to it. The slab summary update does a flush which * is sufficient to protect us from VDO-2331. */ - launch_write_metadata_vio(entry->vio, block_number, complete_write, - complete_write); + submit_metadata_vio(entry->vio, + block_number, + write_slab_journal_endio, + complete_write, + REQ_OP_WRITE); - // Since the write is submitted, the tail block structure can be reused. + /* + * Since the write is submitted, the tail block structure can be + * reused. + */ journal->tail++; initialize_tail_block(journal); journal->waiting_to_commit = false; - operation = get_vdo_admin_state_code(&journal->slab->state); + operation = vdo_get_admin_state_code(&journal->slab->state); if (operation == VDO_ADMIN_STATE_WAITING_FOR_RECOVERY) { - finish_vdo_operation(&journal->slab->state, + vdo_finish_operation(&journal->slab->state, (is_vdo_read_only(journal) ? VDO_READ_ONLY : VDO_SUCCESS)); return; @@ -700,23 +773,30 @@ static void write_slab_journal_block(struct waiter *waiter, void *vio_context) add_entries(journal); } -/**********************************************************************/ -void commit_vdo_slab_journal_tail(struct slab_journal *journal) +/** + * commit_tail() - Commit the tail block of the slab journal. + * @journal: The journal whose tail block should be committed. + */ +static void commit_tail(struct slab_journal *journal) { int result; if ((journal->tail_header.entry_count == 0) && must_make_entries_to_flush(journal)) { - // There are no entries at the moment, but there are some - // waiters, so defer initiating the flush until those entries - // are ready to write. + /* + * There are no entries at the moment, but there are some + * waiters, so defer initiating the flush until those entries + * are ready to write. + */ return; } if (is_vdo_read_only(journal) || journal->waiting_to_commit || (journal->tail_header.entry_count == 0)) { - // There is nothing to do since the tail block is empty, or - // writing, or the journal is in read-only mode. + /* + * There is nothing to do since the tail block is empty, or + * writing, or the journal is in read-only mode. 
+ */ return; } @@ -730,7 +810,7 @@ void commit_vdo_slab_journal_tail(struct slab_journal *journal) journal->waiting_to_commit = true; journal->resource_waiter.callback = write_slab_journal_block; - result = acquire_vdo_block_allocator_vio(journal->slab->allocator, + result = vdo_acquire_block_allocator_vio(journal->slab->allocator, &journal->resource_waiter); if (result != VDO_SUCCESS) { journal->waiting_to_commit = false; @@ -739,14 +819,24 @@ void commit_vdo_slab_journal_tail(struct slab_journal *journal) } } -/**********************************************************************/ -void encode_vdo_slab_journal_entry(struct slab_journal_block_header *tail_header, - slab_journal_payload *payload, - slab_block_number sbn, - enum journal_operation operation) +/** + * vdo_encode_slab_journal_entry() - Encode a slab journal entry. + * @tail_header: The unpacked header for the block. + * @payload: The journal block payload to hold the entry. + * @sbn: The slab block number of the entry to encode. + * @operation: The type of the entry. + * + * Exposed for unit tests. + */ +static void +vdo_encode_slab_journal_entry(struct slab_journal_block_header *tail_header, + slab_journal_payload *payload, + slab_block_number sbn, + enum journal_operation operation) { journal_entry_count_t entry_number = tail_header->entry_count++; - if (operation == BLOCK_MAP_INCREMENT) { + + if (operation == VDO_JOURNAL_BLOCK_MAP_INCREMENT) { if (!tail_header->has_block_map_increments) { memset(payload->full_entries.entry_types, 0, @@ -758,20 +848,21 @@ void encode_vdo_slab_journal_entry(struct slab_journal_block_header *tail_header ((byte)1 << (entry_number % 8)); } - pack_vdo_slab_journal_entry(&payload->entries[entry_number], + vdo_pack_slab_journal_entry(&payload->entries[entry_number], sbn, - is_vdo_journal_increment_operation(operation)); + vdo_is_journal_increment_operation(operation)); } /** - * Actually add an entry to the slab journal, potentially firing off a write - * if a block becomes full. This function is synchronous. + * add_entry() - Actually add an entry to the slab journal, potentially firing + * off a write if a block becomes full. + * @journal: The slab journal to append to. + * @pbn: The pbn being adjusted. + * @operation: The type of entry to make. + * @recovery_point: The recovery journal point for this entry. * - * @param journal The slab journal to append to - * @param pbn The pbn being adjusted - * @param operation The type of entry to make - * @param recovery_point The recovery journal point for this entry - **/ + * This function is synchronous. 
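+ * If the tail block becomes full as a result, commit_tail() is called to
+ * begin writing it out.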
+ */ static void add_entry(struct slab_journal *journal, physical_block_number_t pbn, enum journal_operation operation, @@ -780,7 +871,7 @@ static void add_entry(struct slab_journal *journal, struct packed_slab_journal_block *block = journal->block; int result = - ASSERT(before_vdo_journal_point(&journal->tail_header.recovery_point, + ASSERT(vdo_before_journal_point(&journal->tail_header.recovery_point, recovery_point), "recovery journal point is monotonically increasing, recovery point: %llu.%u, block recovery point: %llu.%u", (unsigned long long) recovery_point->sequence_number, @@ -792,7 +883,7 @@ static void add_entry(struct slab_journal *journal, return; } - if (operation == BLOCK_MAP_INCREMENT) { + if (operation == VDO_JOURNAL_BLOCK_MAP_INCREMENT) { result = ASSERT_LOG_ONLY((journal->tail_header.entry_count < journal->full_entries_per_block), "block has room for full entries"); @@ -802,18 +893,29 @@ static void add_entry(struct slab_journal *journal, } } - encode_vdo_slab_journal_entry(&journal->tail_header, + vdo_encode_slab_journal_entry(&journal->tail_header, &block->payload, pbn - journal->slab->start, operation); journal->tail_header.recovery_point = *recovery_point; if (block_is_full(journal)) { - commit_vdo_slab_journal_tail(journal); + commit_tail(journal); } } -/**********************************************************************/ -bool attempt_replay_into_vdo_slab_journal(struct slab_journal *journal, +/** + * vdo_attempt_replay_into_slab_journal() - Attempt to replay a recovery + * journal entry into a slab journal. + * @journal: The slab journal to use. + * @pbn: The PBN for the entry. + * @operation: The type of entry to add. + * @recovery_point: The recovery journal point corresponding to this entry. + * @parent: The completion to notify when there is space to add the entry if + * the entry could not be added immediately. + * + * Return: true if the entry was added immediately. + */ +bool vdo_attempt_replay_into_slab_journal(struct slab_journal *journal, physical_block_number_t pbn, enum journal_operation operation, struct journal_point *recovery_point, @@ -821,22 +923,24 @@ bool attempt_replay_into_vdo_slab_journal(struct slab_journal *journal, { struct slab_journal_block_header *header = &journal->tail_header; - // Only accept entries after the current recovery point. - if (!before_vdo_journal_point(&journal->tail_header.recovery_point, + /* Only accept entries after the current recovery point. */ + if (!vdo_before_journal_point(&journal->tail_header.recovery_point, recovery_point)) { return true; } if ((header->entry_count >= journal->full_entries_per_block) && (header->has_block_map_increments || - (operation == BLOCK_MAP_INCREMENT))) { - // The tail block does not have room for the entry we are - // attempting to add so commit the tail block now. - commit_vdo_slab_journal_tail(journal); + (operation == VDO_JOURNAL_BLOCK_MAP_INCREMENT))) { + /* + * The tail block does not have room for the entry we are + * attempting to add so commit the tail block now. 
+ */ + commit_tail(journal); } if (journal->waiting_to_commit) { - start_vdo_operation_with_waiter(&journal->slab->state, + vdo_start_operation_with_waiter(&journal->slab->state, VDO_ADMIN_STATE_WAITING_FOR_RECOVERY, parent, NULL); @@ -854,52 +958,62 @@ bool attempt_replay_into_vdo_slab_journal(struct slab_journal *journal, journal->unreapable++; } - mark_vdo_slab_replaying(journal->slab); + vdo_mark_slab_replaying(journal->slab); add_entry(journal, pbn, operation, recovery_point); return true; } /** - * Check whether the journal should be saving reference blocks out. + * requires_flushing() - Check whether the journal should be saving reference + * blocks out. + * @journal: The journal to check. * - * @param journal The journal to check - * - * @return true if the journal should be requesting reference block writes - **/ + * Return: true if the journal should be requesting reference block writes. + */ static bool requires_flushing(const struct slab_journal *journal) { block_count_t journal_length = (journal->tail - journal->head); + return (journal_length >= journal->flushing_threshold); } /** - * Check whether the journal must be reaped before adding new entries. - * - * @param journal The journal to check + * requires_reaping() - Check whether the journal must be reaped before adding + * new entries. + * @journal: The journal to check. * - * @return true if the journal must be reaped - **/ + * Return: true if the journal must be reaped. + */ static bool requires_reaping(const struct slab_journal *journal) { block_count_t journal_length = (journal->tail - journal->head); + return (journal_length >= journal->blocking_threshold); } -/**********************************************************************/ +/** + * vdo_slab_journal_requires_scrubbing() - Check to see if the journal should + * be scrubbed. + * @journal: The slab journal. + * + * Return: true if the journal requires scrubbing. + */ bool vdo_slab_journal_requires_scrubbing(const struct slab_journal *journal) { block_count_t journal_length = (journal->tail - journal->head); + return (journal_length >= journal->scrubbing_threshold); } /** - * Implements waiter_callback. This callback is invoked by add_entries() once - * it has determined that we are ready to make another entry in the slab - * journal. + * add_entry_from_waiter() - Add an entry to the slab journal. + * @waiter: The vio which should make an entry now. + * @context: The slab journal to make an entry in. * - * @param waiter The vio which should make an entry now - * @param context The slab journal to make an entry in - **/ + * This callback is invoked by add_entries() once it has determined that we + * are ready to make another entry in the slab journal. Implements + * waiter_callback. + */ static void add_entry_from_waiter(struct waiter *waiter, void *context) { int result; @@ -924,19 +1038,22 @@ static void add_entry_from_waiter(struct waiter *waiter, void *context) if (journal->recovery_journal != NULL) { zone_count_t zone_number = journal->slab->allocator->zone_number; - acquire_vdo_recovery_journal_block_reference(journal->recovery_journal, + vdo_acquire_recovery_journal_block_reference(journal->recovery_journal, recovery_block, - ZONE_TYPE_PHYSICAL, + VDO_ZONE_TYPE_PHYSICAL, zone_number); } mark_slab_journal_dirty(journal, recovery_block); - // If the slab journal is over the first threshold, tell the - // ref_counts to write some reference blocks soon. 
+ /* + * If the slab journal is over the first threshold, tell the + * ref_counts to write some reference blocks soon. + */ if (requires_flushing(journal)) { block_count_t journal_length = (journal->tail - journal->head); block_count_t blocks_to_deadline = 0; + WRITE_ONCE(journal->events->flush_count, journal->events->flush_count + 1); if (journal_length <= journal->flushing_deadline) { @@ -953,68 +1070,77 @@ static void add_entry_from_waiter(struct waiter *waiter, void *context) data_vio->operation.type, &data_vio->recovery_journal_point); - // Now that an entry has been made in the slab journal, update the - // reference counts. - result = modify_vdo_slab_reference_count(journal->slab, + /* + * Now that an entry has been made in the slab journal, update the + * reference counts. + */ + result = vdo_modify_slab_reference_count(journal->slab, &slab_journal_point, data_vio->operation); continue_data_vio(data_vio, result); } /** - * Check whether the next entry to be made is a block map increment. - * - * @param journal The journal + * is_next_entry_a_block_map_increment() - Check whether the next entry to be + * made is a block map increment. + * @journal: The journal. * - * @return true if the first entry waiter's operation is a block - * map increment - **/ + * Return: true if the first entry waiter's operation is a block map increment. + */ static inline bool is_next_entry_a_block_map_increment(struct slab_journal *journal) { struct data_vio *data_vio = waiter_as_data_vio(get_first_waiter(&journal->entry_waiters)); - return (data_vio->operation.type == BLOCK_MAP_INCREMENT); + return (data_vio->operation.type == VDO_JOURNAL_BLOCK_MAP_INCREMENT); } /** - * Add as many entries as possible from the queue of vios waiting to make - * entries. By processing the queue in order, we ensure that slab journal - * entries are made in the same order as recovery journal entries for the - * same increment or decrement. + * add_entries() - Add as many entries as possible from the queue of vios + * waiting to make entries. + * @journal: The journal to which entries may be added. * - * @param journal The journal to which entries may be added - **/ + * By processing the queue in order, we ensure that slab journal entries are + * made in the same order as recovery journal entries for the same increment + * or decrement. + */ static void add_entries(struct slab_journal *journal) { if (journal->adding_entries) { - // Protect against re-entrancy. + /* Protect against re-entrancy. */ return; } journal->adding_entries = true; while (has_waiters(&journal->entry_waiters)) { struct slab_journal_block_header *header = &journal->tail_header; + if (journal->partial_write_in_progress || - is_vdo_slab_rebuilding(journal->slab)) { - // Don't add entries while rebuilding or while a - // partial write is outstanding (VDO-2399). + vdo_is_slab_rebuilding(journal->slab)) { + /* + * Don't add entries while rebuilding or while a + * partial write is outstanding (VDO-2399). + */ break; } if (journal->waiting_to_commit) { - // If we are waiting for resources to write the tail - // block, and the tail block is full, we can't make - // another entry. + /* + * If we are waiting for resources to write the tail + * block, and the tail block is full, we can't make + * another entry. 
+ */ WRITE_ONCE(journal->events->tail_busy_count, journal->events->tail_busy_count + 1); break; } else if (is_next_entry_a_block_map_increment(journal) && (header->entry_count >= journal->full_entries_per_block)) { - // The tail block does not have room for a block map - // increment, so commit it now. - commit_vdo_slab_journal_tail(journal); + /* + * The tail block does not have room for a block map + * increment, so commit it now. + */ + commit_tail(journal); if (journal->waiting_to_commit) { WRITE_ONCE(journal->events->tail_busy_count, journal->events->tail_busy_count @@ -1023,8 +1149,10 @@ static void add_entries(struct slab_journal *journal) } } - // If the slab is over the blocking threshold, make the vio - // wait. + /* + * If the slab is over the blocking threshold, make the vio + * wait. + */ if (requires_reaping(journal)) { WRITE_ONCE(journal->events->blocked_count, journal->events->blocked_count + 1); @@ -1035,9 +1163,11 @@ static void add_entries(struct slab_journal *journal) if (header->entry_count == 0) { struct journal_lock *lock = get_lock(journal, header->sequence_number); - // Check if the on disk slab journal is full. Because - // of the blocking and scrubbing thresholds, this - // should never happen. + /* + * Check if the on disk slab journal is full. Because + * of the blocking and scrubbing thresholds, this + * should never happen. + */ if (lock->count > 0) { ASSERT_LOG_ONLY((journal->head + journal->size) == journal->tail, @@ -1090,22 +1220,29 @@ static void add_entries(struct slab_journal *journal) journal->adding_entries = false; - // If there are no waiters, and we are flushing or saving, commit the - // tail block. - if (is_vdo_slab_draining(journal->slab) && - !is_vdo_state_suspending(&journal->slab->state) && + /* + * If there are no waiters, and we are flushing or saving, commit the + * tail block. + */ + if (vdo_is_slab_draining(journal->slab) && + !vdo_is_state_suspending(&journal->slab->state) && !has_waiters(&journal->entry_waiters)) { - commit_vdo_slab_journal_tail(journal); + commit_tail(journal); } } -/**********************************************************************/ -void add_vdo_slab_journal_entry(struct slab_journal *journal, +/** + * vdo_add_slab_journal_entry() - Add an entry to a slab journal. + * @journal: The slab journal to use. + * @data_vio: The data_vio for which to add the entry. + */ +void vdo_add_slab_journal_entry(struct slab_journal *journal, struct data_vio *data_vio) { + struct vdo_slab *slab = journal->slab; int result; - if (!is_vdo_slab_open(journal->slab)) { + if (!vdo_is_slab_open(slab)) { continue_data_vio(data_vio, VDO_INVALID_ADMIN_STATE); return; } @@ -1121,15 +1258,25 @@ void add_vdo_slab_journal_entry(struct slab_journal *journal, return; } - if (is_unrecovered_vdo_slab(journal->slab) && requires_reaping(journal)) { - increase_vdo_slab_scrubbing_priority(journal->slab); + if (vdo_is_unrecovered_slab(slab) && requires_reaping(journal)) { + struct slab_scrubber *scrubber + = slab->allocator->slab_scrubber; + vdo_register_slab_for_scrubbing(scrubber, slab, true); } add_entries(journal); } -/**********************************************************************/ -void adjust_vdo_slab_journal_block_reference(struct slab_journal *journal, +/** + * vdo_adjust_slab_journal_block_reference() - Adjust the reference count for + * a slab journal block. + * @journal: The slab journal. + * @sequence_number: The journal sequence number of the referenced block. + * @adjustment: Amount to adjust the reference counter. 
+ * + * Note that when the adjustment is negative, the slab journal will be reaped. + */ +void vdo_adjust_slab_journal_block_reference(struct slab_journal *journal, sequence_number_t sequence_number, int adjustment) { @@ -1139,8 +1286,8 @@ void adjust_vdo_slab_journal_block_reference(struct slab_journal *journal, return; } - if (is_replaying_vdo_slab(journal->slab)) { - // Locks should not be used during offline replay. + if (vdo_is_replaying_slab(journal->slab)) { + /* Locks should not be used during offline replay. */ return; } @@ -1149,7 +1296,7 @@ void adjust_vdo_slab_journal_block_reference(struct slab_journal *journal, if (adjustment < 0) { ASSERT_LOG_ONLY((-adjustment <= lock->count), "adjustment %d of lock count %u for slab journal block %llu must not underflow", - adjustment, lock->count, + adjustment, lock->count, (unsigned long long) sequence_number); } @@ -1159,7 +1306,18 @@ void adjust_vdo_slab_journal_block_reference(struct slab_journal *journal, } } -/**********************************************************************/ +/** + * vdo_release_recovery_journal_lock() - Request the slab journal to release + * the recovery journal lock it may hold + * on a specified recovery journal + * block. + * @journal: The slab journal. + * @recovery_lock: The sequence number of the recovery journal block whose + * locks should be released. + * + * Return: true if the journal does hold a lock on the specified + * block (which it will release). + */ bool vdo_release_recovery_journal_lock(struct slab_journal *journal, sequence_number_t recovery_lock) { @@ -1174,24 +1332,49 @@ bool vdo_release_recovery_journal_lock(struct slab_journal *journal, return false; } - // All locks are held by the block which is in progress; write it. - commit_vdo_slab_journal_tail(journal); + /* All locks are held by the block which is in progress; write it. */ + commit_tail(journal); return true; } -/**********************************************************************/ -void drain_vdo_slab_journal(struct slab_journal *journal) +/** + * vdo_resume_slab_journal() - Reset slab journal state, if necessary, for + * a suspend-resume cycle. + * @journal: The journal to reset + * + * After a successful save, any info about locks, journal blocks + * partially filled, etc., is out of date and should be reset. + **/ +void vdo_resume_slab_journal(struct slab_journal *journal) +{ + struct vdo *vdo = journal->slab->allocator->depot->vdo; + if ((vdo->suspend_type == VDO_ADMIN_STATE_SAVING) && + !is_vdo_read_only(journal)) { + vdo_reopen_slab_journal(journal); + } +} + +/** + * vdo_drain_slab_journal() - Drain slab journal I/O. + * @journal: The journal to drain. + * + * Depending upon the type of drain (as recorded in the journal's slab), any + * dirty journal blocks may be written out. + */ +void vdo_drain_slab_journal(struct slab_journal *journal) { const struct admin_state_code *code - = get_vdo_admin_state_code(&journal->slab->state); + = vdo_get_admin_state_code(&journal->slab->state); ASSERT_LOG_ONLY((vdo_get_callback_thread_id() == journal->slab->allocator->thread_id), - "drain_vdo_slab_journal() called on correct thread"); + "vdo_drain_slab_journal() called on correct thread"); if (code->quiescing) { - // XXX: we should revisit this assertion since it is no longer - // clear what it is for. - ASSERT_LOG_ONLY((!(is_vdo_slab_rebuilding(journal->slab) && + /* + * XXX: we should revisit this assertion since it is no longer + * clear what it is for. 
+ */ + ASSERT_LOG_ONLY((!(vdo_is_slab_rebuilding(journal->slab) && has_waiters(&journal->entry_waiters))), "slab is recovered or has no waiters"); } @@ -1202,30 +1385,31 @@ void drain_vdo_slab_journal(struct slab_journal *journal) return; } - commit_vdo_slab_journal_tail(journal); + commit_tail(journal); } /** - * Finish the decode process by returning the vio and notifying the slab that - * we're done. - * - * @param completion The vio as a completion - **/ + * finish_decoding_journal() - Finish the decode process by returning the vio + * and notifying the slab that we're done. + * @completion: The vio as a completion. + */ static void finish_decoding_journal(struct vdo_completion *completion) { int result = completion->result; struct vio_pool_entry *entry = completion->parent; struct slab_journal *journal = entry->parent; - return_vdo_block_allocator_vio(journal->slab->allocator, entry); - notify_vdo_slab_journal_is_loaded(journal->slab, result); + + vdo_return_block_allocator_vio(journal->slab->allocator, entry); + vdo_notify_slab_journal_is_loaded(journal->slab, result); } /** - * Set up the in-memory journal state to the state which was written to disk. + * set_decoded_state() - Set up the in-memory journal state to the state which + * was written to disk. + * @completion: The vio which was used to read the journal tail. + * * This is the callback registered in read_slab_journal_tail(). - * - * @param completion The vio which was used to read the journal tail - **/ + */ static void set_decoded_state(struct vdo_completion *completion) { struct vio_pool_entry *entry = completion->parent; @@ -1233,7 +1417,8 @@ static void set_decoded_state(struct vdo_completion *completion) struct packed_slab_journal_block *block = entry->buffer; struct slab_journal_block_header header; - unpack_vdo_slab_journal_block_header(&block->header, &header); + + vdo_unpack_slab_journal_block_header(&block->header, &header); if ((header.metadata_type != VDO_METADATA_SLAB_JOURNAL) || (header.nonce != journal->slab->allocator->nonce)) { @@ -1243,8 +1428,10 @@ static void set_decoded_state(struct vdo_completion *completion) journal->tail = header.sequence_number + 1; - // If the slab is clean, this implies the slab journal is empty, so - // advance the head appropriately. + /* + * If the slab is clean, this implies the slab journal is empty, so + * advance the head appropriately. + */ if (vdo_get_summarized_cleanliness(journal->summary, journal->slab->slab_number)) { journal->head = journal->tail; @@ -1257,14 +1444,32 @@ static void set_decoded_state(struct vdo_completion *completion) finish_decoding_journal(completion); } +static void read_slab_journal_tail_endio(struct bio *bio) +{ + struct vio *vio = bio->bi_private; + struct vio_pool_entry *entry = vio->completion.parent; + struct slab_journal *journal = entry->parent; + + continue_vio_after_io(vio, + set_decoded_state, + journal->slab->allocator->thread_id); +} + +static void handle_decode_error(struct vdo_completion *completion) +{ + record_metadata_io_error(as_vio(completion)); + finish_decoding_journal(completion); +} + /** - * This reads the slab journal tail block by using a vio acquired from the vio - * pool. This is the success callback from acquire_vdo_block_allocator_vio() - * when decoding the slab journal. + * read_slab_journal_tail() - Read the slab journal tail block by using a vio + * acquired from the vio pool. + * @waiter: The vio pool waiter which has just been notified. + * @vio_context: The vio pool entry given to the waiter. 
* - * @param waiter The vio pool waiter which has just been notified - * @param vio_context The vio pool entry given to the waiter - **/ + * This is the success callback from vdo_acquire_block_allocator_vio() when + * decoding the slab journal. + */ static void read_slab_journal_tail(struct waiter *waiter, void *vio_context) { struct slab_journal *journal = @@ -1274,9 +1479,11 @@ static void read_slab_journal_tail(struct waiter *waiter, void *vio_context) tail_block_offset_t last_commit_point = vdo_get_summarized_tail_block_offset(journal->summary, slab->slab_number); - // Slab summary keeps the commit point offset, so the tail block is - // the block before that. Calculation supports small journals in unit - // tests. + /* + * Slab summary keeps the commit point offset, so the tail block is + * the block before that. Calculation supports small journals in unit + * tests. + */ tail_block_offset_t tail_block = ((last_commit_point == 0) ? (tail_block_offset_t)(journal->size - 1) : (last_commit_point - 1)); @@ -1284,21 +1491,26 @@ static void read_slab_journal_tail(struct waiter *waiter, void *vio_context) entry->parent = journal; entry->vio->completion.callback_thread_id = slab->allocator->thread_id; - launch_read_metadata_vio(entry->vio, - slab->journal_origin + tail_block, - set_decoded_state, - finish_decoding_journal); + submit_metadata_vio(entry->vio, + slab->journal_origin + tail_block, + read_slab_journal_tail_endio, + handle_decode_error, + REQ_OP_READ); } -/**********************************************************************/ -void decode_vdo_slab_journal(struct slab_journal *journal) +/** + * vdo_decode_slab_journal() - Decode the slab journal by reading its tail. + * @journal: The journal to decode. + */ +void vdo_decode_slab_journal(struct slab_journal *journal) { struct vdo_slab *slab = journal->slab; tail_block_offset_t last_commit_point; int result; + ASSERT_LOG_ONLY((vdo_get_callback_thread_id() == journal->slab->allocator->thread_id), - "decode_vdo_slab_journal() called on correct thread"); + "vdo_decode_slab_journal() called on correct thread"); last_commit_point = vdo_get_summarized_tail_block_offset(journal->summary, slab->slab_number); @@ -1313,22 +1525,25 @@ void decode_vdo_slab_journal(struct slab_journal *journal) ASSERT_LOG_ONLY(((journal->size < 16) || (journal->scrubbing_threshold < (journal->size - 1))), "Scrubbing threshold protects against reads of unwritten slab journal blocks"); - notify_vdo_slab_journal_is_loaded(slab, VDO_SUCCESS); + vdo_notify_slab_journal_is_loaded(slab, VDO_SUCCESS); return; } journal->resource_waiter.callback = read_slab_journal_tail; - result = acquire_vdo_block_allocator_vio(slab->allocator, + result = vdo_acquire_block_allocator_vio(slab->allocator, &journal->resource_waiter); if (result != VDO_SUCCESS) { - notify_vdo_slab_journal_is_loaded(slab, result); + vdo_notify_slab_journal_is_loaded(slab, result); } } -/**********************************************************************/ -void dump_vdo_slab_journal(const struct slab_journal *journal) +/** + * vdo_dump_slab_journal() - Dump the slab journal. + * @journal: The slab journal to dump. 
+ */ +void vdo_dump_slab_journal(const struct slab_journal *journal) { - uds_log_info(" slab journal: entry_waiters=%zu waiting_to_commit=%s" " updating_slab_summary=%s head=%llu unreapable=%llu tail=%llu next_commit=%llu summarized=%llu last_summarized=%llu recovery_lock=%llu dirty=%s", + uds_log_info(" slab journal: entry_waiters=%zu waiting_to_commit=%s updating_slab_summary=%s head=%llu unreapable=%llu tail=%llu next_commit=%llu summarized=%llu last_summarized=%llu recovery_lock=%llu dirty=%s", count_waiters(&journal->entry_waiters), uds_bool_to_string(journal->waiting_to_commit), uds_bool_to_string(journal->updating_slab_summary), @@ -1339,8 +1554,10 @@ void dump_vdo_slab_journal(const struct slab_journal *journal) (unsigned long long) journal->summarized, (unsigned long long) journal->last_summarized, (unsigned long long) journal->recovery_lock, - uds_bool_to_string(is_vdo_slab_journal_dirty(journal))); - // Given the frequency with which the locks are just a tiny bit off, it - // might be worth dumping all the locks, but that might be too much - // logging. + uds_bool_to_string(vdo_is_slab_journal_dirty(journal))); + /* + * Given the frequency with which the locks are just a tiny bit off, it + * might be worth dumping all the locks, but that might be too much + * logging. + */ } diff --git a/vdo/slab-journal.h b/vdo/slab-journal.h new file mode 100644 index 00000000..28322c26 --- /dev/null +++ b/vdo/slab-journal.h @@ -0,0 +1,231 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright Red Hat + */ + +#ifndef SLAB_JOURNAL_H +#define SLAB_JOURNAL_H + +#include + +#include "numeric.h" + +#include "completion.h" +#include "journal-point.h" +#include "types.h" + +#include "block-allocator.h" +#include "block-map-entry.h" +#include "journal-point.h" +#include "slab.h" +#include "slab-journal-format.h" +#include "slab-summary.h" +#include "statistics.h" +#include "wait-queue.h" + +struct journal_lock { + uint16_t count; + sequence_number_t recovery_start; +}; + +struct slab_journal { + /* A waiter object for getting a VIO pool entry */ + struct waiter resource_waiter; + /* A waiter object for updating the slab summary */ + struct waiter slab_summary_waiter; + /* A waiter object for getting a vio with which to flush */ + struct waiter flush_waiter; + /* The queue of VIOs waiting to make an entry */ + struct wait_queue entry_waiters; + /* The parent slab reference of this journal */ + struct vdo_slab *slab; + + /* Whether a tail block commit is pending */ + bool waiting_to_commit; + /* Whether the journal is updating the slab summary */ + bool updating_slab_summary; + /* Whether the journal is adding entries from the entry_waiters queue */ + bool adding_entries; + /* Whether a partial write is in progress */ + bool partial_write_in_progress; + + /* The oldest block in the journal on disk */ + sequence_number_t head; + /* The oldest block in the journal which may not be reaped */ + sequence_number_t unreapable; + /* The end of the half-open interval of the active journal */ + sequence_number_t tail; + /* The next journal block to be committed */ + sequence_number_t next_commit; + /* The tail sequence number that is written in the slab summary */ + sequence_number_t summarized; + /* The tail sequence number that was last summarized in slab summary */ + sequence_number_t last_summarized; + + /* The sequence number of the recovery journal lock */ + sequence_number_t recovery_lock; + + /* + * The number of entries which fit in a single block. 
Can't use the + * constant because unit tests change this number. + */ + journal_entry_count_t entries_per_block; + /* + * The number of full entries which fit in a single block. Can't use + * the constant because unit tests change this number. + */ + journal_entry_count_t full_entries_per_block; + + /* The recovery journal of the VDO (slab journal holds locks on it) */ + struct recovery_journal *recovery_journal; + + /* The slab summary to update tail block location */ + struct slab_summary_zone *summary; + /* The statistics shared by all slab journals in our physical zone */ + struct slab_journal_statistics *events; + /* + * A list of the VIO pool entries for outstanding journal block writes + */ + struct list_head uncommitted_blocks; + + /* + * The current tail block header state. This will be packed into + * the block just before it is written. + */ + struct slab_journal_block_header tail_header; + /* A pointer to a block-sized buffer holding the packed block data */ + struct packed_slab_journal_block *block; + + /* The number of blocks in the on-disk journal */ + block_count_t size; + /* The number of blocks at which to start pushing reference blocks */ + block_count_t flushing_threshold; + /* + * The number of blocks at which all reference blocks should be writing + */ + block_count_t flushing_deadline; + /* + * The number of blocks at which to wait for reference blocks to write + */ + block_count_t blocking_threshold; + /* + * The number of blocks at which to scrub the slab before coming online + */ + block_count_t scrubbing_threshold; + + /* + * This list entry is for block_allocator to keep a queue of dirty + * journals + */ + struct list_head dirty_entry; + + /* The lock for the oldest unreaped block of the journal */ + struct journal_lock *reap_lock; + /* The locks for each on disk block */ + struct journal_lock locks[]; +}; + +/** + * vdo_pack_slab_journal_entry() - Generate the packed encoding of a + * slab journal entry. + * @packed: The entry into which to pack the values. + * @sbn: The slab block number of the entry to encode. + * @is_increment: The increment flag. + */ +static inline void vdo_pack_slab_journal_entry(packed_slab_journal_entry *packed, + slab_block_number sbn, + bool is_increment) +{ + packed->offset_low8 = (sbn & 0x0000FF); + packed->offset_mid8 = (sbn & 0x00FF00) >> 8; + packed->offset_high7 = (sbn & 0x7F0000) >> 16; + packed->increment = is_increment ? 1 : 0; +} + +/** + * vdo_unpack_slab_journal_block_header() - Decode the packed + * representation of a slab + * block header. + * @packed: The packed header to decode. + * @header: The header into which to unpack the values. 
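+ * + * The on-disk header fields are stored little-endian; the __le64_to_cpu and + * __le16_to_cpu conversions below yield a host-byte-order header on either + * endianness.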
+ */ +static inline void +vdo_unpack_slab_journal_block_header( + const struct packed_slab_journal_block_header *packed, + struct slab_journal_block_header *header) +{ + *header = (struct slab_journal_block_header) { + .head = __le64_to_cpu(packed->head), + .sequence_number = __le64_to_cpu(packed->sequence_number), + .nonce = __le64_to_cpu(packed->nonce), + .entry_count = __le16_to_cpu(packed->entry_count), + .metadata_type = packed->metadata_type, + .has_block_map_increments = packed->has_block_map_increments, + }; + vdo_unpack_journal_point(&packed->recovery_point, + &header->recovery_point); +} + +struct slab_journal * __must_check +vdo_slab_journal_from_dirty_entry(struct list_head *entry); + +int __must_check vdo_make_slab_journal(struct block_allocator *allocator, + struct vdo_slab *slab, + struct recovery_journal *recovery_journal, + struct slab_journal **journal_ptr); + +void vdo_free_slab_journal(struct slab_journal *journal); + +bool __must_check vdo_is_slab_journal_blank(const struct slab_journal *journal); + +bool __must_check vdo_is_slab_journal_active(struct slab_journal *journal); + +void vdo_abort_slab_journal_waiters(struct slab_journal *journal); + +void vdo_reopen_slab_journal(struct slab_journal *journal); + +bool __must_check +vdo_attempt_replay_into_slab_journal(struct slab_journal *journal, + physical_block_number_t pbn, + enum journal_operation operation, + struct journal_point *recovery_point, + struct vdo_completion *parent); + +void vdo_add_slab_journal_entry(struct slab_journal *journal, + struct data_vio *data_vio); + +void vdo_adjust_slab_journal_block_reference(struct slab_journal *journal, + sequence_number_t sequence_number, + int adjustment); + +bool __must_check +vdo_release_recovery_journal_lock(struct slab_journal *journal, + sequence_number_t recovery_lock); + +void vdo_drain_slab_journal(struct slab_journal *journal); + +void vdo_decode_slab_journal(struct slab_journal *journal); + +bool __must_check +vdo_slab_journal_requires_scrubbing(const struct slab_journal *journal); + +/** + * vdo_get_slab_journal_block_offset() - Get the slab journal block offset of + * the given sequence number. + * @journal: The slab journal. + * @sequence: The sequence number. + * + * Return: The offset corresponding to the sequence number. + */ +static inline tail_block_offset_t __must_check +vdo_get_slab_journal_block_offset(struct slab_journal *journal, + sequence_number_t sequence) +{ + return (sequence % journal->size); +} + +void vdo_resume_slab_journal(struct slab_journal *journal); + +void vdo_dump_slab_journal(const struct slab_journal *journal); + +#endif /* SLAB_JOURNAL_H */ diff --git a/vdo/slabScrubber.c b/vdo/slab-scrubber.c similarity index 50% rename from vdo/slabScrubber.c rename to vdo/slab-scrubber.c index c459aa41..5af59b92 100644 --- a/vdo/slabScrubber.c +++ b/vdo/slab-scrubber.c @@ -1,55 +1,41 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * Copyright Red Hat - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. 
- * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/sulfur/src/c++/vdo/base/slabScrubber.c#35 $ */ -#include "slabScrubberInternals.h" +#include "slab-scrubber.h" + +#include #include "logger.h" -#include "memoryAlloc.h" +#include "memory-alloc.h" #include "permassert.h" -#include "adminState.h" -#include "blockAllocator.h" +#include "admin-state.h" +#include "block-allocator.h" #include "constants.h" -#include "readOnlyNotifier.h" -#include "recoveryJournal.h" -#include "refCounts.h" -#include "refCountsInternals.h" +#include "io-submitter.h" +#include "read-only-notifier.h" +#include "recovery-journal.h" +#include "ref-counts.h" #include "slab.h" -#include "slabJournalInternals.h" +#include "slab-journal.h" #include "vdo.h" +#include "vio.h" /** - * Allocate the buffer and extent used for reading the slab journal when - * scrubbing a slab. + * allocate_vio_and_buffer() - Allocate the buffer and vio used for reading the + * slab journal when scrubbing a slab. + * @scrubber: The slab scrubber for which to allocate. + * @vdo: The VDO in which the scrubber resides. + * @slab_journal_size: The size of a slab journal. * - * @param scrubber The slab scrubber for which to allocate - * @param vdo The VDO in which the scrubber resides - * @param slab_journal_size The size of a slab journal - * - * @return VDO_SUCCESS or an error - **/ + * Return: VDO_SUCCESS or an error. + */ static int __must_check -allocate_extent_and_buffer(struct slab_scrubber *scrubber, - struct vdo *vdo, - block_count_t slab_journal_size) +allocate_vio_and_buffer(struct slab_scrubber *scrubber, + struct vdo *vdo, + block_count_t slab_journal_size) { size_t buffer_size = VDO_BLOCK_SIZE * slab_journal_size; int result = UDS_ALLOCATE(buffer_size, char, __func__, @@ -58,72 +44,84 @@ allocate_extent_and_buffer(struct slab_scrubber *scrubber, return result; } - return create_vdo_extent(vdo, - VIO_TYPE_SLAB_JOURNAL, - VIO_PRIORITY_METADATA, - slab_journal_size, - scrubber->journal_data, - &scrubber->extent); + return create_multi_block_metadata_vio(vdo, + VIO_TYPE_SLAB_JOURNAL, + VIO_PRIORITY_METADATA, + scrubber, + slab_journal_size, + scrubber->journal_data, + &scrubber->vio); } -/**********************************************************************/ -int make_vdo_slab_scrubber(struct vdo *vdo, +/** + * vdo_make_slab_scrubber() - Create a slab scrubber. + * @vdo: The VDO. + * @slab_journal_size: The size of a slab journal in blocks. + * @read_only_notifier: The context for entering read-only mode. + * @scrubber_ptr: A pointer to hold the scrubber. + * + * Return: VDO_SUCCESS or an error. 
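+ * + * On any failure the partially constructed scrubber is freed before + * returning, so the caller need not clean up; *scrubber_ptr is set only on + * success.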
+ */ +int vdo_make_slab_scrubber(struct vdo *vdo, block_count_t slab_journal_size, struct read_only_notifier *read_only_notifier, struct slab_scrubber **scrubber_ptr) { struct slab_scrubber *scrubber; int result = UDS_ALLOCATE(1, struct slab_scrubber, __func__, &scrubber); + if (result != VDO_SUCCESS) { return result; } - result = allocate_extent_and_buffer(scrubber, vdo, slab_journal_size); + result = allocate_vio_and_buffer(scrubber, vdo, slab_journal_size); if (result != VDO_SUCCESS) { - free_vdo_slab_scrubber(scrubber); + vdo_free_slab_scrubber(scrubber); return result; } - initialize_vdo_completion(&scrubber->completion, vdo, + vdo_initialize_completion(&scrubber->completion, vdo, VDO_SLAB_SCRUBBER_COMPLETION); INIT_LIST_HEAD(&scrubber->high_priority_slabs); INIT_LIST_HEAD(&scrubber->slabs); scrubber->read_only_notifier = read_only_notifier; - set_vdo_admin_state_code(&scrubber->admin_state, + vdo_set_admin_state_code(&scrubber->admin_state, VDO_ADMIN_STATE_SUSPENDED); *scrubber_ptr = scrubber; return VDO_SUCCESS; } /** - * Free the extent and buffer used for reading slab journals. - * - * @param scrubber The scrubber + * free_vio_and_buffer() - Free the vio and buffer used for reading slab + * journals. + * @scrubber: The scrubber. **/ -static void free_extent_and_buffer(struct slab_scrubber *scrubber) +static void free_vio_and_buffer(struct slab_scrubber *scrubber) { - free_vdo_extent(UDS_FORGET(scrubber->extent)); + free_vio(UDS_FORGET(scrubber->vio)); UDS_FREE(UDS_FORGET(scrubber->journal_data)); } -/**********************************************************************/ -void free_vdo_slab_scrubber(struct slab_scrubber *scrubber) +/** + * vdo_free_slab_scrubber() - Free a slab scrubber. + * @scrubber: The scrubber to destroy. + */ +void vdo_free_slab_scrubber(struct slab_scrubber *scrubber) { if (scrubber == NULL) { return; } - free_extent_and_buffer(scrubber); + free_vio_and_buffer(scrubber); UDS_FREE(scrubber); } /** - * Get the next slab to scrub. - * - * @param scrubber The slab scrubber + * get_next_slab() - Get the next slab to scrub. + * @scrubber: The slab scrubber. * - * @return The next slab to scrub or NULL if there are none - **/ + * Return: The next slab to scrub or NULL if there are none. + */ static struct vdo_slab *get_next_slab(struct slab_scrubber *scrubber) { if (!list_empty(&scrubber->high_priority_slabs)) { @@ -137,19 +135,35 @@ static struct vdo_slab *get_next_slab(struct slab_scrubber *scrubber) return NULL; } -/**********************************************************************/ -bool vdo_has_slabs_to_scrub(struct slab_scrubber *scrubber) +/** + * has_slabs_to_scrub() - Check whether a scrubber has slabs to scrub. + * @scrubber: The scrubber to check. + * + * Return: true if the scrubber has slabs to scrub. + */ +static bool __must_check has_slabs_to_scrub(struct slab_scrubber *scrubber) { return (get_next_slab(scrubber) != NULL); } -/**********************************************************************/ -slab_count_t get_scrubber_vdo_slab_count(const struct slab_scrubber *scrubber) +/** + * vdo_get_scrubber_slab_count() - Get the number of slabs that are + * unrecovered or being scrubbed. + * @scrubber: The scrubber to query. + * + * Return: The number of slabs that are unrecovered or being scrubbed. 
+ */ +slab_count_t vdo_get_scrubber_slab_count(const struct slab_scrubber *scrubber) { return READ_ONCE(scrubber->slab_count); } -/**********************************************************************/ +/** + * vdo_register_slab_for_scrubbing() - Register a slab with a scrubber. + * @scrubber: The scrubber. + * @slab: The slab to scrub. + * @high_priority: true if the slab should be put on the high-priority queue. + */ void vdo_register_slab_for_scrubbing(struct slab_scrubber *scrubber, struct vdo_slab *slab, bool high_priority) @@ -178,27 +192,28 @@ void vdo_register_slab_for_scrubbing(struct slab_scrubber *scrubber, } /** - * Stop scrubbing, either because there are no more slabs to scrub or because - * there's been an error. - * - * @param scrubber The scrubber - **/ + * finish_scrubbing() - Stop scrubbing, either because there are no more slabs + * to scrub or because there's been an error. + * @scrubber: The scrubber. + */ static void finish_scrubbing(struct slab_scrubber *scrubber) { bool notify; - if (!vdo_has_slabs_to_scrub(scrubber)) { - free_extent_and_buffer(scrubber); + if (!has_slabs_to_scrub(scrubber)) { + free_vio_and_buffer(scrubber); } - // Inform whoever is waiting that scrubbing has completed. - complete_vdo_completion(&scrubber->completion); + /* Inform whoever is waiting that scrubbing has completed. */ + vdo_complete_completion(&scrubber->completion); notify = has_waiters(&scrubber->waiters); - // Note that the scrubber has stopped, and inform anyone who might be - // waiting for that to happen. - if (!finish_vdo_draining(&scrubber->admin_state)) { + /* + * Note that the scrubber has stopped, and inform anyone who might be + * waiting for that to happen. + */ + if (!vdo_finish_draining(&scrubber->admin_state)) { WRITE_ONCE(scrubber->admin_state.current_state, VDO_ADMIN_STATE_SUSPENDED); } @@ -213,56 +228,55 @@ static void finish_scrubbing(struct slab_scrubber *scrubber) } } -/**********************************************************************/ static void scrub_next_slab(struct slab_scrubber *scrubber); /** - * Notify the scrubber that a slab has been scrubbed. This callback is - * registered in apply_journal_entries(). + * slab_scrubbed() - Notify the scrubber that a slab has been scrubbed. + * @completion: The slab rebuild completion. * - * @param completion The slab rebuild completion - **/ + * This callback is registered in apply_journal_entries(). + */ static void slab_scrubbed(struct vdo_completion *completion) { struct slab_scrubber *scrubber = completion->parent; - finish_scrubbing_vdo_slab(scrubber->slab); + + vdo_finish_scrubbing_slab(scrubber->slab); WRITE_ONCE(scrubber->slab_count, scrubber->slab_count - 1); scrub_next_slab(scrubber); } /** - * Abort scrubbing due to an error. - * - * @param scrubber The slab scrubber - * @param result The error - **/ + * abort_scrubbing() - Abort scrubbing due to an error. + * @scrubber: The slab scrubber. + * @result: The error. + */ static void abort_scrubbing(struct slab_scrubber *scrubber, int result) { vdo_enter_read_only_mode(scrubber->read_only_notifier, result); - set_vdo_completion_result(&scrubber->completion, result); + vdo_set_completion_result(&scrubber->completion, result); scrub_next_slab(scrubber); } /** - * Handle errors while rebuilding a slab. - * - * @param completion The slab rebuild completion - **/ + * handle_scrubber_error() - Handle errors while rebuilding a slab. + * @completion: The slab rebuild completion. 
+ */ static void handle_scrubber_error(struct vdo_completion *completion) { + record_metadata_io_error(as_vio(completion)); abort_scrubbing(completion->parent, completion->result); } /** - * Apply all the entries in a block to the reference counts. - * - * @param block A block with entries to apply - * @param entry_count The number of entries to apply - * @param block_number The sequence number of the block - * @param slab The slab to apply the entries to + * apply_block_entries() - Apply all the entries in a block to the reference + * counts. + * @block: A block with entries to apply. + * @entry_count: The number of entries to apply. + * @block_number: The sequence number of the block. + * @slab: The slab to apply the entries to. * - * @return VDO_SUCCESS or an error code - **/ + * Return: VDO_SUCCESS or an error code. + */ static int apply_block_entries(struct packed_slab_journal_block *block, journal_entry_count_t entry_count, sequence_number_t block_number, @@ -275,12 +289,13 @@ static int apply_block_entries(struct packed_slab_journal_block *block, int result; slab_block_number max_sbn = slab->end - slab->start; + while (entry_point.entry_count < entry_count) { struct slab_journal_entry entry = - decode_vdo_slab_journal_entry(block, + vdo_decode_slab_journal_entry(block, entry_point.entry_count); if (entry.sbn > max_sbn) { - // This entry is out of bounds. + /* This entry is out of bounds. */ return uds_log_error_strerror(VDO_CORRUPT_JOURNAL, "vdo_slab journal entry (%llu, %u) had invalid offset %u in slab (size %u blocks)", (unsigned long long) block_number, @@ -296,7 +311,7 @@ static int apply_block_entries(struct packed_slab_journal_block *block, "vdo_slab journal entry (%llu, %u) (%s of offset %u) could not be applied in slab %u", (unsigned long long) block_number, entry_point.entry_count, - get_vdo_journal_operation_name(entry.operation), + vdo_get_journal_operation_name(entry.operation), entry.sbn, slab->slab_number); return result; @@ -308,11 +323,12 @@ static int apply_block_entries(struct packed_slab_journal_block *block, } /** - * Find the relevant extent of the slab journal and apply all valid entries. - * This is a callback registered in start_scrubbing(). + * apply_journal_entries() - Find the relevant vio of the slab journal and + * apply all valid entries. + * @completion: The metadata read vio completion. * - * @param completion The metadata read extent completion - **/ + * This is a callback registered in start_scrubbing(). + */ static void apply_journal_entries(struct vdo_completion *completion) { int result; @@ -321,30 +337,32 @@ static void apply_journal_entries(struct vdo_completion *completion) struct slab_journal *journal = slab->journal; struct ref_counts *reference_counts = slab->reference_counts; - // Find the boundaries of the useful part of the journal. + /* Find the boundaries of the useful part of the journal. 
*/ sequence_number_t tail = journal->tail; tail_block_offset_t end_index = - get_vdo_slab_journal_block_offset(journal, tail - 1); + vdo_get_slab_journal_block_offset(journal, tail - 1); char *end_data = scrubber->journal_data + (end_index * VDO_BLOCK_SIZE); struct packed_slab_journal_block *end_block = (struct packed_slab_journal_block *) end_data; sequence_number_t head = __le64_to_cpu(end_block->header.head); tail_block_offset_t head_index = - get_vdo_slab_journal_block_offset(journal, head); + vdo_get_slab_journal_block_offset(journal, head); block_count_t index = head_index; struct journal_point ref_counts_point = reference_counts->slab_journal_point; struct journal_point last_entry_applied = ref_counts_point; sequence_number_t sequence; + for (sequence = head; sequence < tail; sequence++) { char *block_data = scrubber->journal_data + (index * VDO_BLOCK_SIZE); struct packed_slab_journal_block *block = (struct packed_slab_journal_block *) block_data; struct slab_journal_block_header header; - unpack_vdo_slab_journal_block_header(&block->header, &header); + + vdo_unpack_slab_journal_block_header(&block->header, &header); if ((header.nonce != slab->allocator->nonce) || (header.metadata_type != VDO_METADATA_SLAB_JOURNAL) || @@ -352,7 +370,7 @@ static void apply_journal_entries(struct vdo_completion *completion) (header.entry_count > journal->entries_per_block) || (header.has_block_map_increments && (header.entry_count > journal->full_entries_per_block))) { - // The block is not what we expect it to be. + /* The block is not what we expect it to be. */ uds_log_error("vdo_slab journal block for slab %u was invalid", slab->slab_number); abort_scrubbing(scrubber, VDO_CORRUPT_JOURNAL); @@ -374,9 +392,11 @@ static void apply_journal_entries(struct vdo_completion *completion) } } - // At the end of rebuild, the ref_counts should be accurate to the end - // of the journal we just applied. - result = ASSERT(!before_vdo_journal_point(&last_entry_applied, + /* + * At the end of rebuild, the ref_counts should be accurate to the end + * of the journal we just applied. + */ + result = ASSERT(!vdo_before_journal_point(&last_entry_applied, &ref_counts_point), "Refcounts are not more accurate than the slab journal"); if (result != VDO_SUCCESS) { @@ -384,55 +404,67 @@ static void apply_journal_entries(struct vdo_completion *completion) return; } - // Save out the rebuilt reference blocks. - prepare_vdo_completion(completion, + /* Save out the rebuilt reference blocks. */ + vdo_prepare_completion(completion, slab_scrubbed, handle_scrubber_error, completion->callback_thread_id, scrubber); - start_vdo_slab_action(slab, VDO_ADMIN_STATE_SAVE_FOR_SCRUBBING, + vdo_start_slab_action(slab, VDO_ADMIN_STATE_SAVE_FOR_SCRUBBING, completion); } +static void read_slab_journal_endio(struct bio *bio) +{ + struct vio *vio = bio->bi_private; + struct slab_scrubber *scrubber = vio->completion.parent; + + continue_vio_after_io(bio->bi_private, + apply_journal_entries, + scrubber->completion.callback_thread_id); +} + /** - * Read the current slab's journal from disk now that it has been flushed. - * This callback is registered in scrub_next_slab(). + * start_scrubbing() - Read the current slab's journal from disk now that it + * has been flushed. + * @completion: The scrubber's vio completion. * - * @param completion The scrubber's extent completion - **/ + * This callback is registered in scrub_next_slab(). 
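+ * + * If the slab summary already records this slab as clean, there is nothing to + * replay; the journal read is skipped and the slab is reported as scrubbed + * immediately.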
+ */ static void start_scrubbing(struct vdo_completion *completion) { struct slab_scrubber *scrubber = completion->parent; struct vdo_slab *slab = scrubber->slab; + if (vdo_get_summarized_cleanliness(slab->allocator->summary, slab->slab_number)) { slab_scrubbed(completion); return; } - prepare_vdo_completion(&scrubber->extent->completion, - apply_journal_entries, - handle_scrubber_error, - completion->callback_thread_id, - completion->parent); - read_vdo_metadata_extent(scrubber->extent, slab->journal_origin); + submit_metadata_vio(scrubber->vio, + slab->journal_origin, + read_slab_journal_endio, + handle_scrubber_error, + REQ_OP_READ); } /** - * Scrub the next slab if there is one. - * - * @param scrubber The scrubber - **/ + * scrub_next_slab() - Scrub the next slab if there is one. + * @scrubber: The scrubber. + */ static void scrub_next_slab(struct slab_scrubber *scrubber) { struct vdo_completion *completion; struct vdo_slab *slab; - // Note: this notify call is always safe only because scrubbing can - // only be started when the VDO is quiescent. + /* + * Note: this notify call is always safe only because scrubbing can + * only be started when the VDO is quiescent. + */ notify_all_waiters(&scrubber->waiters, NULL, NULL); if (vdo_is_read_only(scrubber->read_only_notifier)) { - set_vdo_completion_result(&scrubber->completion, VDO_READ_ONLY); + vdo_set_completion_result(&scrubber->completion, VDO_READ_ONLY); finish_scrubbing(scrubber); return; } @@ -445,35 +477,43 @@ static void scrub_next_slab(struct slab_scrubber *scrubber) return; } - if (finish_vdo_draining(&scrubber->admin_state)) { + if (vdo_finish_draining(&scrubber->admin_state)) { return; } list_del_init(&slab->allocq_entry); scrubber->slab = slab; - completion = vdo_extent_as_completion(scrubber->extent); - prepare_vdo_completion(completion, + completion = vio_as_completion(scrubber->vio); + vdo_prepare_completion(completion, start_scrubbing, handle_scrubber_error, scrubber->completion.callback_thread_id, scrubber); - start_vdo_slab_action(slab, VDO_ADMIN_STATE_SCRUBBING, completion); + vdo_start_slab_action(slab, VDO_ADMIN_STATE_SCRUBBING, completion); } -/**********************************************************************/ -void scrub_vdo_slabs(struct slab_scrubber *scrubber, +/** + * vdo_scrub_slabs() - Scrub all the slabs which have been registered with a + * slab scrubber. + * @scrubber: The scrubber. + * @parent: The object to notify when scrubbing is complete. + * @callback: The function to run when scrubbing is complete. + * @error_handler: The handler for scrubbing errors. + */ +void vdo_scrub_slabs(struct slab_scrubber *scrubber, void *parent, vdo_action *callback, vdo_action *error_handler) { thread_id_t thread_id = vdo_get_callback_thread_id(); - resume_vdo_if_quiescent(&scrubber->admin_state); - prepare_vdo_completion(&scrubber->completion, + + vdo_resume_if_quiescent(&scrubber->admin_state); + vdo_prepare_completion(&scrubber->completion, callback, error_handler, thread_id, parent); - if (!vdo_has_slabs_to_scrub(scrubber)) { + if (!has_slabs_to_scrub(scrubber)) { finish_scrubbing(scrubber); return; } @@ -481,8 +521,19 @@ void scrub_vdo_slabs(struct slab_scrubber *scrubber, scrub_next_slab(scrubber); } -/**********************************************************************/ -void scrub_high_priority_vdo_slabs(struct slab_scrubber *scrubber, +/** + * vdo_scrub_high_priority_slabs() - Scrub any slabs which have been + * registered at high priority with a slab + * scrubber. + * @scrubber: The scrubber. 
+ * @scrub_at_least_one: true if one slab should always be scrubbed, even if + * there are no high-priority slabs (and there is at + * least one low priority slab). + * @parent: The completion to notify when scrubbing is complete. + * @callback: The function to run when scrubbing is complete. + * @error_handler: The handler for scrubbing errors. + */ +void vdo_scrub_high_priority_slabs(struct slab_scrubber *scrubber, bool scrub_at_least_one, struct vdo_completion *parent, vdo_action *callback, @@ -490,70 +541,92 @@ void scrub_high_priority_vdo_slabs(struct slab_scrubber *scrubber, { if (scrub_at_least_one && list_empty(&scrubber->high_priority_slabs)) { struct vdo_slab *slab = get_next_slab(scrubber); + if (slab != NULL) { vdo_register_slab_for_scrubbing(scrubber, slab, true); } } scrubber->high_priority_only = true; - scrub_vdo_slabs(scrubber, parent, callback, error_handler); + vdo_scrub_slabs(scrubber, parent, callback, error_handler); } -/**********************************************************************/ -void stop_vdo_slab_scrubbing(struct slab_scrubber *scrubber, +/** + * vdo_stop_slab_scrubbing() - Tell the scrubber to stop scrubbing after it + * finishes the slab it is currently working on. + * @scrubber: The scrubber to stop. + * @parent: The completion to notify when scrubbing has stopped. + */ +void vdo_stop_slab_scrubbing(struct slab_scrubber *scrubber, struct vdo_completion *parent) { - if (is_vdo_state_quiescent(&scrubber->admin_state)) { - complete_vdo_completion(parent); + if (vdo_is_state_quiescent(&scrubber->admin_state)) { + vdo_complete_completion(parent); } else { - start_vdo_draining(&scrubber->admin_state, + vdo_start_draining(&scrubber->admin_state, VDO_ADMIN_STATE_SUSPENDING, parent, NULL); } } -/**********************************************************************/ -void resume_vdo_slab_scrubbing(struct slab_scrubber *scrubber, +/** + * vdo_resume_slab_scrubbing() - Tell the scrubber to resume scrubbing if it + * has been stopped. + * @scrubber: The scrubber to resume. + * @parent: The object to notify once scrubbing has resumed. + */ +void vdo_resume_slab_scrubbing(struct slab_scrubber *scrubber, struct vdo_completion *parent) { int result; - if (!vdo_has_slabs_to_scrub(scrubber)) { - complete_vdo_completion(parent); + if (!has_slabs_to_scrub(scrubber)) { + vdo_complete_completion(parent); return; } - result = resume_vdo_if_quiescent(&scrubber->admin_state); + result = vdo_resume_if_quiescent(&scrubber->admin_state); if (result != VDO_SUCCESS) { - finish_vdo_completion(parent, result); + vdo_finish_completion(parent, result); return; } scrub_next_slab(scrubber); - complete_vdo_completion(parent); + vdo_complete_completion(parent); } -/**********************************************************************/ -int enqueue_clean_vdo_slab_waiter(struct slab_scrubber *scrubber, +/** + * vdo_enqueue_clean_slab_waiter() - Wait for a clean slab. + * @scrubber: The scrubber on which to wait. + * @waiter: The waiter. + * + * Return: VDO_SUCCESS if the waiter was queued, VDO_NO_SPACE if there are no + * slabs to scrub, and some other error otherwise. 
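+ * (VDO_READ_ONLY is returned when the VDO has entered read-only mode.)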
+ */ +int vdo_enqueue_clean_slab_waiter(struct slab_scrubber *scrubber, struct waiter *waiter) { if (vdo_is_read_only(scrubber->read_only_notifier)) { return VDO_READ_ONLY; } - if (is_vdo_state_quiescent(&scrubber->admin_state)) { + if (vdo_is_state_quiescent(&scrubber->admin_state)) { return VDO_NO_SPACE; } return enqueue_waiter(&scrubber->waiters, waiter); } -/**********************************************************************/ -void dump_vdo_slab_scrubber(const struct slab_scrubber *scrubber) +/** + * vdo_dump_slab_scrubber() - Dump information about a slab scrubber to the + * log for debugging. + * @scrubber: The scrubber to dump. + */ +void vdo_dump_slab_scrubber(const struct slab_scrubber *scrubber) { uds_log_info("slab_scrubber slab_count %u waiters %zu %s%s", - get_scrubber_vdo_slab_count(scrubber), + vdo_get_scrubber_slab_count(scrubber), count_waiters(&scrubber->waiters), - get_vdo_admin_state_name(&scrubber->admin_state), + vdo_get_admin_state_code(&scrubber->admin_state)->name, scrubber->high_priority_only ? ", high_priority_only " : ""); } diff --git a/vdo/slab-scrubber.h b/vdo/slab-scrubber.h new file mode 100644 index 00000000..8a8c1fc1 --- /dev/null +++ b/vdo/slab-scrubber.h @@ -0,0 +1,83 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright Red Hat + */ + +#ifndef SLAB_SCRUBBER_H +#define SLAB_SCRUBBER_H + +#include + +#include "admin-state.h" +#include "completion.h" +#include "types.h" +#include "wait-queue.h" + +struct slab_scrubber { + struct vdo_completion completion; + /* The queue of slabs to scrub first */ + struct list_head high_priority_slabs; + /* The queue of slabs to scrub once there are no high_priority_slabs */ + struct list_head slabs; + /* The queue of VIOs waiting for a slab to be scrubbed */ + struct wait_queue waiters; + + /* + * The number of slabs that are unrecovered or being scrubbed. This + * field is modified by the physical zone thread, but is queried by + * other threads. 
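+ * Updates are therefore made with WRITE_ONCE() and cross-thread reads use + * READ_ONCE().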
+ */ + slab_count_t slab_count; + + /* The administrative state of the scrubber */ + struct admin_state admin_state; + /* Whether to only scrub high-priority slabs */ + bool high_priority_only; + /* The context for entering read-only mode */ + struct read_only_notifier *read_only_notifier; + /* The slab currently being scrubbed */ + struct vdo_slab *slab; + /* The vio for loading slab journal blocks */ + struct vio *vio; + /* A buffer to store the slab journal blocks */ + char *journal_data; +}; + +int __must_check +vdo_make_slab_scrubber(struct vdo *vdo, + block_count_t slab_journal_size, + struct read_only_notifier *read_only_notifier, + struct slab_scrubber **scrubber_ptr); + +void vdo_free_slab_scrubber(struct slab_scrubber *scrubber); + +void vdo_register_slab_for_scrubbing(struct slab_scrubber *scrubber, + struct vdo_slab *slab, + bool high_priority); + +void vdo_scrub_slabs(struct slab_scrubber *scrubber, + void *parent, + vdo_action *callback, + vdo_action *error_handler); + +void vdo_scrub_high_priority_slabs(struct slab_scrubber *scrubber, + bool scrub_at_least_one, + struct vdo_completion *parent, + vdo_action *callback, + vdo_action *error_handler); + +void vdo_stop_slab_scrubbing(struct slab_scrubber *scrubber, + struct vdo_completion *parent); + +void vdo_resume_slab_scrubbing(struct slab_scrubber *scrubber, + struct vdo_completion *parent); + +int vdo_enqueue_clean_slab_waiter(struct slab_scrubber *scrubber, + struct waiter *waiter); + +slab_count_t __must_check +vdo_get_scrubber_slab_count(const struct slab_scrubber *scrubber); + +void vdo_dump_slab_scrubber(const struct slab_scrubber *scrubber); + +#endif /* SLAB_SCRUBBER_H */ diff --git a/vdo/slab-summary-format.h b/vdo/slab-summary-format.h new file mode 100644 index 00000000..07fc1997 --- /dev/null +++ b/vdo/slab-summary-format.h @@ -0,0 +1,88 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright Red Hat + */ + +#ifndef SLAB_SUMMARY_FORMAT_H +#define SLAB_SUMMARY_FORMAT_H + +#include "constants.h" +#include "types.h" + +/** + * typedef tail_block_offset_t - The offset of a slab journal tail block. + */ +typedef uint8_t tail_block_offset_t; + +enum { + VDO_SLAB_SUMMARY_FULLNESS_HINT_BITS = 6, +}; + +struct slab_summary_entry { + /* Bits 7..0: The offset of the tail block within the slab journal */ + tail_block_offset_t tail_block_offset; + +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ + /* Bits 13..8: A hint about the fullness of the slab */ + unsigned int fullness_hint : 6; + /* Bit 14: Whether the ref_counts must be loaded from the layer */ + unsigned int load_ref_counts : 1; + /* Bit 15: The believed cleanliness of this slab */ + unsigned int is_dirty : 1; +#else + /* Bit 15: The believed cleanliness of this slab */ + unsigned int is_dirty : 1; + /* Bit 14: Whether the ref_counts must be loaded from the layer */ + unsigned int load_ref_counts : 1; + /* Bits 13..8: A hint about the fullness of the slab */ + unsigned int fullness_hint : 6; +#endif +} __packed; + +/* XXX: These methods shouldn't take a block_size parameter. */ + +/** + * vdo_get_slab_summary_zone_size() - Returns the size on disk of a single + * zone of the slab_summary. + * @block_size: The block size of the physical layer. + * + * Return: the number of blocks required to store a single zone of the + * slab_summary on disk. 
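+ * + * For example (illustrative arithmetic, assuming 4096-byte blocks, 2-byte + * entries, and MAX_VDO_SLABS of 8192): 4096 / 2 = 2048 entries per block, so + * 8192 / 2048 = 4 blocks per zone.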
+ */ +static inline block_count_t __must_check +vdo_get_slab_summary_zone_size(block_size_t block_size) +{ + slab_count_t entries_per_block = + block_size / sizeof(struct slab_summary_entry); + block_count_t blocks_needed = MAX_VDO_SLABS / entries_per_block; + return blocks_needed; +} + +/** + * vdo_get_slab_summary_size() - Return the size on disk of the slab_summary + * structure. + * @block_size: The block size of the physical layer. + * + * Return: The blocks required to store the slab_summary on disk. + */ +static inline block_count_t __must_check +vdo_get_slab_summary_size(block_size_t block_size) +{ + return vdo_get_slab_summary_zone_size(block_size) * MAX_VDO_PHYSICAL_ZONES; +} + +/** + * vdo_get_slab_summary_hint_shift() - Compute the shift for slab summary + * hints. + * @slab_size_shift: Exponent for the number of blocks per slab. + * + * Return: The hint shift. + */ +static inline uint8_t __must_check +vdo_get_slab_summary_hint_shift(unsigned int slab_size_shift) +{ + return ((slab_size_shift > VDO_SLAB_SUMMARY_FULLNESS_HINT_BITS) ? + (slab_size_shift - VDO_SLAB_SUMMARY_FULLNESS_HINT_BITS) : 0); +} + +#endif /* SLAB_SUMMARY_FORMAT_H */ diff --git a/vdo/slab-summary.c b/vdo/slab-summary.c new file mode 100644 index 00000000..23f1147c --- /dev/null +++ b/vdo/slab-summary.c @@ -0,0 +1,833 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright Red Hat + */ + +#include "slab-summary.h" + +#include + +#include "memory-alloc.h" +#include "permassert.h" + +#include "admin-state.h" +#include "completion.h" +#include "constants.h" +#include "io-submitter.h" +#include "read-only-notifier.h" +#include "slab-summary-format.h" +#include "thread-config.h" +#include "types.h" +#include "vio.h" + +/* FULLNESS HINT COMPUTATION */ + +/** + * compute_fullness_hint() - Translate a slab's free block count into a + * 'fullness hint' that can be stored in a + * slab_summary_entry's 7 bits that are dedicated to + * its free count. + * @summary: The summary which is being updated. + * @free_blocks: The number of free blocks. + * + * Note: the number of free blocks must be strictly less than 2^23 blocks, + * even though theoretically slabs could contain precisely 2^23 blocks; there + * is an assumption that at least one block is used by metadata. This + * assumption is necessary; otherwise, the fullness hint might overflow. The + * fullness hint formula is roughly (fullness >> 16) & 0x7f, but ((1 << 23) >> 16) & + * 0x7f is the same as (0 >> 16) & 0x7f, namely 0, which is clearly a bad hint + * if it could indicate both 2^23 free blocks or 0 free blocks. + * + * Return: A fullness hint, which can be stored in 7 bits. + */ +static uint8_t __must_check +compute_fullness_hint(struct slab_summary *summary, block_count_t free_blocks) +{ + block_count_t hint; + + ASSERT_LOG_ONLY((free_blocks < (1 << 23)), + "free blocks must be less than 2^23"); + + if (free_blocks == 0) { + return 0; + } + + hint = free_blocks >> summary->hint_shift; + return ((hint == 0) ? 1 : hint); +} + +/** + * get_approximate_free_blocks() - Translate a slab's free block hint into an + * approximate count. + * @summary: The summary from which the hint was obtained. + * @free_block_hint: The hint read from the summary. + * + * compute_fullness_hint() is the inverse function of + * get_approximate_free_blocks() (i.e. + * compute_fullness_hint(get_approximate_free_blocks(x)) == x). + * + * Return: An approximation to the free block count.
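+ * + * For example (illustrative, with hint_shift = 17 as for a 2^23-block slab): + * a hint of 3 expands to 3 << 17 = 393216 free blocks, and + * compute_fullness_hint() maps any count in [393216, 524287] back to 3.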
+ */ +static block_count_t __must_check +get_approximate_free_blocks(struct slab_summary *summary, + uint8_t free_block_hint) +{ + return ((block_count_t) free_block_hint) << summary->hint_shift; +} + +/* MAKE/FREE FUNCTIONS */ + +static void launch_write(struct slab_summary_block *summary_block); + +/** + * initialize_slab_summary_block() - Initialize a slab_summary_block. + * @vdo: The vdo. + * @summary_zone: The parent slab_summary_zone. + * @entries: The entries this block manages. + * @index: The index of this block in its zone's summary. + * @slab_summary_block: The block to intialize. + * + * Return: VDO_SUCCESS or an error. + */ +static int +initialize_slab_summary_block(struct vdo *vdo, + struct slab_summary_zone *summary_zone, + struct slab_summary_entry *entries, + block_count_t index, + struct slab_summary_block *slab_summary_block) +{ + int result = UDS_ALLOCATE(VDO_BLOCK_SIZE, char, __func__, + &slab_summary_block->outgoing_entries); + if (result != VDO_SUCCESS) { + return result; + } + + result = create_metadata_vio(vdo, + VIO_TYPE_SLAB_SUMMARY, + VIO_PRIORITY_METADATA, + slab_summary_block, + slab_summary_block->outgoing_entries, + &slab_summary_block->vio); + if (result != VDO_SUCCESS) { + return result; + } + + slab_summary_block->zone = summary_zone; + slab_summary_block->entries = entries; + slab_summary_block->index = index; + return VDO_SUCCESS; +} + +/** + * make_slab_summary_zone() - Create a new, empty slab_summary_zone object. + * @summary: The summary to which the new zone will belong. + * @vdo: The vdo. + * @zone_number: The zone this is. + * @thread_id: The ID of the thread for this zone. + * @entries: The buffer to hold the entries in this zone. + * + * Return: VDO_SUCCESS or an error. + */ +static int make_slab_summary_zone(struct slab_summary *summary, + struct vdo *vdo, + zone_count_t zone_number, + thread_id_t thread_id, + struct slab_summary_entry *entries) +{ + struct slab_summary_zone *summary_zone; + block_count_t i; + int result = UDS_ALLOCATE_EXTENDED(struct slab_summary_zone, + summary->blocks_per_zone, + struct slab_summary_block, __func__, + &summary->zones[zone_number]); + if (result != VDO_SUCCESS) { + return result; + } + + summary_zone = summary->zones[zone_number]; + summary_zone->summary = summary; + summary_zone->zone_number = zone_number; + summary_zone->entries = entries; + summary_zone->thread_id = thread_id; + vdo_set_admin_state_code(&summary_zone->state, + VDO_ADMIN_STATE_NORMAL_OPERATION); + + /* Initialize each block. */ + for (i = 0; i < summary->blocks_per_zone; i++) { + result = initialize_slab_summary_block(vdo, + summary_zone, + entries, + i, + &summary_zone->summary_blocks[i]); + if (result != VDO_SUCCESS) { + return result; + } + entries += summary->entries_per_block; + } + + return VDO_SUCCESS; +} + +/** + * vdo_make_slab_summary() - Create a slab summary. + * @vdo: The vdo. + * @partition: The partition to hold the summary. + * @thread_config: The thread config of the VDO. + * @slab_size_shift: The number of bits in the slab size. + * @maximum_free_blocks_per_slab: The maximum number of free blocks a + * slab can have. + * @read_only_notifier: The context for entering read-only mode. + * @slab_summary_ptr: A pointer to hold the summary. + * + * Return: VDO_SUCCESS or an error. 
+ */ +int vdo_make_slab_summary(struct vdo *vdo, + struct partition *partition, + const struct thread_config *thread_config, + unsigned int slab_size_shift, + block_count_t maximum_free_blocks_per_slab, + struct read_only_notifier *read_only_notifier, + struct slab_summary **slab_summary_ptr) +{ + struct slab_summary *summary; + size_t total_entries, i; + uint8_t hint; + zone_count_t zone; + block_count_t blocks_per_zone = + vdo_get_slab_summary_zone_size(VDO_BLOCK_SIZE); + slab_count_t entries_per_block = MAX_VDO_SLABS / blocks_per_zone; + int result = ASSERT((entries_per_block * blocks_per_zone) == MAX_VDO_SLABS, + "block size must be a multiple of entry size"); + if (result != VDO_SUCCESS) { + return result; + } + + if (partition == NULL) { + /* + * Don't make a slab summary for the formatter since it doesn't + * need it. + */ + return VDO_SUCCESS; + } + + result = UDS_ALLOCATE_EXTENDED(struct slab_summary, + thread_config->physical_zone_count, + struct slab_summary_zone *, + __func__, + &summary); + if (result != VDO_SUCCESS) { + return result; + } + + summary->zone_count = thread_config->physical_zone_count; + summary->read_only_notifier = read_only_notifier; + summary->hint_shift = vdo_get_slab_summary_hint_shift(slab_size_shift); + summary->blocks_per_zone = blocks_per_zone; + summary->entries_per_block = entries_per_block; + + total_entries = MAX_VDO_SLABS * MAX_VDO_PHYSICAL_ZONES; + result = UDS_ALLOCATE(total_entries, struct slab_summary_entry, + "summary entries", &summary->entries); + if (result != VDO_SUCCESS) { + vdo_free_slab_summary(summary); + return result; + } + + /* Initialize all the entries. */ + hint = compute_fullness_hint(summary, maximum_free_blocks_per_slab); + for (i = 0; i < total_entries; i++) { + /* + * This default tail block offset must be reflected in + * slabJournal.c::read_slab_journal_tail(). + */ + summary->entries[i] = (struct slab_summary_entry) { + .tail_block_offset = 0, + .fullness_hint = hint, + .load_ref_counts = false, + .is_dirty = false, + }; + } + + vdo_set_slab_summary_origin(summary, partition); + for (zone = 0; zone < summary->zone_count; zone++) { + result = + make_slab_summary_zone(summary, vdo, zone, + vdo_get_physical_zone_thread(thread_config, + zone), + summary->entries + + (MAX_VDO_SLABS * zone)); + if (result != VDO_SUCCESS) { + vdo_free_slab_summary(summary); + return result; + } + } + + *slab_summary_ptr = summary; + return VDO_SUCCESS; +} + +/** + * free_summary_zone() - Free a slab summary zone. + * @zone: The zone to free. + */ +static void free_summary_zone(struct slab_summary_zone *zone) +{ + block_count_t i; + + if (zone == NULL) { + return; + } + + for (i = 0; i < zone->summary->blocks_per_zone; i++) { + free_vio(UDS_FORGET(zone->summary_blocks[i].vio)); + UDS_FREE(UDS_FORGET(zone->summary_blocks[i].outgoing_entries)); + } + + UDS_FREE(zone); +} + +/** + * vdo_free_slab_summary() - Destroy a slab summary. + * @summary: The slab summary to free. + */ +void vdo_free_slab_summary(struct slab_summary *summary) +{ + zone_count_t zone; + + if (summary == NULL) { + return; + } + + for (zone = 0; zone < summary->zone_count; zone++) { + free_summary_zone(UDS_FORGET(summary->zones[zone])); + } + + UDS_FREE(UDS_FORGET(summary->entries)); + UDS_FREE(summary); +} + +/** + * vdo_get_slab_summary_for_zone() - Get the portion of the slab + * summary for a specified zone. + * @summary: The slab summary. + * @zone: The zone. + * + * Return: The portion of the slab summary for the specified zone. 
+ */ +struct slab_summary_zone * +vdo_get_slab_summary_for_zone(struct slab_summary *summary, zone_count_t zone) +{ + return summary->zones[zone]; +} + +/* WRITING FUNCTIONALITY */ + +/** + * check_for_drain_complete() - Check whether a summary zone has finished + * draining. + * @summary_zone: The zone to check. + */ +static void +check_for_drain_complete(struct slab_summary_zone *summary_zone) +{ + if (!vdo_is_state_draining(&summary_zone->state) + || (summary_zone->write_count > 0)) { + return; + } + + vdo_finish_operation(&summary_zone->state, + (vdo_is_read_only(summary_zone->summary->read_only_notifier) + ? VDO_READ_ONLY : VDO_SUCCESS)); +} + +/** + * notify_waiters() - Wake all the waiters in a given queue. + * @summary_zone: The slab summary which owns the queue. + * @queue: The queue to notify. + * + * If the VDO is in read-only mode the waiters will be given a VDO_READ_ONLY + * error code as their context, otherwise they will be given VDO_SUCCESS. + */ +static void notify_waiters(struct slab_summary_zone *summary_zone, + struct wait_queue *queue) +{ + int result = (vdo_is_read_only(summary_zone->summary->read_only_notifier) + ? VDO_READ_ONLY + : VDO_SUCCESS); + notify_all_waiters(queue, NULL, &result); +} + +/** + * finish_updating_slab_summary_block() - Finish processing a block which + * attempted to write, whether or not + * the attempt succeeded. + * @block: The block. + */ +static void +finish_updating_slab_summary_block(struct slab_summary_block *block) +{ + notify_waiters(block->zone, &block->current_update_waiters); + block->writing = false; + block->zone->write_count--; + if (has_waiters(&block->next_update_waiters)) { + launch_write(block); + } else { + check_for_drain_complete(block->zone); + } +} + +/** + * finish_update() - This is the callback for a successful block write. + * @completion: The write VIO. + */ +static void finish_update(struct vdo_completion *completion) +{ + struct slab_summary_block *block = completion->parent; + + atomic64_inc(&block->zone->summary->statistics.blocks_written); + finish_updating_slab_summary_block(block); +} + +/** + * handle_write_error() - Handle an error writing a slab summary block. + * @completion: The write VIO. + */ +static void handle_write_error(struct vdo_completion *completion) +{ + struct slab_summary_block *block = completion->parent; + + record_metadata_io_error(as_vio(completion)); + vdo_enter_read_only_mode(block->zone->summary->read_only_notifier, + completion->result); + finish_updating_slab_summary_block(block); +} + +static void write_slab_summary_endio(struct bio *bio) +{ + struct vio *vio = bio->bi_private; + struct slab_summary_block *block = vio->completion.parent; + + continue_vio_after_io(vio, finish_update, block->zone->thread_id); +} + +/** + * launch_write() - Write a slab summary block unless it is currently out for + * writing. + * @block: The block that needs to be committed. 
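+ * + * If a write is already in flight, the update stays queued on + * next_update_waiters; finish_updating_slab_summary_block() launches another + * write for the batched updates once the current one completes.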
+ */ +static void launch_write(struct slab_summary_block *block) +{ + struct slab_summary_zone *zone = block->zone; + struct slab_summary *summary = zone->summary; + physical_block_number_t pbn; + + if (block->writing) { + return; + } + + zone->write_count++; + transfer_all_waiters(&block->next_update_waiters, + &block->current_update_waiters); + block->writing = true; + + if (vdo_is_read_only(summary->read_only_notifier)) { + finish_updating_slab_summary_block(block); + return; + } + + memcpy(block->outgoing_entries, block->entries, + sizeof(struct slab_summary_entry) * summary->entries_per_block); + + /* + * Flush before writing to ensure that the slab journal tail blocks and + * reference updates covered by this summary update are stable + * (VDO-2332). + */ + pbn = summary->origin + + (summary->blocks_per_zone * zone->zone_number) + block->index; + submit_metadata_vio(block->vio, + pbn, + write_slab_summary_endio, + handle_write_error, + REQ_OP_WRITE | REQ_PREFLUSH); +} + +/** + * initiate_drain() - Initiate a drain. + * + * Implements vdo_admin_initiator. + */ +static void initiate_drain(struct admin_state *state) +{ + check_for_drain_complete(container_of(state, + struct slab_summary_zone, + state)); +} + +/** + * vdo_drain_slab_summary_zone() - Drain a zone of the slab summary. + * @summary_zone: The zone to drain. + * @operation: The type of drain to perform. + * @parent: The object to notify when the suspend is complete. + */ +void vdo_drain_slab_summary_zone(struct slab_summary_zone *summary_zone, + const struct admin_state_code *operation, + struct vdo_completion *parent) +{ + vdo_start_draining(&summary_zone->state, operation, parent, + initiate_drain); +} + +/** + * vdo_resume_slab_summary_zone() - Resume a zone of the slab summary. + * @summary_zone: The zone to resume. + * @parent: The object to notify when the zone is resumed. + */ +void vdo_resume_slab_summary_zone(struct slab_summary_zone *summary_zone, + struct vdo_completion *parent) +{ + vdo_finish_completion(parent, + vdo_resume_if_quiescent(&summary_zone->state)); +} + +/* READ/UPDATE FUNCTIONS */ + +/** + * get_summary_block_for_slab() - Get the summary block, and offset into it, + * for storing the summary for a slab. + * @summary_zone: The slab_summary_zone being queried. + * @slab_number: The slab whose summary location is sought. + * + * Return: A pointer to the slab_summary_block containing this + * slab_summary_entry. + */ +static struct slab_summary_block * +get_summary_block_for_slab(struct slab_summary_zone *summary_zone, + slab_count_t slab_number) +{ + slab_count_t entries_per_block = + summary_zone->summary->entries_per_block; + return &summary_zone->summary_blocks[slab_number / entries_per_block]; +} + +/** + * vdo_update_slab_summary_entry() - Update the entry for a slab. + * @summary_zone: The slab_summary_zone for the zone of the slab. + * @waiter: The waiter that is updating the summary. + * @slab_number: The slab number to update. + * @tail_block_offset: The offset of slab journal's tail block. + * @load_ref_counts: Whether the ref_counts must be loaded from the layer on + * the next load. + * @is_clean: Whether the slab is clean. + * @free_blocks: The number of free blocks. 
+ */ +void vdo_update_slab_summary_entry(struct slab_summary_zone *summary_zone, + struct waiter *waiter, slab_count_t slab_number, + tail_block_offset_t tail_block_offset, + bool load_ref_counts, bool is_clean, + block_count_t free_blocks) +{ + struct slab_summary_block *block = + get_summary_block_for_slab(summary_zone, slab_number); + int result; + + if (vdo_is_read_only(summary_zone->summary->read_only_notifier)) { + result = VDO_READ_ONLY; + } else if (vdo_is_state_draining(&summary_zone->state) + || vdo_is_state_quiescent(&summary_zone->state)) { + result = VDO_INVALID_ADMIN_STATE; + } else { + uint8_t hint = compute_fullness_hint(summary_zone->summary, + free_blocks); + struct slab_summary_entry *entry = + &summary_zone->entries[slab_number]; + *entry = (struct slab_summary_entry) { + .tail_block_offset = tail_block_offset, + .load_ref_counts = + (entry->load_ref_counts || load_ref_counts), + .is_dirty = !is_clean, + .fullness_hint = hint, + }; + result = enqueue_waiter(&block->next_update_waiters, waiter); + } + + if (result != VDO_SUCCESS) { + waiter->callback(waiter, &result); + return; + } + + launch_write(block); +} + +/** + * vdo_get_summarized_tail_block_offset() - Get the stored tail block offset + * for a slab. + * @summary_zone: The slab_summary_zone to use. + * @slab_number: The slab number to get the offset for. + * + * Return: The tail block offset for the slab. + */ +tail_block_offset_t +vdo_get_summarized_tail_block_offset(struct slab_summary_zone *summary_zone, + slab_count_t slab_number) +{ + return summary_zone->entries[slab_number].tail_block_offset; +} + +/** + * vdo_must_load_ref_counts() - Whether ref_counts must be loaded from the + * layer. + * @summary_zone: The slab_summary_zone to use. + * @slab_number: The slab number to get information for. + * + * Return: Whether ref_counts must be loaded. + */ +bool vdo_must_load_ref_counts(struct slab_summary_zone *summary_zone, + slab_count_t slab_number) +{ + return summary_zone->entries[slab_number].load_ref_counts; +} + +/** + * vdo_get_summarized_cleanliness() - Get the stored cleanliness information + * for a single slab. + * @summary_zone: The slab_summary_zone to use. + * @slab_number: The slab number to get information for. + * + * Return: Whether the slab is clean. + */ +bool vdo_get_summarized_cleanliness(struct slab_summary_zone *summary_zone, + slab_count_t slab_number) +{ + return !summary_zone->entries[slab_number].is_dirty; +} + +/** + * vdo_get_summarized_free_block_count() - Get the stored emptiness + * information for a single slab. + * @summary_zone: The slab_summary_zone to use. + * @slab_number: The slab number to get information for. + * + * Return: An approximation to the free blocks in the slab. + */ +block_count_t +vdo_get_summarized_free_block_count(struct slab_summary_zone *summary_zone, + slab_count_t slab_number) +{ + struct slab_summary_entry *entry = &summary_zone->entries[slab_number]; + + return get_approximate_free_blocks(summary_zone->summary, + entry->fullness_hint); +} + +/** + * vdo_get_summarized_slab_statuses() - Get the stored slab statuses for all + * slabs in a zone. + * + * @summary_zone: The slab_summary_zone to use. + * @slab_count: The number of slabs to fetch. + * @statuses: An array of slab_status structures to populate (in, out). 
+ */ +void vdo_get_summarized_slab_statuses(struct slab_summary_zone *summary_zone, + slab_count_t slab_count, + struct slab_status *statuses) +{ + slab_count_t i; + + for (i = 0; i < slab_count; i++) { + statuses[i] = (struct slab_status){ + .slab_number = i, + .is_clean = !summary_zone->entries[i].is_dirty, + .emptiness = summary_zone->entries[i].fullness_hint}; + } +} + +/* RESIZE FUNCTIONS */ + +/** + * vdo_set_slab_summary_origin() - Set the origin of the slab summary relative + * to the physical layer. + * @summary: The slab_summary to update. + * @partition: The slab summary partition. + */ +void vdo_set_slab_summary_origin(struct slab_summary *summary, + struct partition *partition) +{ + summary->origin = vdo_get_fixed_layout_partition_offset(partition); +} + +/* COMBINING FUNCTIONS (LOAD) */ + +/** + * finish_combining_zones() - Clean up after saving out the combined slab + * summary. + * @completion: The vio which was used to write the summary data. + */ +static void finish_combining_zones(struct vdo_completion *completion) +{ + struct slab_summary *summary = completion->parent; + int result = completion->result; + + free_vio(as_vio(UDS_FORGET(completion))); + vdo_finish_loading_with_result(&summary->zones[0]->state, result); +} + +static void handle_combining_error(struct vdo_completion *completion) +{ + record_metadata_io_error(as_vio(completion)); + finish_combining_zones(completion); +} + +/** + * combine_zones() - Treating the current entries buffer as the on-disk value + * of all zones, update every zone to the correct values for + * every slab. + * @summary: The summary whose entries should be combined. + */ +static void combine_zones(struct slab_summary *summary) +{ + /* + * Combine all the old summary data into the portion of the buffer + * corresponding to the first zone. + */ + zone_count_t zone = 0; + + if (summary->zones_to_combine > 1) { + slab_count_t entry_number; + + for (entry_number = 0; entry_number < MAX_VDO_SLABS; + entry_number++) { + if (zone != 0) { + memcpy(summary->entries + entry_number, + summary->entries + + (zone * MAX_VDO_SLABS) + + entry_number, + sizeof(struct slab_summary_entry)); + } + zone++; + if (zone == summary->zones_to_combine) { + zone = 0; + } + } + } + + /* Copy the combined data to each zone's region of the buffer. */ + for (zone = 1; zone < MAX_VDO_PHYSICAL_ZONES; zone++) { + memcpy(summary->entries + (zone * MAX_VDO_SLABS), + summary->entries, + MAX_VDO_SLABS * sizeof(struct slab_summary_entry)); + } +} + +static void write_summary_endio(struct bio *bio) +{ + struct vio *vio = bio->bi_private; + + continue_vio_after_io(vio, + finish_combining_zones, + vdo_from_vio(vio)->thread_config->admin_thread); +} + +/** + * finish_loading_summary() - Finish loading slab summary data. + * @completion: The vio which was used to read the summary data. + * + * Combines the slab summary data from all the previously written zones and + * copies the combined summary to each partition's data region. Then writes + * the combined summary back out to disk. This callback is registered in + * vdo_load_slab_summary(). + */ +static void finish_loading_summary(struct vdo_completion *completion) +{ + struct slab_summary *summary = completion->parent; + + /* Combine the zones so each zone is correct for all slabs. */ + combine_zones(summary); + + /* Write the combined summary back out.
*/ + submit_metadata_vio(as_vio(completion), + summary->origin, + write_summary_endio, + handle_combining_error, + REQ_OP_WRITE); +} + +static void load_summary_endio(struct bio *bio) +{ + struct vio *vio = bio->bi_private; + + continue_vio_after_io(vio, + finish_loading_summary, + vdo_from_vio(vio)->thread_config->admin_thread); +} + +/** + * vdo_load_slab_summary() - Load slab summary data. + * @summary: The summary to load. + * @operation: The type of load to perform. + * @zones_to_combine: The number of zones to be combined; if set to 0, + * all of the summary will be initialized as new. + * @parent: The parent of this operation. + * + * Reads in all the slab summary data from the slab summary partition, + * combines all the previously used zones into a single zone, and then writes + * the combined summary back out to each possible zone's summary region. + */ +void vdo_load_slab_summary(struct slab_summary *summary, + const struct admin_state_code *operation, + zone_count_t zones_to_combine, + struct vdo_completion *parent) +{ + struct vio *vio; + block_count_t blocks; + int result; + + struct slab_summary_zone *zone = summary->zones[0]; + + if (!vdo_start_loading(&zone->state, operation, parent, NULL)) { + return; + } + + blocks = summary->blocks_per_zone * MAX_VDO_PHYSICAL_ZONES; + result = create_multi_block_metadata_vio(parent->vdo, + VIO_TYPE_SLAB_SUMMARY, + VIO_PRIORITY_METADATA, + summary, + blocks, + (char *) summary->entries, + &vio); + if (result != VDO_SUCCESS) { + vdo_finish_loading_with_result(&zone->state, result); + return; + } + + if ((operation == VDO_ADMIN_STATE_FORMATTING) || + (operation == VDO_ADMIN_STATE_LOADING_FOR_REBUILD)) { + finish_loading_summary(vio_as_completion(vio)); + return; + } + + summary->zones_to_combine = zones_to_combine; + submit_metadata_vio(vio, + summary->origin, + load_summary_endio, + handle_combining_error, + REQ_OP_READ); +} + +/** + * vdo_get_slab_summary_statistics() - Fetch the cumulative statistics for all + * slab summary zones in a summary. + * @summary: The summary in question. + * + * Return: The cumulative slab summary statistics for the summary. + */ +struct slab_summary_statistics +vdo_get_slab_summary_statistics(const struct slab_summary *summary) +{ + const struct atomic_slab_summary_statistics *atoms = + &summary->statistics; + return (struct slab_summary_statistics) { + .blocks_written = atomic64_read(&atoms->blocks_written), + }; +} diff --git a/vdo/slab-summary.h b/vdo/slab-summary.h new file mode 100644 index 00000000..5fd2117d --- /dev/null +++ b/vdo/slab-summary.h @@ -0,0 +1,182 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright Red Hat + */ + +#ifndef SLAB_SUMMARY_H +#define SLAB_SUMMARY_H + +#include + +#include "admin-state.h" +#include "completion.h" +#include "slab.h" +#include "slab-summary-format.h" +#include "statistics.h" +#include "types.h" +#include "vdo-layout.h" +#include "wait-queue.h" + +/* + * The slab_summary provides hints during load and recovery about the state + * of the slabs in order to avoid the need to read the slab journals in their + * entirety before a VDO can come online. + * + * The information in the summary for each slab includes the rough number of + * free blocks (which is used to prioritize scrubbing), the cleanliness of a + * slab (so that clean slabs containing free space will be used on restart), + * and the location of the tail block of the slab's journal.
+ * + * The slab_summary has its own partition at the end of the volume which is + * sized to allow for a complete copy of the summary for each of up to 16 + * physical zones. + * + * During resize, the slab_summary moves its backing partition and is saved + * once moved; the slab_summary is not permitted to overwrite the previous + * recovery journal space. + * + * The slab_summary does not have its own version information, but relies on + * the VDO volume version number. + */ + +/* + * A slab status is a very small structure for use in determining the ordering + * of slabs in the scrubbing process. + */ +struct slab_status { + slab_count_t slab_number; + bool is_clean; + uint8_t emptiness; +}; + +struct slab_summary_block { + /* The zone to which this block belongs */ + struct slab_summary_zone *zone; + /* The index of this block in its zone's summary */ + block_count_t index; + /* Whether this block has a write outstanding */ + bool writing; + /* Ring of updates waiting on the outstanding write */ + struct wait_queue current_update_waiters; + /* Ring of updates waiting on the next write */ + struct wait_queue next_update_waiters; + /* The active slab_summary_entry array for this block */ + struct slab_summary_entry *entries; + /* The vio used to write this block */ + struct vio *vio; + /* The packed entries, one block long, backing the vio */ + char *outgoing_entries; +}; + +/* + * The statistics for all the slab summary zones owned by this slab summary. + * These fields are all mutated only by their physical zone threads, but are + * read by other threads when gathering statistics for the entire depot. + */ +struct atomic_slab_summary_statistics { + /* Number of blocks written */ + atomic64_t blocks_written; +}; + +struct slab_summary_zone { + /* The summary of which this is a zone */ + struct slab_summary *summary; + /* The number of this zone */ + zone_count_t zone_number; + /* The thread id of this zone */ + thread_id_t thread_id; + /* Count of the number of blocks currently out for writing */ + block_count_t write_count; + /* The state of this zone */ + struct admin_state state; + /* The array (owned by the blocks) of all entries */ + struct slab_summary_entry *entries; + /* The array of slab_summary_blocks */ + struct slab_summary_block summary_blocks[]; +}; + +struct slab_summary { + /* The context for entering read-only mode */ + struct read_only_notifier *read_only_notifier; + /* The statistics for this slab summary */ + struct atomic_slab_summary_statistics statistics; + /* The start of the slab summary partition relative to the layer */ + physical_block_number_t origin; + /* The number of bits to shift to get a 7-bit fullness hint */ + unsigned int hint_shift; + /* The number of blocks (calculated based on MAX_VDO_SLABS) */ + block_count_t blocks_per_zone; + /* The number of slabs per block (calculated from block size) */ + slab_count_t entries_per_block; + /* The entries for all of the zones the partition can hold */ + struct slab_summary_entry *entries; + /* + * The number of zones which were active at the time of the last update + */ + zone_count_t zones_to_combine; + /* The current number of active zones */ + zone_count_t zone_count; + /* The currently active zones */ + struct slab_summary_zone *zones[]; +}; + +int __must_check +vdo_make_slab_summary(struct vdo *vdo, + struct partition *partition, + const struct thread_config *thread_config, + unsigned int slab_size_shift, + block_count_t maximum_free_blocks_per_slab, + struct read_only_notifier *read_only_notifier, + 
struct slab_summary **slab_summary_ptr); + +void vdo_free_slab_summary(struct slab_summary *summary); + +struct slab_summary_zone * __must_check +vdo_get_slab_summary_for_zone(struct slab_summary *summary, zone_count_t zone); + +void vdo_drain_slab_summary_zone(struct slab_summary_zone *summary_zone, + const struct admin_state_code *operation, + struct vdo_completion *parent); + +void vdo_resume_slab_summary_zone(struct slab_summary_zone *summary_zone, + struct vdo_completion *parent); + +void vdo_update_slab_summary_entry(struct slab_summary_zone *summary_zone, + struct waiter *waiter, + slab_count_t slab_number, + tail_block_offset_t tail_block_offset, + bool load_ref_counts, + bool is_clean, + block_count_t free_blocks); + +tail_block_offset_t __must_check +vdo_get_summarized_tail_block_offset(struct slab_summary_zone *summary_zone, + slab_count_t slab_number); + +bool __must_check vdo_must_load_ref_counts(struct slab_summary_zone *summary_zone, + slab_count_t slab_number); + +bool __must_check +vdo_get_summarized_cleanliness(struct slab_summary_zone *summary_zone, + slab_count_t slab_number); + +block_count_t __must_check +vdo_get_summarized_free_block_count(struct slab_summary_zone *summary_zone, + slab_count_t slab_number); + +void vdo_get_summarized_slab_statuses(struct slab_summary_zone *summary_zone, + slab_count_t slab_count, + struct slab_status *statuses); + +void vdo_set_slab_summary_origin(struct slab_summary *summary, + struct partition *partition); + +void vdo_load_slab_summary(struct slab_summary *summary, + const struct admin_state_code *operation, + zone_count_t zones_to_combine, + struct vdo_completion *parent); + +struct slab_summary_statistics __must_check +vdo_get_slab_summary_statistics(const struct slab_summary *summary); + +#endif /* SLAB_SUMMARY_H */ diff --git a/vdo/slab.c b/vdo/slab.c index 22ee8bd4..01f5a8d6 100644 --- a/vdo/slab.c +++ b/vdo/slab.c @@ -1,46 +1,42 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * Copyright Red Hat - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. 
- * - * $Id: //eng/vdo-releases/sulfur/src/c++/vdo/base/slab.c#29 $ */ #include "slab.h" #include "logger.h" -#include "memoryAlloc.h" +#include "memory-alloc.h" #include "permassert.h" -#include "adminState.h" -#include "blockAllocatorInternals.h" +#include "admin-state.h" +#include "block-allocator.h" #include "completion.h" #include "constants.h" -#include "numUtils.h" -#include "pbnLock.h" -#include "recoveryJournal.h" -#include "refCounts.h" -#include "slabDepot.h" -#include "slabJournal.h" -#include "slabJournalFormat.h" -#include "slabJournalInternals.h" -#include "slabSummary.h" - -/**********************************************************************/ -int make_vdo_slab(physical_block_number_t slab_origin, +#include "num-utils.h" +#include "pbn-lock.h" +#include "recovery-journal.h" +#include "ref-counts.h" +#include "slab-depot.h" +#include "slab-journal.h" +#include "slab-journal-format.h" +#include "slab-summary.h" + +/** + * vdo_make_slab() - Construct a new, empty slab. + * @slab_origin: The physical block number within the block allocator + * partition of the first block in the slab. + * @allocator: The block allocator to which the slab belongs. + * @translation: The translation from the depot's partition to the + * physical storage. + * @recovery_journal: The recovery journal of the VDO. + * @slab_number: The slab number of the slab. + * @is_new: true if this slab is being allocated as part of a resize. + * @slab_ptr: A pointer to receive the new slab. + * + * Return: VDO_SUCCESS or an error code. + */ +int vdo_make_slab(physical_block_number_t slab_origin, struct block_allocator *allocator, physical_block_number_t translation, struct recovery_journal *recovery_journal, @@ -49,10 +45,11 @@ int make_vdo_slab(physical_block_number_t slab_origin, struct vdo_slab **slab_ptr) { const struct slab_config *slab_config = - get_vdo_slab_config(allocator->depot); + vdo_get_slab_config(allocator->depot); struct vdo_slab *slab; int result = UDS_ALLOCATE(1, struct vdo_slab, __func__, &slab); + if (result != VDO_SUCCESS) { return result; } @@ -66,25 +63,25 @@ int make_vdo_slab(physical_block_number_t slab_origin, slab->ref_counts_origin = slab_origin + slab_config->data_blocks + translation; slab->journal_origin = - (get_vdo_slab_journal_start_block(slab_config, slab_origin) + (vdo_get_slab_journal_start_block(slab_config, slab_origin) + translation); - result = make_vdo_slab_journal(allocator, slab, recovery_journal, + result = vdo_make_slab_journal(allocator, slab, recovery_journal, &slab->journal); if (result != VDO_SUCCESS) { - free_vdo_slab(slab); + vdo_free_slab(slab); return result; } if (is_new) { - set_vdo_admin_state_code(&slab->state, VDO_ADMIN_STATE_NEW); - result = allocate_ref_counts_for_vdo_slab(slab); + vdo_set_admin_state_code(&slab->state, VDO_ADMIN_STATE_NEW); + result = vdo_allocate_ref_counts_for_slab(slab); if (result != VDO_SUCCESS) { - free_vdo_slab(slab); + vdo_free_slab(slab); return result; } } else { - set_vdo_admin_state_code(&slab->state, + vdo_set_admin_state_code(&slab->state, VDO_ADMIN_STATE_NORMAL_OPERATION); } @@ -92,12 +89,18 @@ int make_vdo_slab(physical_block_number_t slab_origin, return VDO_SUCCESS; } -/**********************************************************************/ -int allocate_ref_counts_for_vdo_slab(struct vdo_slab *slab) +/** + * vdo_allocate_ref_counts_for_slab() - Allocate the reference counts for a + * slab. + * @slab: The slab whose reference counts need allocation. + * + * Return: VDO_SUCCESS or an error code. 
+ */ +int vdo_allocate_ref_counts_for_slab(struct vdo_slab *slab) { struct block_allocator *allocator = slab->allocator; const struct slab_config *slab_config - = get_vdo_slab_config(allocator->depot); + = vdo_get_slab_config(allocator->depot); int result = ASSERT(slab->reference_counts == NULL, "vdo_slab %u doesn't allocate refcounts twice", @@ -106,54 +109,85 @@ int allocate_ref_counts_for_vdo_slab(struct vdo_slab *slab) return result; } - return make_vdo_ref_counts(slab_config->data_blocks, + return vdo_make_ref_counts(slab_config->data_blocks, slab, slab->ref_counts_origin, allocator->read_only_notifier, &slab->reference_counts); } -/**********************************************************************/ -void free_vdo_slab(struct vdo_slab *slab) +/** + * vdo_free_slab() - Destroy a slab. + * @slab: The slab to destroy. + */ +void vdo_free_slab(struct vdo_slab *slab) { if (slab == NULL) { return; } list_del(&slab->allocq_entry); - free_vdo_slab_journal(UDS_FORGET(slab->journal)); - free_vdo_ref_counts(UDS_FORGET(slab->reference_counts)); + vdo_free_slab_journal(UDS_FORGET(slab->journal)); + vdo_free_ref_counts(UDS_FORGET(slab->reference_counts)); UDS_FREE(slab); } -/**********************************************************************/ -zone_count_t get_vdo_slab_zone_number(struct vdo_slab *slab) +/** + * vdo_get_slab_zone_number() - Get the physical zone number of a slab. + * @slab: The slab. + * + * Return: The number of the slab's physical zone. + */ +zone_count_t vdo_get_slab_zone_number(struct vdo_slab *slab) { return slab->allocator->zone_number; } -/**********************************************************************/ -void mark_vdo_slab_replaying(struct vdo_slab *slab) +/** + * vdo_mark_slab_replaying() - Mark a slab as replaying, during offline + * recovery. + * @slab: The slab to mark. + */ +void vdo_mark_slab_replaying(struct vdo_slab *slab) { if (slab->status == VDO_SLAB_REBUILT) { slab->status = VDO_SLAB_REPLAYING; } } -/**********************************************************************/ -void mark_vdo_slab_unrecovered(struct vdo_slab *slab) +/** + * vdo_mark_slab_unrecovered() - Mark a slab as unrecovered, for online + * recovery. + * @slab: The slab to mark. + */ +void vdo_mark_slab_unrecovered(struct vdo_slab *slab) { slab->status = VDO_SLAB_REQUIRES_SCRUBBING; } -/**********************************************************************/ +/** + * get_slab_free_block_count() - Get the current number of free blocks in a + * slab. + * @slab: The slab to query. + * + * Return: The number of free blocks in the slab. + */ block_count_t get_slab_free_block_count(const struct vdo_slab *slab) { return vdo_get_unreferenced_block_count(slab->reference_counts); } -/**********************************************************************/ -int modify_vdo_slab_reference_count(struct vdo_slab *slab, +/** + * vdo_modify_slab_reference_count() - Increment or decrement the reference + * count of a block in a slab. + * @slab: The slab containing the block (may be NULL when referencing the zero + * block). + * @journal_point: The slab journal entry corresponding to this change. + * @operation: The operation to perform on the reference count. + * + * Return: VDO_SUCCESS or an error. + */ +int vdo_modify_slab_reference_count(struct vdo_slab *slab, const struct journal_point *journal_point, struct reference_operation operation) { @@ -169,34 +203,39 @@ int modify_vdo_slab_reference_count(struct vdo_slab *slab, * scrubbing correct the refCount. 
Note that the slab journal has * already captured all refCount updates. */ - if (is_unrecovered_vdo_slab(slab)) { + if (vdo_is_unrecovered_slab(slab)) { sequence_number_t entry_lock = journal_point->sequence_number; - adjust_vdo_slab_journal_block_reference(slab->journal, + + vdo_adjust_slab_journal_block_reference(slab->journal, entry_lock, -1); return VDO_SUCCESS; } result = vdo_adjust_reference_count(slab->reference_counts, operation, - journal_point, - &free_status_changed); + journal_point, + &free_status_changed); if (result != VDO_SUCCESS) { return result; } if (free_status_changed) { - adjust_vdo_free_block_count(slab, - !is_vdo_journal_increment_operation(operation.type)); + vdo_adjust_free_block_count(slab, + !vdo_is_journal_increment_operation(operation.type)); } return VDO_SUCCESS; } -/**********************************************************************/ -void open_vdo_slab(struct vdo_slab *slab) +/** + * vdo_open_slab() - Perform all necessary initialization of a slab necessary + * for allocations. + * @slab: The slab. + */ +void vdo_open_slab(struct vdo_slab *slab) { vdo_reset_search_cursor(slab->reference_counts); - if (is_vdo_slab_journal_blank(slab->journal)) { + if (vdo_is_slab_journal_blank(slab->journal)) { WRITE_ONCE(slab->allocator->statistics.slabs_opened, slab->allocator->statistics.slabs_opened + 1); vdo_dirty_all_reference_blocks(slab->reference_counts); @@ -206,7 +245,16 @@ void open_vdo_slab(struct vdo_slab *slab) } } -/**********************************************************************/ +/** + * vdo_acquire_provisional_reference() - Acquire a provisional reference on + * behalf of a PBN lock if the block it + * locks is unreferenced. + * @slab: The slab which contains the block. + * @pbn: The physical block to reference. + * @lock: The lock. + * + * Return: VDO_SUCCESS or an error. + */ int vdo_acquire_provisional_reference(struct vdo_slab *slab, physical_block_number_t pbn, struct pbn_lock *lock) @@ -224,25 +272,34 @@ int vdo_acquire_provisional_reference(struct vdo_slab *slab, } if (vdo_pbn_lock_has_provisional_reference(lock)) { - adjust_vdo_free_block_count(slab, false); + vdo_adjust_free_block_count(slab, false); } return VDO_SUCCESS; } -/**********************************************************************/ +/** + * vdo_slab_block_number_from_pbn() - Determine the index within the slab of a + * particular physical block number. + * @slab: The slab. + * @physical_block_number: The physical block number. + * @slab_block_number_ptr: A pointer to the slab block number. + * + * Return: VDO_SUCCESS or an error code. + */ int vdo_slab_block_number_from_pbn(struct vdo_slab *slab, physical_block_number_t physical_block_number, slab_block_number *slab_block_number_ptr) { uint64_t slab_block_number; + if (physical_block_number < slab->start) { return VDO_OUT_OF_RANGE; } slab_block_number = physical_block_number - slab->start; if (slab_block_number - >= get_vdo_slab_config(slab->allocator->depot)->data_blocks) { + >= vdo_get_slab_config(slab->allocator->depot)->data_blocks) { return VDO_OUT_OF_RANGE; } @@ -250,130 +307,181 @@ int vdo_slab_block_number_from_pbn(struct vdo_slab *slab, return VDO_SUCCESS; } -/**********************************************************************/ -bool should_save_fully_built_vdo_slab(const struct vdo_slab *slab) +/** + * vdo_should_save_fully_built_slab() - Check whether the reference counts for + * a given rebuilt slab should be saved. + * @slab: The slab to check. + * + * Return: true if the slab should be saved. 
+ */ +bool vdo_should_save_fully_built_slab(const struct vdo_slab *slab) { - // Write out the ref_counts if the slab has written them before, or it - // has any non-zero reference counts, or there are any slab journal - // blocks. + /* + * Write out the ref_counts if the slab has written them before, or it + * has any non-zero reference counts, or there are any slab journal + * blocks. + */ block_count_t data_blocks = - get_vdo_slab_config(slab->allocator->depot)->data_blocks; + vdo_get_slab_config(slab->allocator->depot)->data_blocks; return (vdo_must_load_ref_counts(slab->allocator->summary, slab->slab_number) || (get_slab_free_block_count(slab) != data_blocks) - || !is_vdo_slab_journal_blank(slab->journal)); + || !vdo_is_slab_journal_blank(slab->journal)); } /** - * Initiate a slab action. + * initiate_slab_action() - Initiate a slab action. * * Implements vdo_admin_initiator. - **/ + */ static void initiate_slab_action(struct admin_state *state) { struct vdo_slab *slab = container_of(state, struct vdo_slab, state); - if (is_vdo_state_draining(state)) { + if (vdo_is_state_draining(state)) { const struct admin_state_code *operation = - get_vdo_admin_state_code(state); + vdo_get_admin_state_code(state); if (operation == VDO_ADMIN_STATE_SCRUBBING) { slab->status = VDO_SLAB_REBUILDING; } - drain_vdo_slab_journal(slab->journal); + vdo_drain_slab_journal(slab->journal); if (slab->reference_counts != NULL) { - drain_vdo_ref_counts(slab->reference_counts); + vdo_drain_ref_counts(slab->reference_counts); } - check_if_vdo_slab_drained(slab); + vdo_check_if_slab_drained(slab); return; } - if (is_vdo_state_loading(state)) { - decode_vdo_slab_journal(slab->journal); + if (vdo_is_state_loading(state)) { + vdo_decode_slab_journal(slab->journal); return; } - if (is_vdo_state_resuming(state)) { - queue_vdo_slab(slab); - finish_vdo_resuming(state); + if (vdo_is_state_resuming(state)) { + vdo_queue_slab(slab); + vdo_finish_resuming(state); return; } - finish_vdo_operation(state, VDO_INVALID_ADMIN_STATE); + vdo_finish_operation(state, VDO_INVALID_ADMIN_STATE); } -/**********************************************************************/ -void start_vdo_slab_action(struct vdo_slab *slab, +/** + * vdo_start_slab_action() - Start an administrative operation on a slab. + * @slab: The slab to load. + * @operation: The type of load to perform. + * @parent: The object to notify when the operation is complete. + */ +void vdo_start_slab_action(struct vdo_slab *slab, const struct admin_state_code *operation, struct vdo_completion *parent) { - start_vdo_operation_with_waiter(&slab->state, operation, parent, + vdo_start_operation_with_waiter(&slab->state, operation, parent, initiate_slab_action); } -/**********************************************************************/ -void notify_vdo_slab_journal_is_loaded(struct vdo_slab *slab, int result) +/** + * vdo_notify_slab_journal_is_loaded() - Inform a slab that its journal has + * been loaded. + * @slab: The slab whose journal has been loaded. + * @result: The result of the load operation. + */ +void vdo_notify_slab_journal_is_loaded(struct vdo_slab *slab, int result) { - if ((result == VDO_SUCCESS) && is_vdo_state_clean_load(&slab->state)) { - // Since this is a normal or new load, we don't need the memory - // to read and process the recovery journal, so we can allocate - // reference counts now. 
- result = allocate_ref_counts_for_vdo_slab(slab); + if ((result == VDO_SUCCESS) && vdo_is_state_clean_load(&slab->state)) { + /* + * Since this is a normal or new load, we don't need the memory + * to read and process the recovery journal, so we can allocate + * reference counts now. + */ + result = vdo_allocate_ref_counts_for_slab(slab); } - finish_vdo_loading_with_result(&slab->state, result); + vdo_finish_loading_with_result(&slab->state, result); } -/**********************************************************************/ -bool is_vdo_slab_open(struct vdo_slab *slab) +/** + * vdo_is_slab_open() - Check whether a slab is open, i.e. is neither + * quiescent nor quiescing. + * @slab: The slab to check. + * + * Return: true if the slab is open. + */ +bool vdo_is_slab_open(struct vdo_slab *slab) { - return (!is_vdo_state_quiescing(&slab->state) && - !is_vdo_state_quiescent(&slab->state)); + return (!vdo_is_state_quiescing(&slab->state) && + !vdo_is_state_quiescent(&slab->state)); } -/**********************************************************************/ -bool is_vdo_slab_draining(struct vdo_slab *slab) +/** + * vdo_is_slab_draining() - Check whether a slab is currently draining. + * @slab: The slab to check. + * + * Return: true if the slab is performing a drain operation. + */ +bool vdo_is_slab_draining(struct vdo_slab *slab) { - return is_vdo_state_draining(&slab->state); + return vdo_is_state_draining(&slab->state); } -/**********************************************************************/ -void check_if_vdo_slab_drained(struct vdo_slab *slab) +/** + * vdo_check_if_slab_drained() - Check whether a slab has drained, and if so, + * send a notification thereof. + * @slab: The slab to check. + */ +void vdo_check_if_slab_drained(struct vdo_slab *slab) { - if (is_vdo_state_draining(&slab->state) && - !is_vdo_slab_journal_active(slab->journal) && + if (vdo_is_state_draining(&slab->state) && + !vdo_is_slab_journal_active(slab->journal) && ((slab->reference_counts == NULL) || - !are_vdo_ref_counts_active(slab->reference_counts))) { + !vdo_are_ref_counts_active(slab->reference_counts))) { int result = (vdo_is_read_only(slab->allocator->read_only_notifier) ? VDO_READ_ONLY : VDO_SUCCESS); - finish_vdo_draining_with_result(&slab->state, result); + vdo_finish_draining_with_result(&slab->state, result); } } -/**********************************************************************/ -void notify_vdo_slab_ref_counts_are_drained(struct vdo_slab *slab, int result) +/** + * vdo_notify_slab_ref_counts_are_drained() - Inform a slab that its + * ref_counts have finished + * draining. + * @slab: The slab whose ref_counts object has been drained. + * @result: The result of the drain operation. + */ +void vdo_notify_slab_ref_counts_are_drained(struct vdo_slab *slab, int result) { - finish_vdo_draining_with_result(&slab->state, result); + vdo_finish_draining_with_result(&slab->state, result); } -/**********************************************************************/ -bool is_vdo_slab_resuming(struct vdo_slab *slab) +/** + * vdo_is_slab_resuming() - Check whether a slab is currently resuming. + * @slab: The slab to check. + * + * Return: true if the slab is performing a resume operation. 
+ */ +bool vdo_is_slab_resuming(struct vdo_slab *slab) { - return is_vdo_state_resuming(&slab->state); + return vdo_is_state_resuming(&slab->state); } -/**********************************************************************/ -void finish_scrubbing_vdo_slab(struct vdo_slab *slab) +/** + * vdo_finish_scrubbing_slab() - Finish scrubbing a slab. + * @slab: The slab whose reference counts have been rebuilt from its journal. + * + * Finishes scrubbing a slab now that it has been rebuilt by updating its + * status, queueing it for allocation, and reopening its journal. + */ +void vdo_finish_scrubbing_slab(struct vdo_slab *slab) { slab->status = VDO_SLAB_REBUILT; - queue_vdo_slab(slab); - reopen_vdo_slab_journal(slab->journal); + vdo_queue_slab(slab); + vdo_reopen_slab_journal(slab->journal); } -/**********************************************************************/ static const char *status_to_string(enum slab_rebuild_status status) { switch (status) { @@ -392,12 +500,17 @@ static const char *status_to_string(enum slab_rebuild_status status) } } -/**********************************************************************/ -void dump_vdo_slab(const struct vdo_slab *slab) +/** + * vdo_dump_slab() - Dump information about a slab to the log for debugging. + * @slab: The slab to dump. + */ +void vdo_dump_slab(const struct vdo_slab *slab) { if (slab->reference_counts != NULL) { - // Terse because there are a lot of slabs to dump and syslog is - // lossy. + /* + * Terse because there are a lot of slabs to dump and syslog is + * lossy. + */ uds_log_info("slab %u: P%u, %llu free", slab->slab_number, slab->priority, @@ -407,10 +520,10 @@ void dump_vdo_slab(const struct vdo_slab *slab) status_to_string(slab->status)); } - dump_vdo_slab_journal(slab->journal); + vdo_dump_slab_journal(slab->journal); if (slab->reference_counts != NULL) { - dump_vdo_ref_counts(slab->reference_counts); + vdo_dump_ref_counts(slab->reference_counts); } else { uds_log_info("refCounts is null"); } diff --git a/vdo/slab.h b/vdo/slab.h index 3561c469..33c439f8 100644 --- a/vdo/slab.h +++ b/vdo/slab.h @@ -1,22 +1,6 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ /* * Copyright Red Hat - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/sulfur/src/c++/vdo/base/slab.h#15 $ */ #ifndef VDO_SLAB_H @@ -26,93 +10,75 @@ #include "permassert.h" -#include "adminState.h" -#include "fixedLayout.h" -#include "journalPoint.h" -#include "referenceOperation.h" +#include "admin-state.h" +#include "journal-point.h" +#include "reference-operation.h" #include "types.h" enum slab_rebuild_status { - VDO_SLAB_REBUILT = 0, + VDO_SLAB_REBUILT, VDO_SLAB_REPLAYING, VDO_SLAB_REQUIRES_SCRUBBING, VDO_SLAB_REQUIRES_HIGH_PRIORITY_SCRUBBING, VDO_SLAB_REBUILDING, }; -/** +/* * This is the type declaration for the vdo_slab type. 
A vdo_slab currently * consists of a run of 2^23 data blocks, but that will soon change to * dedicate a small number of those blocks for metadata storage for the * reference counts and slab journal for the slab. - **/ + */ struct vdo_slab { - /** A list entry to queue this slab in a block_allocator list */ + /* A list entry to queue this slab in a block_allocator list */ struct list_head allocq_entry; - /** The struct block_allocator that owns this slab */ + /* The struct block_allocator that owns this slab */ struct block_allocator *allocator; - /** The reference counts for the data blocks in this slab */ + /* The reference counts for the data blocks in this slab */ struct ref_counts *reference_counts; - /** The journal for this slab */ + /* The journal for this slab */ struct slab_journal *journal; - /** The slab number of this slab */ + /* The slab number of this slab */ slab_count_t slab_number; - /** + /* * The offset in the allocator partition of the first block in this * slab */ physical_block_number_t start; - /** The offset of the first block past the end of this slab */ + /* The offset of the first block past the end of this slab */ physical_block_number_t end; - /** The starting translated PBN of the slab journal */ + /* The starting translated PBN of the slab journal */ physical_block_number_t journal_origin; - /** The starting translated PBN of the reference counts */ + /* The starting translated PBN of the reference counts */ physical_block_number_t ref_counts_origin; - /** The administrative state of the slab */ + /* The administrative state of the slab */ struct admin_state state; - /** The status of the slab */ + /* The status of the slab */ enum slab_rebuild_status status; - /** Whether the slab was ever queued for scrubbing */ + /* Whether the slab was ever queued for scrubbing */ bool was_queued_for_scrubbing; - /** The priority at which this slab has been queued for allocation */ + /* The priority at which this slab has been queued for allocation */ uint8_t priority; }; /** - * Convert a vdo_slab's list entry back to the vdo_slab. + * vdo_slab_from_list_entry() - Convert a vdo_slab's list entry back to the + * vdo_slab. + * @entry: The list entry to convert. * - * @param entry The list entry to convert - * - * @return The list entry as a vdo_slab - **/ + * Return: The list entry as a vdo_slab. + */ static inline struct vdo_slab *vdo_slab_from_list_entry(struct list_head *entry) { return list_entry(entry, struct vdo_slab, allocq_entry); } -/** - * Construct a new, empty slab. 
- * - * @param [in] slab_origin The physical block number within the block - * allocator partition of the first block in the - * slab - * @param [in] allocator The block allocator to which the slab belongs - * @param [in] translation The translation from the depot's partition to - * the physical storage - * @param [in] recovery_journal The recovery journal of the VDO - * @param [in] slab_number The slab number of the slab - * @param [in] is_new true if this slab is being - * allocated as part of a resize - * @param [out] slab_ptr A pointer to receive the new slab - * - * @return VDO_SUCCESS or an error code - **/ -int __must_check make_vdo_slab(physical_block_number_t slab_origin, +int __must_check vdo_make_slab(physical_block_number_t slab_origin, struct block_allocator *allocator, physical_block_number_t translation, struct recovery_journal *recovery_journal, @@ -120,226 +86,89 @@ int __must_check make_vdo_slab(physical_block_number_t slab_origin, bool is_new, struct vdo_slab **slab_ptr); -/** - * Allocate the reference counts for a slab. - * - * @param slab The slab whose reference counts need allocation. - * - * @return VDO_SUCCESS or an error code - **/ -int __must_check allocate_ref_counts_for_vdo_slab(struct vdo_slab *slab); +int __must_check vdo_allocate_ref_counts_for_slab(struct vdo_slab *slab); -/** - * Destroy a slab. - * - * @param slab The slab to destroy - **/ -void free_vdo_slab(struct vdo_slab *slab); +void vdo_free_slab(struct vdo_slab *slab); -/** - * Get the physical zone number of a slab. - * - * @param slab The slab - * - * @return The number of the slab's physical zone - **/ -zone_count_t __must_check get_vdo_slab_zone_number(struct vdo_slab *slab); +zone_count_t __must_check vdo_get_slab_zone_number(struct vdo_slab *slab); /** - * Check whether a slab is unrecovered. - * - * @param slab The slab to check + * vdo_is_unrecovered_slab() - Check whether a slab is unrecovered. + * @slab: The slab to check. * - * @return true if the slab is unrecovered - **/ -static inline bool is_unrecovered_vdo_slab(const struct vdo_slab *slab) + * Return: true if the slab is unrecovered. + */ +static inline bool vdo_is_unrecovered_slab(const struct vdo_slab *slab) { return (slab->status != VDO_SLAB_REBUILT); } /** - * Check whether a slab is being replayed into. - * - * @param slab The slab to check + * vdo_is_replaying_slab() - Check whether a slab is being replayed into. + * @slab: The slab to check. * - * @return true if the slab is replaying - **/ -static inline bool is_replaying_vdo_slab(const struct vdo_slab *slab) + * Return: true if the slab is replaying. + */ +static inline bool vdo_is_replaying_slab(const struct vdo_slab *slab) { return (slab->status == VDO_SLAB_REPLAYING); } /** - * Check whether a slab is being rebuilt. - * - * @param slab The slab to check + * vdo_is_slab_rebuilding() - Check whether a slab is being rebuilt. + * @slab: The slab to check. * - * @return true if the slab is being rebuilt - **/ -static inline bool is_vdo_slab_rebuilding(const struct vdo_slab *slab) + * Return: true if the slab is being rebuilt. + */ +static inline bool vdo_is_slab_rebuilding(const struct vdo_slab *slab) { return (slab->status == VDO_SLAB_REBUILDING); } -/** - * Mark a slab as replaying, during offline recovery. - * - * @param slab The slab to mark - **/ -void mark_vdo_slab_replaying(struct vdo_slab *slab); +void vdo_mark_slab_replaying(struct vdo_slab *slab); -/** - * Mark a slab as unrecovered, for online recovery. 
- * - * @param slab The slab to mark - **/ -void mark_vdo_slab_unrecovered(struct vdo_slab *slab); +void vdo_mark_slab_unrecovered(struct vdo_slab *slab); -/** - * Perform all necessary initialization of a slab necessary for allocations. - * - * @param slab The slab - **/ -void open_vdo_slab(struct vdo_slab *slab); +void vdo_open_slab(struct vdo_slab *slab); -/** - * Get the current number of free blocks in a slab. - * - * @param slab The slab to query - * - * @return the number of free blocks in the slab - **/ block_count_t __must_check get_slab_free_block_count(const struct vdo_slab *slab); -/** - * Increment or decrement the reference count of a block in a slab. - * - * @param slab The slab containing the block (may be NULL when - * referencing the zero block) - * @param journal_point The slab journal entry corresponding to this change - * @param operation The operation to perform on the reference count - * - * @return VDO_SUCCESS or an error - **/ int __must_check -modify_vdo_slab_reference_count(struct vdo_slab *slab, +vdo_modify_slab_reference_count(struct vdo_slab *slab, const struct journal_point *journal_point, struct reference_operation operation); -/** - * Acquire a provisional reference on behalf of a PBN lock if the block it - * locks is unreferenced. - * - * @param slab The slab which contains the block - * @param pbn The physical block to reference - * @param lock The lock - * - * @return VDO_SUCCESS or an error - **/ int __must_check vdo_acquire_provisional_reference(struct vdo_slab *slab, physical_block_number_t pbn, struct pbn_lock *lock); -/** - * Determine the index within the slab of a particular physical block number. - * - * @param [in] slab The slab - * @param [in] physical_block_number The physical block number - * @param [out] slab_block_number_ptr A pointer to the slab block number - * - * @return VDO_SUCCESS or an error code - **/ int __must_check vdo_slab_block_number_from_pbn(struct vdo_slab *slab, physical_block_number_t physical_block_number, slab_block_number *slab_block_number_ptr); -/** - * Check whether the reference counts for a given rebuilt slab should be saved. - * - * @param slab The slab to check - * - * @return true if the slab should be saved - **/ -bool __must_check should_save_fully_built_vdo_slab(const struct vdo_slab *slab); +bool __must_check vdo_should_save_fully_built_slab(const struct vdo_slab *slab); -/** - * Start an administrative operation on a slab. - * - * @param slab The slab to load - * @param operation The type of load to perform - * @param parent The object to notify when the operation is complete - **/ -void start_vdo_slab_action(struct vdo_slab *slab, +void vdo_start_slab_action(struct vdo_slab *slab, const struct admin_state_code *operation, struct vdo_completion *parent); -/** - * Inform a slab that its journal has been loaded. - * - * @param slab The slab whose journal has been loaded - * @param result The result of the load operation - **/ -void notify_vdo_slab_journal_is_loaded(struct vdo_slab *slab, int result); +void vdo_notify_slab_journal_is_loaded(struct vdo_slab *slab, int result); -/** - * Check whether a slab is open, i.e. is neither quiescent nor quiescing. - * - * @param slab The slab to check - * - * @return true if the slab is open - **/ -bool __must_check is_vdo_slab_open(struct vdo_slab *slab); +bool __must_check vdo_is_slab_open(struct vdo_slab *slab); -/** - * Check whether a slab is currently draining. 
- * - * @param slab The slab to check - * - * @return true if the slab is performing a drain operation - **/ -bool __must_check is_vdo_slab_draining(struct vdo_slab *slab); +bool __must_check vdo_is_slab_draining(struct vdo_slab *slab); -/** - * Check whether a slab has drained, and if so, send a notification thereof. - * - * @param slab The slab to check - **/ -void check_if_vdo_slab_drained(struct vdo_slab *slab); +void vdo_check_if_slab_drained(struct vdo_slab *slab); -/** - * Inform a slab that its ref_counts have finished draining. - * - * @param slab The slab whose ref_counts object has been drained - * @param result The result of the drain operation - **/ -void notify_vdo_slab_ref_counts_are_drained(struct vdo_slab *slab, int result); +void vdo_notify_slab_ref_counts_are_drained(struct vdo_slab *slab, int result); -/** - * Check whether a slab is currently resuming. - * - * @param slab The slab to check - * - * @return true if the slab is performing a resume operation - **/ -bool __must_check is_vdo_slab_resuming(struct vdo_slab *slab); +bool __must_check vdo_is_slab_resuming(struct vdo_slab *slab); -/** - * Finish scrubbing a slab now that it has been rebuilt by updating its status, - * queueing it for allocation, and reopening its journal. - * - * @param slab The slab whose reference counts have been rebuilt from its - * journal - **/ -void finish_scrubbing_vdo_slab(struct vdo_slab *slab); +void vdo_finish_scrubbing_slab(struct vdo_slab *slab); -/** - * Dump information about a slab to the log for debugging. - * - * @param slab The slab to dump - **/ -void dump_vdo_slab(const struct vdo_slab *slab); +void vdo_dump_slab(const struct vdo_slab *slab); -#endif // VDO_SLAB_H +#endif /* VDO_SLAB_H */ diff --git a/vdo/slabDepot.c b/vdo/slabDepot.c deleted file mode 100644 index 3135da11..00000000 --- a/vdo/slabDepot.c +++ /dev/null @@ -1,883 +0,0 @@ -/* - * Copyright Red Hat - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. 
- * - * $Id: //eng/vdo-releases/sulfur/src/c++/vdo/base/slabDepot.c#44 $ - */ - -#include "slabDepot.h" - -#include - -#include "logger.h" -#include "memoryAlloc.h" -#include "permassert.h" - - -#include "actionManager.h" -#include "adminState.h" -#include "blockAllocatorInternals.h" -#include "completion.h" -#include "constants.h" -#include "header.h" -#include "numUtils.h" -#include "readOnlyNotifier.h" -#include "refCounts.h" -#include "slab.h" -#include "slabDepotFormat.h" -#include "slabDepotInternals.h" -#include "slabJournal.h" -#include "slabIterator.h" -#include "slabSummary.h" -#include "statusCodes.h" -#include "threadConfig.h" -#include "types.h" -#include "vdo.h" -#include "vdoInternal.h" -#include "vdoState.h" - -/**********************************************************************/ -static -slab_count_t vdo_calculate_slab_count(struct slab_depot *depot) -{ - return compute_vdo_slab_count(depot->first_block, depot->last_block, - depot->slab_size_shift); -} - -/** - * Get an iterator over all the slabs in the depot. - * - * @param depot The depot - * - * @return An iterator over the depot's slabs - **/ -static struct slab_iterator get_slab_iterator(struct slab_depot *depot) -{ - return vdo_iterate_slabs(depot->slabs, depot->slab_count - 1, 0, 1); -} - -/** - * Allocate a new slab pointer array. Any existing slab pointers will be - * copied into the new array, and slabs will be allocated as needed. The - * newly allocated slabs will not be distributed for use by the block - * allocators. - * - * @param depot The depot - * @param slab_count The number of slabs the depot should have in the new - * array - * - * @return VDO_SUCCESS or an error code - **/ -static int allocate_slabs(struct slab_depot *depot, slab_count_t slab_count) -{ - block_count_t slab_size; - bool resizing = false; - physical_block_number_t slab_origin; - block_count_t translation; - - int result = UDS_ALLOCATE(slab_count, - struct vdo_slab *, - "slab pointer array", - &depot->new_slabs); - if (result != VDO_SUCCESS) { - return result; - } - - if (depot->slabs != NULL) { - memcpy(depot->new_slabs, - depot->slabs, - depot->slab_count * sizeof(struct vdo_slab *)); - resizing = true; - } - - slab_size = get_vdo_slab_config(depot)->slab_blocks; - slab_origin = depot->first_block + (depot->slab_count * slab_size); - - // The translation between allocator partition PBNs and layer PBNs. - translation = depot->origin - depot->first_block; - depot->new_slab_count = depot->slab_count; - while (depot->new_slab_count < slab_count) { - struct block_allocator *allocator = - depot->allocators[depot->new_slab_count % - depot->zone_count]; - struct vdo_slab **slab_ptr = - &depot->new_slabs[depot->new_slab_count]; - result = make_vdo_slab(slab_origin, - allocator, - translation, - depot->vdo->recovery_journal, - depot->new_slab_count, - resizing, - slab_ptr); - if (result != VDO_SUCCESS) { - return result; - } - // Increment here to ensure that vdo_abandon_new_slabs will - // clean up correctly. 
- depot->new_slab_count++; - - slab_origin += slab_size; - } - - return VDO_SUCCESS; -} - -/**********************************************************************/ -void vdo_abandon_new_slabs(struct slab_depot *depot) -{ - slab_count_t i; - - if (depot->new_slabs == NULL) { - return; - } - - for (i = depot->slab_count; i < depot->new_slab_count; i++) { - free_vdo_slab(UDS_FORGET(depot->new_slabs[i])); - } - depot->new_slab_count = 0; - depot->new_size = 0; - UDS_FREE(UDS_FORGET(depot->new_slabs)); -} - -/** - * Get the ID of the thread on which a given allocator operates. - * - *

Implements vdo_zone_thread_getter. - **/ -static thread_id_t get_allocator_thread_id(void *context, - zone_count_t zone_number) -{ - return vdo_get_block_allocator_for_zone(context, zone_number)->thread_id; -} - -/** - * Prepare to commit oldest tail blocks. - * - *

Implements vdo_action_preamble. - **/ -static void prepare_for_tail_block_commit(void *context, - struct vdo_completion *parent) -{ - struct slab_depot *depot = context; - depot->active_release_request = depot->new_release_request; - complete_vdo_completion(parent); -} - -/** - * Schedule a tail block commit if necessary. This method should not be called - * directly. Rather, call schedule_vdo_default_action() on the depot's action - * manager. - * - *

Implements vdo_action_scheduler. - **/ -static bool schedule_tail_block_commit(void *context) -{ - struct slab_depot *depot = context; - if (depot->new_release_request == depot->active_release_request) { - return false; - } - - return schedule_vdo_action(depot->action_manager, - prepare_for_tail_block_commit, - release_vdo_tail_block_locks, - NULL, - NULL); -} - -/** - * Allocate those components of the slab depot which are needed only at load - * time, not at format time. - * - * @param depot The depot - * @param summary_partition The partition which holds the slab summary - * - * @return VDO_SUCCESS or an error - **/ -static int allocate_components(struct slab_depot *depot, - struct partition *summary_partition) -{ - zone_count_t zone; - slab_count_t slab_count, i; - const struct thread_config *thread_config = - get_vdo_thread_config(depot->vdo); - int result = - make_vdo_action_manager(depot->zone_count, - get_allocator_thread_id, - thread_config->journal_thread, - depot, - schedule_tail_block_commit, - depot->vdo, - &depot->action_manager); - if (result != VDO_SUCCESS) { - return result; - } - - depot->origin = depot->first_block; - - result = make_vdo_slab_summary(depot->vdo, - summary_partition, - thread_config, - depot->slab_size_shift, - depot->slab_config.data_blocks, - depot->vdo->read_only_notifier, - &depot->slab_summary); - if (result != VDO_SUCCESS) { - return result; - } - - slab_count = vdo_calculate_slab_count(depot); - if (thread_config->physical_zone_count > slab_count) { - return uds_log_error_strerror(VDO_BAD_CONFIGURATION, - "%u physical zones exceeds slab count %u", - thread_config->physical_zone_count, - slab_count); - } - - // Allocate the block allocators. - for (zone = 0; zone < depot->zone_count; zone++) { - thread_id_t thread_id = - vdo_get_physical_zone_thread(thread_config, zone); - result = make_vdo_block_allocator(depot, - zone, - thread_id, - depot->vdo->states.vdo.nonce, - VIO_POOL_SIZE, - depot->vdo, - depot->vdo->read_only_notifier, - &depot->allocators[zone]); - if (result != VDO_SUCCESS) { - return result; - } - } - - // Allocate slabs. - result = allocate_slabs(depot, slab_count); - if (result != VDO_SUCCESS) { - return result; - } - - // Use the new slabs. - for (i = depot->slab_count; i < depot->new_slab_count; i++) { - struct vdo_slab *slab = depot->new_slabs[i]; - register_vdo_slab_with_allocator(slab->allocator, slab); - WRITE_ONCE(depot->slab_count, depot->slab_count + 1); - } - - depot->slabs = depot->new_slabs; - depot->new_slabs = NULL; - depot->new_slab_count = 0; - - return VDO_SUCCESS; -} - -/**********************************************************************/ -int decode_vdo_slab_depot(struct slab_depot_state_2_0 state, - struct vdo *vdo, - struct partition *summary_partition, - struct slab_depot **depot_ptr) -{ - unsigned int slab_size_shift; - struct slab_depot *depot; - int result; - const struct thread_config *thread_config = get_vdo_thread_config(vdo); - - // Calculate the bit shift for efficiently mapping block numbers to - // slabs. Using a shift requires that the slab size be a power of two. 
- block_count_t slab_size = state.slab_config.slab_blocks; - if (!is_power_of_2(slab_size)) { - return uds_log_error_strerror(UDS_INVALID_ARGUMENT, - "slab size must be a power of two"); - } - slab_size_shift = log_base_two(slab_size); - - result = UDS_ALLOCATE_EXTENDED(struct slab_depot, - thread_config->physical_zone_count, - struct block_allocator *, - __func__, - &depot); - if (result != VDO_SUCCESS) { - return result; - } - - depot->vdo = vdo; - depot->old_zone_count = state.zone_count; - depot->zone_count = thread_config->physical_zone_count; - depot->slab_config = state.slab_config; - depot->first_block = state.first_block; - depot->last_block = state.last_block; - depot->slab_size_shift = slab_size_shift; - - result = allocate_components(depot, summary_partition); - if (result != VDO_SUCCESS) { - free_vdo_slab_depot(depot); - return result; - } - - *depot_ptr = depot; - return VDO_SUCCESS; -} - -/**********************************************************************/ -void free_vdo_slab_depot(struct slab_depot *depot) -{ - zone_count_t zone = 0; - - if (depot == NULL) { - return; - } - - vdo_abandon_new_slabs(depot); - - for (zone = 0; zone < depot->zone_count; zone++) { - free_vdo_block_allocator(UDS_FORGET(depot->allocators[zone])); - } - - if (depot->slabs != NULL) { - slab_count_t i; - for (i = 0; i < depot->slab_count; i++) { - free_vdo_slab(UDS_FORGET(depot->slabs[i])); - } - } - - UDS_FREE(UDS_FORGET(depot->slabs)); - UDS_FREE(UDS_FORGET(depot->action_manager)); - free_vdo_slab_summary(UDS_FORGET(depot->slab_summary)); - UDS_FREE(depot); -} - -/**********************************************************************/ -struct slab_depot_state_2_0 record_vdo_slab_depot(const struct slab_depot *depot) -{ - /* - * If this depot is currently using 0 zones, it must have been - * synchronously loaded by a tool and is now being saved. We - * did not load and combine the slab summary, so we still need - * to do that next time we load with the old zone count rather - * than 0. 
- */ - struct slab_depot_state_2_0 state; - zone_count_t zones_to_record = depot->zone_count; - if (depot->zone_count == 0) { - zones_to_record = depot->old_zone_count; - } - - state = (struct slab_depot_state_2_0) { - .slab_config = depot->slab_config, - .first_block = depot->first_block, - .last_block = depot->last_block, - .zone_count = zones_to_record, - }; - - return state; -} - -/**********************************************************************/ -int vdo_allocate_slab_ref_counts(struct slab_depot *depot) -{ - struct slab_iterator iterator = get_slab_iterator(depot); - while (vdo_has_next_slab(&iterator)) { - int result = - allocate_ref_counts_for_vdo_slab(vdo_next_slab(&iterator)); - if (result != VDO_SUCCESS) { - return result; - } - } - - return VDO_SUCCESS; -} - -/**********************************************************************/ -struct block_allocator *vdo_get_block_allocator_for_zone(struct slab_depot *depot, - zone_count_t zone_number) -{ - return depot->allocators[zone_number]; -} - -/**********************************************************************/ -static -int vdo_get_slab_number(const struct slab_depot *depot, - physical_block_number_t pbn, - slab_count_t *slab_number_ptr) -{ - slab_count_t slab_number; - if (pbn < depot->first_block) { - return VDO_OUT_OF_RANGE; - } - - slab_number = (pbn - depot->first_block) >> depot->slab_size_shift; - if (slab_number >= depot->slab_count) { - return VDO_OUT_OF_RANGE; - } - - *slab_number_ptr = slab_number; - return VDO_SUCCESS; -} - -/**********************************************************************/ -struct vdo_slab *get_vdo_slab(const struct slab_depot *depot, - physical_block_number_t pbn) -{ - slab_count_t slab_number; - int result; - - if (pbn == VDO_ZERO_BLOCK) { - return NULL; - } - - result = vdo_get_slab_number(depot, pbn, &slab_number); - if (result != VDO_SUCCESS) { - vdo_enter_read_only_mode(depot->vdo->read_only_notifier, result); - return NULL; - } - - return depot->slabs[slab_number]; -} - -/**********************************************************************/ -struct slab_journal *get_vdo_slab_journal(const struct slab_depot *depot, - physical_block_number_t pbn) -{ - struct vdo_slab *slab = get_vdo_slab(depot, pbn); - return ((slab != NULL) ? slab->journal : NULL); -} - -/**********************************************************************/ -uint8_t vdo_get_increment_limit(struct slab_depot *depot, - physical_block_number_t pbn) -{ - struct vdo_slab *slab = get_vdo_slab(depot, pbn); - if ((slab == NULL) || is_unrecovered_vdo_slab(slab)) { - return 0; - } - - return vdo_get_available_references(slab->reference_counts, pbn); -} - -/**********************************************************************/ -bool vdo_is_physical_data_block(const struct slab_depot *depot, - physical_block_number_t pbn) -{ - slab_count_t slab_number; - slab_block_number sbn; - int result; - - if (pbn == VDO_ZERO_BLOCK) { - return true; - } - - if (vdo_get_slab_number(depot, pbn, &slab_number) != VDO_SUCCESS) { - return false; - } - - result = vdo_slab_block_number_from_pbn(depot->slabs[slab_number], - pbn, &sbn); - return (result == VDO_SUCCESS); -} - -/**********************************************************************/ -block_count_t get_vdo_slab_depot_allocated_blocks(const struct slab_depot *depot) -{ - block_count_t total = 0; - zone_count_t zone; - for (zone = 0; zone < depot->zone_count; zone++) { - // The allocators are responsible for thread safety. 
- total += get_vdo_allocated_blocks(depot->allocators[zone]); - } - return total; -} - -/**********************************************************************/ -block_count_t get_vdo_slab_depot_data_blocks(const struct slab_depot *depot) -{ - return (READ_ONCE(depot->slab_count) * depot->slab_config.data_blocks); -} - -/**********************************************************************/ -block_count_t get_vdo_slab_depot_free_blocks(const struct slab_depot *depot) -{ - /* - * We can't ever shrink a volume except when resize fails, and we - * can't allocate from the new slabs until after the resize succeeds, - * so by getting the number of allocated blocks first, we ensure the - * allocated count is always less than the capacity. Doing it in the - * other order on a full volume could lose a race with a successful - * resize, resulting in a nonsensical negative/underflow result. - */ - block_count_t allocated = get_vdo_slab_depot_allocated_blocks(depot); - smp_mb(); - return (get_vdo_slab_depot_data_blocks(depot) - allocated); -} - -/**********************************************************************/ -static -slab_count_t get_vdo_slab_depot_unrecovered_slab_count(const struct slab_depot *depot) -{ - slab_count_t total = 0; - zone_count_t zone; - for (zone = 0; zone < depot->zone_count; zone++) { - // The allocators are responsible for thread safety. - total += get_vdo_unrecovered_slab_count(depot->allocators[zone]); - } - return total; -} - -/** - * The preamble of a load operation which loads the slab summary. - * - *

Implements vdo_action_preamble. - **/ -static void start_depot_load(void *context, struct vdo_completion *parent) -{ - struct slab_depot *depot = context; - load_vdo_slab_summary(depot->slab_summary, - get_current_vdo_manager_operation(depot->action_manager), - depot->old_zone_count, - parent); -} - -/**********************************************************************/ -void load_vdo_slab_depot(struct slab_depot *depot, - const struct admin_state_code *operation, - struct vdo_completion *parent, - void *context) -{ - if (assert_vdo_load_operation(operation, parent)) { - schedule_vdo_operation_with_context(depot->action_manager, - operation, - start_depot_load, - load_vdo_block_allocator, - NULL, - context, - parent); - } -} - -/**********************************************************************/ -void prepare_vdo_slab_depot_to_allocate(struct slab_depot *depot, - enum slab_depot_load_type load_type, - struct vdo_completion *parent) -{ - depot->load_type = load_type; - atomic_set(&depot->zones_to_scrub, depot->zone_count); - schedule_vdo_action(depot->action_manager, - NULL, - prepare_vdo_block_allocator_to_allocate, - NULL, - parent); -} - -/**********************************************************************/ -void update_vdo_slab_depot_size(struct slab_depot *depot) -{ - depot->last_block = depot->new_last_block; -} - -/**********************************************************************/ -int vdo_prepare_to_grow_slab_depot(struct slab_depot *depot, block_count_t new_size) -{ - struct slab_depot_state_2_0 new_state; - int result; - slab_count_t new_slab_count; - - if ((new_size >> depot->slab_size_shift) <= depot->slab_count) { - return VDO_INCREMENT_TOO_SMALL; - } - - // Generate the depot configuration for the new block count. - result = configure_vdo_slab_depot(new_size, - depot->first_block, - depot->slab_config, - depot->zone_count, - &new_state); - if (result != VDO_SUCCESS) { - return result; - } - - new_slab_count = compute_vdo_slab_count(depot->first_block, - new_state.last_block, - depot->slab_size_shift); - if (new_slab_count <= depot->slab_count) { - return uds_log_error_strerror(VDO_INCREMENT_TOO_SMALL, - "Depot can only grow"); - } - if (new_slab_count == depot->new_slab_count) { - // Check it out, we've already got all the new slabs allocated! - return VDO_SUCCESS; - } - - vdo_abandon_new_slabs(depot); - result = allocate_slabs(depot, new_slab_count); - if (result != VDO_SUCCESS) { - vdo_abandon_new_slabs(depot); - return result; - } - - depot->new_size = new_size; - depot->old_last_block = depot->last_block; - depot->new_last_block = new_state.last_block; - - return VDO_SUCCESS; -} - -/** - * Finish registering new slabs now that all of the allocators have received - * their new slabs. - * - *

Implements vdo_action_conclusion. - **/ -static int finish_registration(void *context) -{ - struct slab_depot *depot = context; - WRITE_ONCE(depot->slab_count, depot->new_slab_count); - UDS_FREE(depot->slabs); - depot->slabs = depot->new_slabs; - depot->new_slabs = NULL; - depot->new_slab_count = 0; - return VDO_SUCCESS; -} - -/**********************************************************************/ -void vdo_use_new_slabs(struct slab_depot *depot, struct vdo_completion *parent) -{ - ASSERT_LOG_ONLY(depot->new_slabs != NULL, "Must have new slabs to use"); - schedule_vdo_operation(depot->action_manager, - VDO_ADMIN_STATE_SUSPENDED_OPERATION, - NULL, - register_new_vdo_slabs_for_allocator, - finish_registration, - parent); -} - -/**********************************************************************/ -void drain_vdo_slab_depot(struct slab_depot *depot, - const struct admin_state_code *operation, - struct vdo_completion *parent) -{ - schedule_vdo_operation(depot->action_manager, - operation, - NULL, - drain_vdo_block_allocator, - NULL, - parent); -} - -/**********************************************************************/ -void resume_vdo_slab_depot(struct slab_depot *depot, struct vdo_completion *parent) -{ - if (vdo_is_read_only(depot->vdo->read_only_notifier)) { - finish_vdo_completion(parent, VDO_READ_ONLY); - return; - } - - schedule_vdo_operation(depot->action_manager, - VDO_ADMIN_STATE_RESUMING, - NULL, - resume_vdo_block_allocator, - NULL, - parent); -} - -/**********************************************************************/ -void -vdo_commit_oldest_slab_journal_tail_blocks(struct slab_depot *depot, - sequence_number_t recovery_block_number) -{ - if (depot == NULL) { - return; - } - - depot->new_release_request = recovery_block_number; - schedule_vdo_default_action(depot->action_manager); -} - -/**********************************************************************/ -const struct slab_config *get_vdo_slab_config(const struct slab_depot *depot) -{ - return &depot->slab_config; -} - -/**********************************************************************/ -struct slab_summary *get_vdo_slab_summary(const struct slab_depot *depot) -{ - return depot->slab_summary; -} - -/**********************************************************************/ -struct slab_summary_zone * -get_vdo_slab_summary_for_zone(const struct slab_depot *depot, zone_count_t zone) -{ - if (depot->slab_summary == NULL) { - return NULL; - } - - return vdo_get_slab_summary_for_zone(depot->slab_summary, zone); -} - -/**********************************************************************/ -void vdo_scrub_all_unrecovered_slabs(struct slab_depot *depot, - struct vdo_completion *parent) -{ - schedule_vdo_action(depot->action_manager, - NULL, - scrub_all_unrecovered_vdo_slabs_in_zone, - NULL, - parent); -} - -/**********************************************************************/ -void vdo_notify_zone_finished_scrubbing(struct vdo_completion *completion) -{ - enum vdo_state prior_state; - - struct slab_depot *depot = completion->parent; - if (atomic_add_return(-1, &depot->zones_to_scrub) > 0) { - return; - } - - // We're the last! - prior_state = atomic_cmpxchg(&depot->vdo->state, - VDO_RECOVERING, VDO_DIRTY); - // To be safe, even if the CAS failed, ensure anything that follows is - // ordered with respect to whatever state change did happen. 
- smp_mb__after_atomic(); - - /* - * We must check the VDO state here and not the depot's - * read_only_notifier since the compare-swap-above could have - * failed due to a read-only entry which our own thread does not - * yet know about. - */ - if (prior_state == VDO_DIRTY) { - uds_log_info("VDO commencing normal operation"); - } else if (prior_state == VDO_RECOVERING) { - uds_log_info("Exiting recovery mode"); - } -} - -/**********************************************************************/ -block_count_t get_vdo_slab_depot_new_size(const struct slab_depot *depot) -{ - return (depot->new_slabs == NULL) ? 0 : depot->new_size; -} - - -/** - * Get the total of the statistics from all the block allocators in the depot. - * - * @param depot The slab depot - * - * @return The statistics from all block allocators in the depot - **/ -static struct block_allocator_statistics __must_check -get_depot_block_allocator_statistics(const struct slab_depot *depot) -{ - struct block_allocator_statistics totals; - zone_count_t zone; - memset(&totals, 0, sizeof(totals)); - - for (zone = 0; zone < depot->zone_count; zone++) { - struct block_allocator *allocator = depot->allocators[zone]; - struct block_allocator_statistics stats = - get_vdo_block_allocator_statistics(allocator); - totals.slab_count += stats.slab_count; - totals.slabs_opened += stats.slabs_opened; - totals.slabs_reopened += stats.slabs_reopened; - } - - return totals; -} - -/** - * Get the cumulative ref_counts statistics for the depot. - * - * @param depot The slab depot - * - * @return The cumulative statistics for all ref_counts in the depot - **/ -static struct ref_counts_statistics __must_check -get_depot_ref_counts_statistics(const struct slab_depot *depot) -{ - struct ref_counts_statistics depot_stats; - zone_count_t zone; - memset(&depot_stats, 0, sizeof(depot_stats)); - - for (zone = 0; zone < depot->zone_count; zone++) { - struct block_allocator *allocator = depot->allocators[zone]; - struct ref_counts_statistics stats = - get_vdo_ref_counts_statistics(allocator); - depot_stats.blocks_written += stats.blocks_written; - } - - return depot_stats; -} - -/** - * Get the aggregated slab journal statistics for the depot. 
- * - * @param depot The slab depot - * - * @return The aggregated statistics for all slab journals in the depot - **/ -static struct slab_journal_statistics __must_check -get_depot_slab_journal_statistics(const struct slab_depot *depot) -{ - struct slab_journal_statistics depot_stats; - zone_count_t zone; - memset(&depot_stats, 0, sizeof(depot_stats)); - - for (zone = 0; zone < depot->zone_count; zone++) { - struct block_allocator *allocator = depot->allocators[zone]; - struct slab_journal_statistics stats = - get_vdo_slab_journal_statistics(allocator); - depot_stats.disk_full_count += stats.disk_full_count; - depot_stats.flush_count += stats.flush_count; - depot_stats.blocked_count += stats.blocked_count; - depot_stats.blocks_written += stats.blocks_written; - depot_stats.tail_busy_count += stats.tail_busy_count; - } - - return depot_stats; -} - -/**********************************************************************/ -void get_vdo_slab_depot_statistics(const struct slab_depot *depot, - struct vdo_statistics *stats) -{ - slab_count_t slab_count = READ_ONCE(depot->slab_count); - slab_count_t unrecovered = - get_vdo_slab_depot_unrecovered_slab_count(depot); - - stats->recovery_percentage = - (slab_count - unrecovered) * 100 / slab_count; - stats->allocator = get_depot_block_allocator_statistics(depot); - stats->ref_counts = get_depot_ref_counts_statistics(depot); - stats->slab_journal = get_depot_slab_journal_statistics(depot); - stats->slab_summary = - get_vdo_slab_summary_statistics(depot->slab_summary); -} - -/**********************************************************************/ -void dump_vdo_slab_depot(const struct slab_depot *depot) -{ - uds_log_info("vdo slab depot"); - uds_log_info(" zone_count=%u old_zone_count=%u slabCount=%u active_release_request=%llu new_release_request=%llu", - (unsigned int) depot->zone_count, - (unsigned int) depot->old_zone_count, - READ_ONCE(depot->slab_count), - (unsigned long long) depot->active_release_request, - (unsigned long long) depot->new_release_request); -} diff --git a/vdo/slabDepot.h b/vdo/slabDepot.h deleted file mode 100644 index 3b13fb3e..00000000 --- a/vdo/slabDepot.h +++ /dev/null @@ -1,357 +0,0 @@ -/* - * Copyright Red Hat - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/sulfur/src/c++/vdo/base/slabDepot.h#19 $ - */ - -#ifndef SLAB_DEPOT_H -#define SLAB_DEPOT_H - -#include "adminState.h" -#include "fixedLayout.h" -#include "slabDepotFormat.h" -#include "statistics.h" -#include "types.h" - -/** - * A slab_depot is responsible for managing all of the slabs and block - * allocators of a VDO. It has a single array of slabs in order to eliminate - * the need for additional math in order to compute which physical zone a PBN - * is in. It also has a block_allocator per zone. 
- *
- * Load operations are required to be performed on a single thread. Normal
- * operations are assumed to be performed in the appropriate zone. Allocations
- * and reference count updates must be done from the thread of their physical
- * zone. Requests to commit slab journal tail blocks from the recovery journal
- * must be done on the journal zone thread. Save operations are required to be
- * launched from the same thread as the original load operation.
- **/
-
-enum slab_depot_load_type {
-	VDO_SLAB_DEPOT_NORMAL_LOAD,
-	VDO_SLAB_DEPOT_RECOVERY_LOAD,
-	VDO_SLAB_DEPOT_REBUILD_LOAD
-};
-
-
-/**
- * Make a slab depot and configure it with the state read from the super block.
- *
- * @param [in] state The slab depot state from the super block
- * @param [in] vdo The VDO which will own the depot
- * @param [in] summary_partition The partition which holds the slab summary
- * @param [out] depot_ptr A pointer to hold the depot
- *
- * @return A success or error code
- **/
-int __must_check
-decode_vdo_slab_depot(struct slab_depot_state_2_0 state,
-		      struct vdo *vdo,
-		      struct partition *summary_partition,
-		      struct slab_depot **depot_ptr);
-
-/**
- * Destroy a slab depot.
- *
- * @param depot The depot to destroy
- **/
-void free_vdo_slab_depot(struct slab_depot *depot);
-
-/**
- * Record the state of a slab depot for encoding into the super block.
- *
- * @param depot The depot to encode
- *
- * @return The depot state
- **/
-struct slab_depot_state_2_0 __must_check
-record_vdo_slab_depot(const struct slab_depot *depot);
-
-/**
- * Allocate the ref_counts for all slabs in the depot. This method may be
- * called only before entering normal operation from the load thread.
- *
- * @param depot The depot whose ref_counts need allocation
- *
- * @return VDO_SUCCESS or an error
- **/
-int __must_check vdo_allocate_slab_ref_counts(struct slab_depot *depot);
-
-/**
- * Get the block allocator for a specified physical zone from a depot.
- *
- * @param depot The depot
- * @param zone_number The physical zone
- *
- * @return The block allocator for the specified zone
- **/
-struct block_allocator * __must_check
-vdo_get_block_allocator_for_zone(struct slab_depot *depot,
-				 zone_count_t zone_number);
-
-
-/**
- * Get the slab object for the slab that contains a specified block. Will put
- * the VDO in read-only mode if the PBN is not a valid data block nor the zero
- * block.
- *
- * @param depot The slab depot
- * @param pbn The physical block number
- *
- * @return The slab containing the block, or NULL if the block number is the
- *         zero block or otherwise out of range
- **/
-struct vdo_slab * __must_check
-get_vdo_slab(const struct slab_depot *depot, physical_block_number_t pbn);
-
-/**
- * Get the slab journal for the slab that contains a specified block.
- *
- * @param depot The slab depot
- * @param pbn The physical block number within the block depot partition
- *            of any block in the slab
- *
- * @return The slab journal of the slab containing the block, or NULL if the
- *         block number is for the zero block or otherwise out of range
- **/
-struct slab_journal * __must_check
-get_vdo_slab_journal(const struct slab_depot *depot, physical_block_number_t pbn);
-
-/**
- * Determine how many new references a block can acquire. This method must be
- * called from the physical zone thread of the PBN.
- * - * @param depot The slab depot - * @param pbn The physical block number that is being queried - * - * @return the number of available references - **/ -uint8_t __must_check -vdo_get_increment_limit(struct slab_depot *depot, physical_block_number_t pbn); - -/** - * Determine whether the given PBN refers to a data block. - * - * @param depot The depot - * @param pbn The physical block number to ask about - * - * @return True if the PBN corresponds to a data block - **/ -bool __must_check -vdo_is_physical_data_block(const struct slab_depot *depot, - physical_block_number_t pbn); - -/** - * Get the total number of data blocks allocated across all the slabs in the - * depot, which is the total number of blocks with a non-zero reference count. - * This may be called from any thread. - * - * @param depot The slab depot - * - * @return The total number of blocks with a non-zero reference count - **/ -block_count_t __must_check -get_vdo_slab_depot_allocated_blocks(const struct slab_depot *depot); - -/** - * Get the total number of data blocks in all the slabs in the depot. This may - * be called from any thread. - * - * @param depot The slab depot - * - * @return The total number of data blocks in all slabs - **/ -block_count_t __must_check -get_vdo_slab_depot_data_blocks(const struct slab_depot *depot); - -/** - * Get the total number of free blocks remaining in all the slabs in the - * depot, which is the total number of blocks that have a zero reference - * count. This may be called from any thread. - * - * @param depot The slab depot - * - * @return The total number of blocks with a zero reference count - **/ -block_count_t __must_check -get_vdo_slab_depot_free_blocks(const struct slab_depot *depot); - -/** - * Get all the vdo_statistics fields that are properties of the slab depot. - * - * @param depot The slab depot - * @param stats The vdo statistics structure to partially fill - **/ -void get_vdo_slab_depot_statistics(const struct slab_depot *depot, - struct vdo_statistics *stats); - - -/** - * Asynchronously load any slab depot state that isn't included in the - * super_block component. This method may be called only before entering - * normal operation from the load thread. - * - * @param depot The depot to load - * @param operation The type of load to perform - * @param parent The completion to finish when the load is complete - * @param context Additional context for the load operation; may be NULL - **/ -void load_vdo_slab_depot(struct slab_depot *depot, - const struct admin_state_code *operation, - struct vdo_completion *parent, - void *context); - -/** - * Prepare the slab depot to come online and start allocating blocks. This - * method may be called only before entering normal operation from the load - * thread. It must be called before allocation may proceed. - * - * @param depot The depot to prepare - * @param load_type The load type - * @param parent The completion to finish when the operation is complete - **/ -void prepare_vdo_slab_depot_to_allocate(struct slab_depot *depot, - enum slab_depot_load_type load_type, - struct vdo_completion *parent); - -/** - * Update the slab depot to reflect its new size in memory. This size is saved - * to disk as part of the super block. - * - * @param depot The depot to update - **/ -void update_vdo_slab_depot_size(struct slab_depot *depot); - -/** - * Allocate new memory needed for a resize of a slab depot to the given size. 
- * - * @param depot The depot to prepare to resize - * @param new_size The number of blocks in the new depot - * - * @return VDO_SUCCESS or an error - **/ -int __must_check -vdo_prepare_to_grow_slab_depot(struct slab_depot *depot, block_count_t new_size); - -/** - * Use the new slabs allocated for resize. - * - * @param depot The depot - * @param parent The object to notify when complete - **/ -void vdo_use_new_slabs(struct slab_depot *depot, struct vdo_completion *parent); - -/** - * Abandon any new slabs in this depot, freeing them as needed. - * - * @param depot The depot - **/ -void vdo_abandon_new_slabs(struct slab_depot *depot); - -/** - * Drain all slab depot I/O. If saving, or flushing, all dirty depot metadata - * will be written out. If saving or suspending, the depot will be left in a - * suspended state. - * - * @param depot The depot to drain - * @param operation The drain operation (flush, rebuild, suspend, or save) - * @param parent The completion to finish when the drain is complete - **/ -void drain_vdo_slab_depot(struct slab_depot *depot, - const struct admin_state_code *operation, - struct vdo_completion *parent); - -/** - * Resume a suspended slab depot. - * - * @param depot The depot to resume - * @param parent The completion to finish when the depot has resumed - **/ -void resume_vdo_slab_depot(struct slab_depot *depot, struct vdo_completion *parent); - -/** - * Commit all dirty tail blocks which are locking a given recovery journal - * block. This method must be called from the journal zone thread. - * - * @param depot The depot - * @param recovery_block_number The sequence number of the recovery journal - * block whose locks should be released - **/ -void -vdo_commit_oldest_slab_journal_tail_blocks(struct slab_depot *depot, - sequence_number_t recovery_block_number); - -/** - * Get the slab_config of a depot. - * - * @param depot The slab depot - * - * @return The slab configuration of the specified depot - **/ -const struct slab_config * __must_check -get_vdo_slab_config(const struct slab_depot *depot); - -/** - * Get the slab summary. - * - * @param depot The slab depot - * - * @return The slab summary - **/ -struct slab_summary * __must_check -get_vdo_slab_summary(const struct slab_depot *depot); - -/** - * Get the portion of the slab summary for a given physical zone. - * - * @param depot The slab depot - * @param zone The zone - * - * @return The portion of the slab summary for the specified zone - **/ -struct slab_summary_zone * __must_check -get_vdo_slab_summary_for_zone(const struct slab_depot *depot, zone_count_t zone); - -/** - * Scrub all unrecovered slabs. - * - * @param depot The depot to scrub - * @param parent The object to notify when scrubbing has been launched - * for all zones - **/ -void vdo_scrub_all_unrecovered_slabs(struct slab_depot *depot, - struct vdo_completion *parent); - -/** - * Get the physical size to which this depot is prepared to grow. - * - * @param depot The slab depot - * - * @return The new number of blocks the depot will be grown to, or 0 if the - * depot is not prepared to grow - **/ -block_count_t __must_check get_vdo_slab_depot_new_size(const struct slab_depot *depot); - -/** - * Dump the slab depot, in a thread-unsafe fashion. 
- * - * @param depot The slab depot - **/ -void dump_vdo_slab_depot(const struct slab_depot *depot); - -#endif // SLAB_DEPOT_H diff --git a/vdo/slabDepotFormat.h b/vdo/slabDepotFormat.h deleted file mode 100644 index e8a11b16..00000000 --- a/vdo/slabDepotFormat.h +++ /dev/null @@ -1,128 +0,0 @@ -/* - * Copyright Red Hat - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/sulfur/src/c++/vdo/base/slabDepotFormat.h#7 $ - */ - -#ifndef SLAB_DEPOT_FORMAT_H -#define SLAB_DEPOT_FORMAT_H - -#include "buffer.h" - -#include "header.h" -#include "types.h" - -struct slab_depot_state_2_0 { - struct slab_config slab_config; - physical_block_number_t first_block; - physical_block_number_t last_block; - zone_count_t zone_count; -} __packed; - -extern const struct header VDO_SLAB_DEPOT_HEADER_2_0; - -/** - * Compute the number of slabs a depot with given parameters would have. - * - * @param first_block PBN of the first data block - * @param last_block PBN of the last data block - * @param slab_size_shift Exponent for the number of blocks per slab - * - * @return The number of slabs - **/ -slab_count_t __must_check -compute_vdo_slab_count(physical_block_number_t first_block, - physical_block_number_t last_block, - unsigned int slab_size_shift); - -/** - * Get the size of the encoded state of a slab depot. - * - * @return The encoded size of the depot's state - **/ -size_t __must_check get_vdo_slab_depot_encoded_size(void); - -/** - * Encode the state of a slab depot into a buffer. - * - * @param state The state to encode - * @param buffer The buffer to encode into - * - * @return UDS_SUCCESS or an error - **/ -int __must_check -encode_vdo_slab_depot_state_2_0(struct slab_depot_state_2_0 state, - struct buffer *buffer); - -/** - * Decode slab depot component state version 2.0 from a buffer. - * - * @param buffer A buffer positioned at the start of the encoding - * @param state The state structure to receive the decoded values - * - * @return UDS_SUCCESS or an error code - **/ -int __must_check -decode_vdo_slab_depot_state_2_0(struct buffer *buffer, - struct slab_depot_state_2_0 *state); - -/** - * Configure the slab_depot for the specified storage capacity, finding the - * number of data blocks that will fit and still leave room for the depot - * metadata, then return the saved state for that configuration. 
- * - * @param [in] block_count The number of blocks in the underlying storage - * @param [in] first_block The number of the first block that may be allocated - * @param [in] slab_config The configuration of a single slab - * @param [in] zone_count The number of zones the depot will use - * @param [out] state The state structure to be configured - * - * @return VDO_SUCCESS or an error code - **/ -int __must_check configure_vdo_slab_depot(block_count_t block_count, - physical_block_number_t first_block, - struct slab_config slab_config, - zone_count_t zone_count, - struct slab_depot_state_2_0 *state); - -/** - * Measure and initialize the configuration to use for each slab. - * - * @param [in] slab_size The number of blocks per slab - * @param [in] slab_journal_blocks The number of blocks for the slab journal - * @param [out] slab_config The slab configuration to initialize - * - * @return VDO_SUCCESS or an error code - **/ -int __must_check configure_vdo_slab(block_count_t slab_size, - block_count_t slab_journal_blocks, - struct slab_config *slab_config); - -/** - * Get the number of blocks required to save a reference counts state covering - * the specified number of data blocks. - * - * @param block_count The number of physical data blocks that can be referenced - * - * @return The number of blocks required to save reference counts with the - * given block count - **/ -block_count_t __must_check -vdo_get_saved_reference_count_size(block_count_t block_count); - -#endif // SLAB_DEPOT_FORMAT_H diff --git a/vdo/slabDepotInternals.h b/vdo/slabDepotInternals.h deleted file mode 100644 index 055adf04..00000000 --- a/vdo/slabDepotInternals.h +++ /dev/null @@ -1,88 +0,0 @@ -/* - * Copyright Red Hat - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. 
- * - * $Id: //eng/vdo-releases/sulfur/src/c++/vdo/base/slabDepotInternals.h#5 $ - */ - -#ifndef SLAB_DEPOT_INTERNALS_H -#define SLAB_DEPOT_INTERNALS_H - -#include "slabDepot.h" - -#include - -#include "types.h" - -struct slab_depot { - zone_count_t zone_count; - zone_count_t old_zone_count; - struct vdo *vdo; - struct slab_config slab_config; - struct slab_summary *slab_summary; - struct action_manager *action_manager; - - physical_block_number_t first_block; - physical_block_number_t last_block; - physical_block_number_t origin; - - /** slab_size == (1 << slab_size_shift) */ - unsigned int slab_size_shift; - - /** Determines how slabs should be queued during load */ - enum slab_depot_load_type load_type; - - /** The state for notifying slab journals to release recovery journal */ - sequence_number_t active_release_request; - sequence_number_t new_release_request; - - /** State variables for scrubbing complete handling */ - atomic_t zones_to_scrub; - - /** Array of pointers to individually allocated slabs */ - struct vdo_slab **slabs; - /** The number of slabs currently allocated and stored in 'slabs' */ - slab_count_t slab_count; - - /** Array of pointers to a larger set of slabs (used during resize) */ - struct vdo_slab **new_slabs; - /** The number of slabs currently allocated and stored in 'new_slabs' */ - slab_count_t new_slab_count; - /** The size that 'new_slabs' was allocated for */ - block_count_t new_size; - - /** The last block before resize, for rollback */ - physical_block_number_t old_last_block; - /** The last block after resize, for resize */ - physical_block_number_t new_last_block; - - /** The block allocators for this depot */ - struct block_allocator *allocators[]; -}; - -/** - * Notify a slab depot that one of its allocators has finished scrubbing slabs. - * This method should only be called if the scrubbing was successful. This - * callback is registered by each block allocator in - * scrub_all_unrecovered_vdo_slabs_in_zone(). - * - * @param completion A completion whose parent must be a slab depot - **/ -void vdo_notify_zone_finished_scrubbing(struct vdo_completion *completion); - - -#endif /* SLAB_DEPOT_INTERNALS_H */ diff --git a/vdo/slabIterator.h b/vdo/slabIterator.h deleted file mode 100644 index 10e5c8e4..00000000 --- a/vdo/slabIterator.h +++ /dev/null @@ -1,98 +0,0 @@ -/* - * Copyright Red Hat - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/sulfur/src/c++/vdo/base/slabIterator.h#6 $ - */ - -#ifndef SLAB_ITERATOR_H -#define SLAB_ITERATOR_H - -#include "slab.h" -#include "types.h" - -/** - * A slab_iterator is a structure for iterating over a set of slabs. 
- **/ -struct slab_iterator { - struct vdo_slab **slabs; - struct vdo_slab *next; - slab_count_t end; - slab_count_t stride; -}; - -/** - * Return a slab_iterator initialized to iterate over an array of slabs - * with a given stride. Iteration always occurs from higher to lower numbered - * slabs. - * - * @param slabs The array of slabs - * @param start The number of the slab to start iterating from - * @param end The number of the last slab which may be returned - * @param stride The difference in slab number between successive slabs - * - * @return an initialized iterator structure - **/ -static inline struct slab_iterator vdo_iterate_slabs(struct vdo_slab **slabs, - slab_count_t start, - slab_count_t end, - slab_count_t stride) -{ - return (struct slab_iterator) { - .slabs = slabs, - .next = (((slabs == NULL) || (start < end)) ? NULL - : slabs[start]), - .end = end, - .stride = stride, - }; -} - -/** - * Check whether another vdo_slab would be returned by the iterator. - * - * @param iterator The iterator to poll - * - * @return true if the next call to vdo_next_slab - * will return a vdo_slab - **/ -static inline bool vdo_has_next_slab(const struct slab_iterator *iterator) -{ - return (iterator->next != NULL); -} - -/** - * Get the next vdo_slab, advancing the iterator. - * - * @param iterator The iterator over the vdo_slab chain - * - * @return the next vdo_slab or NULL if the array of slabs is empty - * or if all the appropriate Slabs have been returned - **/ -static inline struct vdo_slab *vdo_next_slab(struct slab_iterator *iterator) -{ - struct vdo_slab *slab = iterator->next; - if ((slab == NULL) - || (slab->slab_number < iterator->end + iterator->stride)) { - iterator->next = NULL; - } else { - iterator->next = - iterator->slabs[slab->slab_number - iterator->stride]; - } - return slab; -} - -#endif // SLAB_ITERATOR_H diff --git a/vdo/slabJournal.h b/vdo/slabJournal.h deleted file mode 100644 index 8184790f..00000000 --- a/vdo/slabJournal.h +++ /dev/null @@ -1,201 +0,0 @@ -/* - * Copyright Red Hat - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/sulfur/src/c++/vdo/base/slabJournal.h#10 $ - */ - -#ifndef SLAB_JOURNAL_H -#define SLAB_JOURNAL_H - -#include - -#include "completion.h" -#include "journalPoint.h" -#include "types.h" - -/** - * Obtain a pointer to a slab_journal structure from a pointer to the - * dirty list entry field within it. - * - * @param entry The list entry to convert - * - * @return The entry as a slab_journal - **/ -struct slab_journal * __must_check -vdo_slab_journal_from_dirty_entry(struct list_head *entry); - -/** - * Create a slab journal. 
- * - * @param [in] allocator The block allocator which owns this journal - * @param [in] slab The parent slab of the journal - * @param [in] recovery_journal The recovery journal of the VDO - * @param [out] journal_ptr The pointer to hold the new slab journal - * - * @return VDO_SUCCESS or error code - **/ -int __must_check make_vdo_slab_journal(struct block_allocator *allocator, - struct vdo_slab *slab, - struct recovery_journal *recovery_journal, - struct slab_journal **journal_ptr); - -/** - * Free a slab journal. - * - * @param journal The slab journal to free - **/ -void free_vdo_slab_journal(struct slab_journal *journal); - -/** - * Check whether a slab journal is blank, meaning it has never had any entries - * recorded in it. - * - * @param journal The journal to query - * - * @return true if the slab journal has never been modified - **/ -bool __must_check is_vdo_slab_journal_blank(const struct slab_journal *journal); - -/** - * Check whether the slab journal is on the block allocator's list of dirty - * journals. - * - * @param journal The journal to query - * - * @return true if the journal has been added to the dirty list - **/ -bool __must_check is_vdo_slab_journal_dirty(const struct slab_journal *journal); - -/** - * Check whether a slab journal is active. - * - * @param journal The slab journal to check - * - * @return true if the journal is active - **/ -bool __must_check is_vdo_slab_journal_active(struct slab_journal *journal); - -/** - * Abort any VIOs waiting to make slab journal entries. - * - * @param journal The journal to abort - **/ -void abort_vdo_slab_journal_waiters(struct slab_journal *journal); - -/** - * Reopen a slab journal by emptying it and then adding any pending entries. - * - * @param journal The journal to reopen - **/ -void reopen_vdo_slab_journal(struct slab_journal *journal); - -/** - * Attempt to replay a recovery journal entry into a slab journal. - * - * @param journal The slab journal to use - * @param pbn The PBN for the entry - * @param operation The type of entry to add - * @param recovery_point The recovery journal point corresponding to this entry - * @param parent The completion to notify when there is space to add - * the entry if the entry could not be added immediately - * - * @return true if the entry was added immediately - **/ -bool __must_check -attempt_replay_into_vdo_slab_journal(struct slab_journal *journal, - physical_block_number_t pbn, - enum journal_operation operation, - struct journal_point *recovery_point, - struct vdo_completion *parent); - -/** - * Add an entry to a slab journal. - * - * @param journal The slab journal to use - * @param data_vio The data_vio for which to add the entry - **/ -void add_vdo_slab_journal_entry(struct slab_journal *journal, - struct data_vio *data_vio); - -/** - * Adjust the reference count for a slab journal block. Note that when the - * adjustment is negative, the slab journal will be reaped. - * - * @param journal The slab journal - * @param sequence_number The journal sequence number of the referenced block - * @param adjustment Amount to adjust the reference counter - **/ -void adjust_vdo_slab_journal_block_reference(struct slab_journal *journal, - sequence_number_t sequence_number, - int adjustment); - -/** - * Request the slab journal to release the recovery journal lock it may hold on - * a specified recovery journal block. 
- * - * @param journal The slab journal - * @param recovery_lock The sequence number of the recovery journal block - * whose locks should be released - * - * @return true if the journal does hold a lock on the specified - * block (which it will release) - **/ -bool __must_check -vdo_release_recovery_journal_lock(struct slab_journal *journal, - sequence_number_t recovery_lock); - -/** - * Commit the tail block of a slab journal. - * - * @param journal The journal whose tail block should be committed - **/ -void commit_vdo_slab_journal_tail(struct slab_journal *journal); - -/** - * Drain slab journal I/O. Depending upon the type of drain (as recorded in - * the journal's slab), any dirty journal blocks may be written out. - * - * @param journal The journal to drain - **/ -void drain_vdo_slab_journal(struct slab_journal *journal); - -/** - * Decode the slab journal by reading its tail. - * - * @param journal The journal to decode - **/ -void decode_vdo_slab_journal(struct slab_journal *journal); - -/** - * Check to see if the journal should be scrubbed. - * - * @param journal The slab journal - * - * @return true if the journal requires scrubbing - **/ -bool __must_check -vdo_slab_journal_requires_scrubbing(const struct slab_journal *journal); - -/** - * Dump the slab journal. - * - * @param journal The slab journal to dump - **/ -void dump_vdo_slab_journal(const struct slab_journal *journal); - -#endif // SLAB_JOURNAL_H diff --git a/vdo/slabJournalEraser.c b/vdo/slabJournalEraser.c deleted file mode 100644 index 4589e2bd..00000000 --- a/vdo/slabJournalEraser.c +++ /dev/null @@ -1,131 +0,0 @@ -/* - * Copyright Red Hat - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/sulfur/src/c++/vdo/base/slabJournalEraser.c#17 $ - */ - -#include "slabJournalEraser.h" - -#include "memoryAlloc.h" - -#include "completion.h" -#include "constants.h" -#include "extent.h" -#include "slab.h" -#include "slabDepot.h" -#include "vdo.h" - -struct slab_journal_eraser { - struct vdo_completion *parent; - struct vdo_extent *extent; - char *zero_buffer; - struct slab_iterator slabs; -}; - -/** - * Free the eraser and finish the parent. - * - * @param eraser The eraser that is done - * @param result The result to return to the parent - **/ -static void finish_erasing(struct slab_journal_eraser *eraser, int result) -{ - struct vdo_completion *parent = eraser->parent; - free_vdo_extent(UDS_FORGET(eraser->extent)); - UDS_FREE(eraser->zero_buffer); - UDS_FREE(eraser); - finish_vdo_completion(parent, result); -} - -/** - * Finish erasing slab journals with an error. 
- * - * @param completion A completion whose parent is the eraser - **/ -static void handle_erasing_error(struct vdo_completion *completion) -{ - struct slab_journal_eraser *eraser = completion->parent; - finish_erasing(eraser, eraser->extent->completion.result); -} - -/** - * Erase the next slab journal. - * - * @param extent_completion A completion whose parent is the eraser - **/ -static void erase_next_slab_journal(struct vdo_completion *extent_completion) -{ - struct vdo_slab *slab; - struct slab_journal_eraser *eraser = extent_completion->parent; - - if (!vdo_has_next_slab(&eraser->slabs)) { - finish_erasing(eraser, VDO_SUCCESS); - return; - } - - slab = vdo_next_slab(&eraser->slabs); - write_vdo_metadata_extent(eraser->extent, slab->journal_origin); -} - -/**********************************************************************/ -void erase_vdo_slab_journals(struct slab_depot *depot, - struct slab_iterator slabs, - struct vdo_completion *parent) -{ - struct slab_journal_eraser *eraser; - block_count_t journal_size; - struct vdo_completion *extent_completion; - - int result = UDS_ALLOCATE(1, struct slab_journal_eraser, __func__, &eraser); - if (result != VDO_SUCCESS) { - finish_vdo_completion(parent, result); - return; - } - - eraser->parent = parent; - eraser->slabs = slabs; - - journal_size = get_vdo_slab_config(depot)->slab_journal_blocks; - result = UDS_ALLOCATE(journal_size * VDO_BLOCK_SIZE, - char, - __func__, - &eraser->zero_buffer); - if (result != VDO_SUCCESS) { - finish_erasing(eraser, result); - return; - } - - result = create_vdo_extent(parent->vdo, - VIO_TYPE_SLAB_JOURNAL, - VIO_PRIORITY_METADATA, - journal_size, - eraser->zero_buffer, - &eraser->extent); - if (result != VDO_SUCCESS) { - finish_erasing(eraser, result); - return; - } - - extent_completion = &eraser->extent->completion; - prepare_vdo_completion(extent_completion, - erase_next_slab_journal, - handle_erasing_error, - vdo_get_callback_thread_id(), - eraser); - erase_next_slab_journal(extent_completion); -} diff --git a/vdo/slabJournalEraser.h b/vdo/slabJournalEraser.h deleted file mode 100644 index 1359c474..00000000 --- a/vdo/slabJournalEraser.h +++ /dev/null @@ -1,39 +0,0 @@ -/* - * Copyright Red Hat - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/sulfur/src/c++/vdo/base/slabJournalEraser.h#9 $ - */ - -#ifndef SLAB_JOURNAL_ERASER_H -#define SLAB_JOURNAL_ERASER_H - -#include "slabIterator.h" -#include "types.h" - -/** - * Begin erasing slab journals, one at a time. 
- * - * @param depot The depot from which to erase - * @param slabs The slabs whose journals need erasing - * @param parent The object to notify when complete - **/ -void erase_vdo_slab_journals(struct slab_depot *depot, - struct slab_iterator slabs, - struct vdo_completion *parent); - -#endif // SLAB_JOURNAL_ERASER_H diff --git a/vdo/slabJournalFormat.c b/vdo/slabJournalFormat.c deleted file mode 100644 index 1aca4093..00000000 --- a/vdo/slabJournalFormat.c +++ /dev/null @@ -1,41 +0,0 @@ -/* - * Copyright Red Hat - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/sulfur/src/c++/vdo/base/slabJournalFormat.c#9 $ - */ - -#include "slabJournalFormat.h" - -#include "journalPoint.h" -#include "types.h" - -/**********************************************************************/ -struct slab_journal_entry -decode_vdo_slab_journal_entry(struct packed_slab_journal_block *block, - journal_entry_count_t entry_count) -{ - struct slab_journal_entry entry = - unpack_vdo_slab_journal_entry(&block->payload.entries[entry_count]); - if (block->header.has_block_map_increments && - ((block->payload.full_entries.entry_types[entry_count / 8] & - ((byte)1 << (entry_count % 8))) != 0)) { - entry.operation = BLOCK_MAP_INCREMENT; - } - return entry; -} - diff --git a/vdo/slabJournalInternals.h b/vdo/slabJournalInternals.h deleted file mode 100644 index 772f4339..00000000 --- a/vdo/slabJournalInternals.h +++ /dev/null @@ -1,207 +0,0 @@ -/* - * Copyright Red Hat - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. 
- * - * $Id: //eng/vdo-releases/sulfur/src/c++/vdo/base/slabJournalInternals.h#5 $ - */ - -#ifndef SLAB_JOURNAL_INTERNALS_H -#define SLAB_JOURNAL_INTERNALS_H - -#include "slabJournal.h" - -#include "numeric.h" - -#include "blockAllocatorInternals.h" -#include "blockMapEntry.h" -#include "journalPoint.h" -#include "slab.h" -#include "slabJournalFormat.h" -#include "slabSummary.h" -#include "statistics.h" -#include "waitQueue.h" - -struct journal_lock { - uint16_t count; - sequence_number_t recovery_start; -}; - -struct slab_journal { - /** A waiter object for getting a VIO pool entry */ - struct waiter resource_waiter; - /** A waiter object for updating the slab summary */ - struct waiter slab_summary_waiter; - /** A waiter object for getting an extent with which to flush */ - struct waiter flush_waiter; - /** The queue of VIOs waiting to make an entry */ - struct wait_queue entry_waiters; - /** The parent slab reference of this journal */ - struct vdo_slab *slab; - - /** Whether a tail block commit is pending */ - bool waiting_to_commit; - /** Whether the journal is updating the slab summary */ - bool updating_slab_summary; - /** Whether the journal is adding entries from the entry_waiters queue */ - bool adding_entries; - /** Whether a partial write is in progress */ - bool partial_write_in_progress; - - /** The oldest block in the journal on disk */ - sequence_number_t head; - /** The oldest block in the journal which may not be reaped */ - sequence_number_t unreapable; - /** The end of the half-open interval of the active journal */ - sequence_number_t tail; - /** The next journal block to be committed */ - sequence_number_t next_commit; - /** The tail sequence number that is written in the slab summary */ - sequence_number_t summarized; - /** The tail sequence number that was last summarized in slab summary */ - sequence_number_t last_summarized; - - /** The sequence number of the recovery journal lock */ - sequence_number_t recovery_lock; - - /** - * The number of entries which fit in a single block. Can't use the - * constant because unit tests change this number. - **/ - journal_entry_count_t entries_per_block; - /** - * The number of full entries which fit in a single block. Can't use - * the constant because unit tests change this number. - **/ - journal_entry_count_t full_entries_per_block; - - /** The recovery journal of the VDO (slab journal holds locks on it) */ - struct recovery_journal *recovery_journal; - - /** The slab summary to update tail block location */ - struct slab_summary_zone *summary; - /** The statistics shared by all slab journals in our physical zone */ - struct slab_journal_statistics *events; - /** - * A list of the VIO pool entries for outstanding journal block writes - */ - struct list_head uncommitted_blocks; - - /** - * The current tail block header state. This will be packed into - * the block just before it is written. 
- **/ - struct slab_journal_block_header tail_header; - /** A pointer to a block-sized buffer holding the packed block data */ - struct packed_slab_journal_block *block; - - /** The number of blocks in the on-disk journal */ - block_count_t size; - /** The number of blocks at which to start pushing reference blocks */ - block_count_t flushing_threshold; - /** The number of blocks at which all reference blocks should be writing - */ - block_count_t flushing_deadline; - /** - * The number of blocks at which to wait for reference blocks to write - */ - block_count_t blocking_threshold; - /** - * The number of blocks at which to scrub the slab before coming online - */ - block_count_t scrubbing_threshold; - - /** - * This list entry is for block_allocator to keep a queue of dirty - * journals - */ - struct list_head dirty_entry; - - /** The lock for the oldest unreaped block of the journal */ - struct journal_lock *reap_lock; - /** The locks for each on disk block */ - struct journal_lock locks[]; -}; - -/** - * Get the slab journal block offset of the given sequence number. - * - * @param journal The slab journal - * @param sequence The sequence number - * - * @return the offset corresponding to the sequence number - **/ -static inline tail_block_offset_t __must_check -get_vdo_slab_journal_block_offset(struct slab_journal *journal, - sequence_number_t sequence) -{ - return (sequence % journal->size); -} - -/** - * Encode a slab journal entry (exposed for unit tests). - * - * @param tail_header The unpacked header for the block - * @param payload The journal block payload to hold the entry - * @param sbn The slab block number of the entry to encode - * @param operation The type of the entry - **/ -void encode_vdo_slab_journal_entry(struct slab_journal_block_header *tail_header, - slab_journal_payload *payload, - slab_block_number sbn, - enum journal_operation operation); - -/** - * Generate the packed encoding of a slab journal entry. - * - * @param packed The entry into which to pack the values - * @param sbn The slab block number of the entry to encode - * @param is_increment The increment flag - **/ -static inline void pack_vdo_slab_journal_entry(packed_slab_journal_entry *packed, - slab_block_number sbn, - bool is_increment) -{ - packed->offset_low8 = (sbn & 0x0000FF); - packed->offset_mid8 = (sbn & 0x00FF00) >> 8; - packed->offset_high7 = (sbn & 0x7F0000) >> 16; - packed->increment = is_increment ? 1 : 0; -} - -/** - * Decode the packed representation of a slab block header. 
- * - * @param packed The packed header to decode - * @param header The header into which to unpack the values - **/ -static inline void -unpack_vdo_slab_journal_block_header( - const struct packed_slab_journal_block_header *packed, - struct slab_journal_block_header *header) -{ - *header = (struct slab_journal_block_header) { - .head = __le64_to_cpu(packed->head), - .sequence_number = __le64_to_cpu(packed->sequence_number), - .nonce = __le64_to_cpu(packed->nonce), - .entry_count = __le16_to_cpu(packed->entry_count), - .metadata_type = packed->metadata_type, - .has_block_map_increments = packed->has_block_map_increments, - }; - unpack_vdo_journal_point(&packed->recovery_point, - &header->recovery_point); -} - -#endif // SLAB_JOURNAL_INTERNALS_H diff --git a/vdo/slabScrubber.h b/vdo/slabScrubber.h deleted file mode 100644 index a1888fb9..00000000 --- a/vdo/slabScrubber.h +++ /dev/null @@ -1,153 +0,0 @@ -/* - * Copyright Red Hat - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/sulfur/src/c++/vdo/base/slabScrubber.h#9 $ - */ - -#ifndef SLAB_SCRUBBER_H -#define SLAB_SCRUBBER_H - -#include "completion.h" -#include "types.h" -#include "waitQueue.h" - -/** - * Create a slab scrubber - * - * @param vdo The VDO - * @param slab_journal_size The size of a slab journal in blocks - * @param read_only_notifier The context for entering read-only mode - * @param scrubber_ptr A pointer to hold the scrubber - * - * @return VDO_SUCCESS or an error - **/ -int __must_check -make_vdo_slab_scrubber(struct vdo *vdo, - block_count_t slab_journal_size, - struct read_only_notifier *read_only_notifier, - struct slab_scrubber **scrubber_ptr); - -/** - * Free a slab scrubber. - * - * @param scrubber The scrubber to destroy - **/ -void free_vdo_slab_scrubber(struct slab_scrubber *scrubber); - -/** - * Check whether a scrubber has slabs to scrub. - * - * @param scrubber The scrubber to check - * - * @return true if the scrubber has slabs to scrub - **/ -bool __must_check vdo_has_slabs_to_scrub(struct slab_scrubber *scrubber); - -/** - * Register a slab with a scrubber. - * - * @param scrubber The scrubber - * @param slab The slab to scrub - * @param high_priority true if the slab should be put on the - * high-priority queue - **/ -void vdo_register_slab_for_scrubbing(struct slab_scrubber *scrubber, - struct vdo_slab *slab, - bool high_priority); - -/** - * Scrub all the slabs which have been registered with a slab scrubber. 
- * - * @param scrubber The scrubber - * @param parent The object to notify when scrubbing is complete - * @param callback The function to run when scrubbing is complete - * @param error_handler The handler for scrubbing errors - **/ -void scrub_vdo_slabs(struct slab_scrubber *scrubber, - void *parent, - vdo_action *callback, - vdo_action *error_handler); - -/** - * Scrub any slabs which have been registered at high priority with a slab - * scrubber. - * - * @param scrubber The scrubber - * @param scrub_at_least_one true if one slab should always be - * scrubbed, even if there are no high-priority slabs - * (and there is at least one low priority slab) - * @param parent The completion to notify when scrubbing is - * complete - * @param callback The function to run when scrubbing is complete - * @param error_handler The handler for scrubbing errors - **/ -void scrub_high_priority_vdo_slabs(struct slab_scrubber *scrubber, - bool scrub_at_least_one, - struct vdo_completion *parent, - vdo_action *callback, - vdo_action *error_handler); - -/** - * Tell the scrubber to stop scrubbing after it finishes the slab it is - * currently working on. - * - * @param scrubber The scrubber to stop - * @param parent The completion to notify when scrubbing has stopped - **/ -void stop_vdo_slab_scrubbing(struct slab_scrubber *scrubber, - struct vdo_completion *parent); - -/** - * Tell the scrubber to resume scrubbing if it has been stopped. - * - * @param scrubber The scrubber to resume - * @param parent The object to notify once scrubbing has resumed - **/ -void resume_vdo_slab_scrubbing(struct slab_scrubber *scrubber, - struct vdo_completion *parent); - -/** - * Wait for a clean slab. - * - * @param scrubber The scrubber on which to wait - * @param waiter The waiter - * - * @return VDO_SUCCESS if the waiter was queued, VDO_NO_SPACE if there are no - * slabs to scrub, and some other error otherwise - **/ -int enqueue_clean_vdo_slab_waiter(struct slab_scrubber *scrubber, - struct waiter *waiter); - -/** - * Get the number of slabs that are unrecovered or being scrubbed. - * - * @param scrubber The scrubber to query - * - * @return the number of slabs that are unrecovered or being scrubbed - **/ -slab_count_t __must_check -get_scrubber_vdo_slab_count(const struct slab_scrubber *scrubber); - -/** - * Dump information about a slab scrubber to the log for debugging. - * - * @param scrubber The scrubber to dump - **/ -void dump_vdo_slab_scrubber(const struct slab_scrubber *scrubber); - -#endif /* SLAB_SCRUBBER_H */ diff --git a/vdo/slabScrubberInternals.h b/vdo/slabScrubberInternals.h deleted file mode 100644 index b0665342..00000000 --- a/vdo/slabScrubberInternals.h +++ /dev/null @@ -1,62 +0,0 @@ -/* - * Copyright Red Hat - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. 
- * - * $Id: //eng/vdo-releases/sulfur/src/c++/vdo/base/slabScrubberInternals.h#2 $ - */ - -#ifndef SLAB_SCRUBBER_INTERNALS_H -#define SLAB_SCRUBBER_INTERNALS_H - -#include - -#include "slabScrubber.h" - -#include "adminState.h" -#include "extent.h" - -struct slab_scrubber { - struct vdo_completion completion; - /** The queue of slabs to scrub first */ - struct list_head high_priority_slabs; - /** The queue of slabs to scrub once there are no high_priority_slabs */ - struct list_head slabs; - /** The queue of VIOs waiting for a slab to be scrubbed */ - struct wait_queue waiters; - - /* - * The number of slabs that are unrecovered or being scrubbed. This - * field is modified by the physical zone thread, but is queried by - * other threads. - */ - slab_count_t slab_count; - - /** The administrative state of the scrubber */ - struct admin_state admin_state; - /** Whether to only scrub high-priority slabs */ - bool high_priority_only; - /** The context for entering read-only mode */ - struct read_only_notifier *read_only_notifier; - /** The slab currently being scrubbed */ - struct vdo_slab *slab; - /** The extent for loading slab journal blocks */ - struct vdo_extent *extent; - /** A buffer to store the slab journal blocks */ - char *journal_data; -}; - -#endif // SLAB_SCRUBBER_INTERNALS_H diff --git a/vdo/slabSummary.c b/vdo/slabSummary.c deleted file mode 100644 index a5da4105..00000000 --- a/vdo/slabSummary.c +++ /dev/null @@ -1,702 +0,0 @@ -/* - * Copyright Red Hat - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/sulfur/src/c++/vdo/base/slabSummary.c#28 $ - */ - -#include "slabSummary.h" - -#include "memoryAlloc.h" -#include "permassert.h" - -#include "adminState.h" -#include "constants.h" -#include "extent.h" -#include "readOnlyNotifier.h" -#include "slabSummaryFormat.h" -#include "slabSummaryInternals.h" -#include "threadConfig.h" -#include "types.h" - -// FULLNESS HINT COMPUTATION - -/** - * Translate a slab's free block count into a 'fullness hint' that can be - * stored in a slab_summary_entry's 7 bits that are dedicated to its free - * count. - * - * Note: the number of free blocks must be strictly less than 2^23 blocks, - * even though theoretically slabs could contain precisely 2^23 blocks; there - * is an assumption that at least one block is used by metadata. This - * assumption is necessary; otherwise, the fullness hint might overflow. - * The fullness hint formula is roughly (fullness >> 16) & 0x7f, but - * ((1 << 23) >> 16) & 0x7f is the same as (0 >> 16) & 0x7f, namely 0, which - * is clearly a bad hint if it could indicate both 2^23 free blocks or 0 free - * blocks. - * - * @param summary The summary which is being updated - * @param free_blocks The number of free blocks - * - * @return A fullness hint, which can be stored in 7 bits. 
- **/ -static uint8_t __must_check -compute_fullness_hint(struct slab_summary *summary, block_count_t free_blocks) -{ - block_count_t hint; - ASSERT_LOG_ONLY((free_blocks < (1 << 23)), - "free blocks must be less than 2^23"); - - if (free_blocks == 0) { - return 0; - } - - hint = free_blocks >> summary->hint_shift; - return ((hint == 0) ? 1 : hint); -} - -/** - * Translate a slab's free block hint into an approximate count, such that - * compute_fullness_hint() is the inverse function of - * get_approximate_free_blocks() - * (i.e. compute_fullness_hint(get_approximate_free_blocks(x)) == x). - * - * @param summary The summary from which the hint was obtained - * @param free_block_hint The hint read from the summary - * - * @return An approximation to the free block count - **/ -static block_count_t __must_check -get_approximate_free_blocks(struct slab_summary *summary, - uint8_t free_block_hint) -{ - return ((block_count_t) free_block_hint) << summary->hint_shift; -} - -// MAKE/FREE FUNCTIONS - -/**********************************************************************/ -static void launch_write(struct slab_summary_block *summary_block); - -/** - * Initialize a slab_summary_block. - * - * @param vdo The vdo - * @param summary_zone The parent slab_summary_zone - * @param thread_id The ID of the thread of physical zone of this - * block - * @param entries The entries this block manages - * @param index The index of this block in its zone's summary - * @param slab_summary_block The block to intialize - * - * @return VDO_SUCCESS or an error - **/ -static int -initialize_slab_summary_block(struct vdo *vdo, - struct slab_summary_zone *summary_zone, - thread_id_t thread_id, - struct slab_summary_entry *entries, - block_count_t index, - struct slab_summary_block *slab_summary_block) -{ - int result = UDS_ALLOCATE(VDO_BLOCK_SIZE, char, __func__, - &slab_summary_block->outgoing_entries); - if (result != VDO_SUCCESS) { - return result; - } - - result = create_metadata_vio(vdo, - VIO_TYPE_SLAB_SUMMARY, - VIO_PRIORITY_METADATA, - slab_summary_block, - slab_summary_block->outgoing_entries, - &slab_summary_block->vio); - if (result != VDO_SUCCESS) { - return result; - } - - slab_summary_block->vio->completion.callback_thread_id = thread_id; - slab_summary_block->zone = summary_zone; - slab_summary_block->entries = entries; - slab_summary_block->index = index; - return VDO_SUCCESS; -} - -/** - * Create a new, empty slab_summary_zone object. - * - * @param summary The summary to which the new zone will belong - * @param vdo The vdo - * @param zone_number The zone this is - * @param thread_id The ID of the thread for this zone - * @param entries The buffer to hold the entries in this zone - * - * @return VDO_SUCCESS or an error - **/ -static int make_slab_summary_zone(struct slab_summary *summary, - struct vdo *vdo, - zone_count_t zone_number, - thread_id_t thread_id, - struct slab_summary_entry *entries) -{ - struct slab_summary_zone *summary_zone; - block_count_t i; - int result = UDS_ALLOCATE_EXTENDED(struct slab_summary_zone, - summary->blocks_per_zone, - struct slab_summary_block, __func__, - &summary->zones[zone_number]); - if (result != VDO_SUCCESS) { - return result; - } - - summary_zone = summary->zones[zone_number]; - summary_zone->summary = summary; - summary_zone->zone_number = zone_number; - summary_zone->entries = entries; - set_vdo_admin_state_code(&summary_zone->state, - VDO_ADMIN_STATE_NORMAL_OPERATION); - - // Initialize each block. 
- for (i = 0; i < summary->blocks_per_zone; i++) { - result = initialize_slab_summary_block(vdo, summary_zone, - thread_id, entries, i, - &summary_zone->summary_blocks[i]); - if (result != VDO_SUCCESS) { - return result; - } - entries += summary->entries_per_block; - } - - return VDO_SUCCESS; -} - -/**********************************************************************/ -int make_vdo_slab_summary(struct vdo *vdo, - struct partition *partition, - const struct thread_config *thread_config, - unsigned int slab_size_shift, - block_count_t maximum_free_blocks_per_slab, - struct read_only_notifier *read_only_notifier, - struct slab_summary **slab_summary_ptr) -{ - struct slab_summary *summary; - size_t total_entries, i; - uint8_t hint; - zone_count_t zone; - block_count_t blocks_per_zone = - get_vdo_slab_summary_zone_size(VDO_BLOCK_SIZE); - slab_count_t entries_per_block = MAX_VDO_SLABS / blocks_per_zone; - int result = ASSERT((entries_per_block * blocks_per_zone) == MAX_VDO_SLABS, - "block size must be a multiple of entry size"); - if (result != VDO_SUCCESS) { - return result; - } - - if (partition == NULL) { - // Don't make a slab summary for the formatter since it doesn't - // need it. - return VDO_SUCCESS; - } - - result = UDS_ALLOCATE_EXTENDED(struct slab_summary, - thread_config->physical_zone_count, - struct slab_summary_zone *, - __func__, - &summary); - if (result != VDO_SUCCESS) { - return result; - } - - summary->zone_count = thread_config->physical_zone_count; - summary->read_only_notifier = read_only_notifier; - summary->hint_shift = get_vdo_slab_summary_hint_shift(slab_size_shift); - summary->blocks_per_zone = blocks_per_zone; - summary->entries_per_block = entries_per_block; - - total_entries = MAX_VDO_SLABS * MAX_VDO_PHYSICAL_ZONES; - result = UDS_ALLOCATE(total_entries, struct slab_summary_entry, - "summary entries", &summary->entries); - if (result != VDO_SUCCESS) { - free_vdo_slab_summary(summary); - return result; - } - - // Initialize all the entries. - hint = compute_fullness_hint(summary, maximum_free_blocks_per_slab); - for (i = 0; i < total_entries; i++) { - // This default tail block offset must be reflected in - // slabJournal.c::read_slab_journal_tail(). - summary->entries[i] = (struct slab_summary_entry) { - .tail_block_offset = 0, - .fullness_hint = hint, - .load_ref_counts = false, - .is_dirty = false, - }; - } - - set_vdo_slab_summary_origin(summary, partition); - for (zone = 0; zone < summary->zone_count; zone++) { - result = - make_slab_summary_zone(summary, vdo, zone, - vdo_get_physical_zone_thread(thread_config, - zone), - summary->entries + - (MAX_VDO_SLABS * zone)); - if (result != VDO_SUCCESS) { - free_vdo_slab_summary(summary); - return result; - } - } - - *slab_summary_ptr = summary; - return VDO_SUCCESS; -} - -/** - * Free a slab summary zone. 
- * - * @param zone The zone to free - **/ -static void free_summary_zone(struct slab_summary_zone *zone) -{ - block_count_t i; - - if (zone == NULL) { - return; - } - - for (i = 0; i < zone->summary->blocks_per_zone; i++) { - free_vio(UDS_FORGET(zone->summary_blocks[i].vio)); - UDS_FREE(UDS_FORGET(zone->summary_blocks[i].outgoing_entries)); - } - - UDS_FREE(zone); -} - -/**********************************************************************/ -void free_vdo_slab_summary(struct slab_summary *summary) -{ - zone_count_t zone; - - if (summary == NULL) { - return; - } - - for (zone = 0; zone < summary->zone_count; zone++) { - free_summary_zone(UDS_FORGET(summary->zones[zone])); - } - - UDS_FREE(UDS_FORGET(summary->entries)); - UDS_FREE(summary); -} - -/**********************************************************************/ -struct slab_summary_zone * -vdo_get_slab_summary_for_zone(struct slab_summary *summary, zone_count_t zone) -{ - return summary->zones[zone]; -} - -// WRITING FUNCTIONALITY - -/** - * Check whether a summary zone has finished draining. - * - * @param summary_zone The zone to check - **/ -static void -vdo_check_for_drain_complete(struct slab_summary_zone *summary_zone) -{ - if (!is_vdo_state_draining(&summary_zone->state) - || (summary_zone->write_count > 0)) { - return; - } - - finish_vdo_operation(&summary_zone->state, - (vdo_is_read_only(summary_zone->summary->read_only_notifier) - ? VDO_READ_ONLY : VDO_SUCCESS)); -} - -/** - * Wake all the waiters in a given queue. If the VDO is in read-only mode they - * will be given a VDO_READ_ONLY error code as their context, otherwise they - * will be given VDO_SUCCESS. - * - * @param summary_zone The slab summary which owns the queue - * @param queue The queue to notify - **/ -static void notify_waiters(struct slab_summary_zone *summary_zone, - struct wait_queue *queue) -{ - int result = (vdo_is_read_only(summary_zone->summary->read_only_notifier) - ? VDO_READ_ONLY - : VDO_SUCCESS); - notify_all_waiters(queue, NULL, &result); -} - -/** - * Finish processing a block which attempted to write, whether or not the - * attempt succeeded. - * - * @param block The block - **/ -static void -finish_updating_slab_summary_block(struct slab_summary_block *block) -{ - notify_waiters(block->zone, &block->current_update_waiters); - block->writing = false; - block->zone->write_count--; - if (has_waiters(&block->next_update_waiters)) { - launch_write(block); - } else { - vdo_check_for_drain_complete(block->zone); - } -} - -/** - * This is the callback for a successful block write. - * - * @param completion The write VIO - **/ -static void finish_update(struct vdo_completion *completion) -{ - struct slab_summary_block *block = completion->parent; - atomic64_inc(&block->zone->summary->statistics.blocks_written); - finish_updating_slab_summary_block(block); -} - -/** - * Handle an error writing a slab summary block. - * - * @param completion The write VIO - **/ -static void handle_write_error(struct vdo_completion *completion) -{ - struct slab_summary_block *block = completion->parent; - vdo_enter_read_only_mode(block->zone->summary->read_only_notifier, - completion->result); - finish_updating_slab_summary_block(block); -} - -/** - * Write a slab summary block unless it is currently out for writing. 
- * - * @param [in] block The block that needs to be committed - **/ -static void launch_write(struct slab_summary_block *block) -{ - struct slab_summary_zone *zone = block->zone; - struct slab_summary *summary = zone->summary; - physical_block_number_t pbn; - - if (block->writing) { - return; - } - - zone->write_count++; - transfer_all_waiters(&block->next_update_waiters, - &block->current_update_waiters); - block->writing = true; - - if (vdo_is_read_only(summary->read_only_notifier)) { - finish_updating_slab_summary_block(block); - return; - } - - memcpy(block->outgoing_entries, block->entries, - sizeof(struct slab_summary_entry) * summary->entries_per_block); - - // Flush before writing to ensure that the slab journal tail blocks and - // reference updates covered by this summary update are stable - // (VDO-2332). - pbn = summary->origin + - (summary->blocks_per_zone * zone->zone_number) + block->index; - launch_write_metadata_vio_with_flush(block->vio, pbn, finish_update, - handle_write_error, true, false); -} - -/** - * Initiate a drain. - * - * Implements vdo_admin_initiator. - **/ -static void initiate_drain(struct admin_state *state) -{ - vdo_check_for_drain_complete(container_of(state, - struct slab_summary_zone, - state)); -} - -/**********************************************************************/ -void drain_vdo_slab_summary_zone(struct slab_summary_zone *summary_zone, - const struct admin_state_code *operation, - struct vdo_completion *parent) -{ - start_vdo_draining(&summary_zone->state, operation, parent, - initiate_drain); -} - -/**********************************************************************/ -void resume_vdo_slab_summary_zone(struct slab_summary_zone *summary_zone, - struct vdo_completion *parent) -{ - finish_vdo_completion(parent, - resume_vdo_if_quiescent(&summary_zone->state)); -} - -// READ/UPDATE FUNCTIONS - -/** - * Get the summary block, and offset into it, for storing the summary for a - * slab. 
- * - * @param summary_zone The slab_summary_zone being queried - * @param slab_number The slab whose summary location is sought - * - * @return A pointer to the slab_summary_block containing this - * slab_summary_entry - **/ -static struct slab_summary_block * -get_summary_block_for_slab(struct slab_summary_zone *summary_zone, - slab_count_t slab_number) -{ - slab_count_t entries_per_block = - summary_zone->summary->entries_per_block; - return &summary_zone->summary_blocks[slab_number / entries_per_block]; -} - -/**********************************************************************/ -void vdo_update_slab_summary_entry(struct slab_summary_zone *summary_zone, - struct waiter *waiter, slab_count_t slab_number, - tail_block_offset_t tail_block_offset, - bool load_ref_counts, bool is_clean, - block_count_t free_blocks) -{ - struct slab_summary_block *block = - get_summary_block_for_slab(summary_zone, slab_number); - int result; - if (vdo_is_read_only(summary_zone->summary->read_only_notifier)) { - result = VDO_READ_ONLY; - } else if (is_vdo_state_draining(&summary_zone->state) - || is_vdo_state_quiescent(&summary_zone->state)) { - result = VDO_INVALID_ADMIN_STATE; - } else { - uint8_t hint = compute_fullness_hint(summary_zone->summary, - free_blocks); - struct slab_summary_entry *entry = - &summary_zone->entries[slab_number]; - *entry = (struct slab_summary_entry) { - .tail_block_offset = tail_block_offset, - .load_ref_counts = - (entry->load_ref_counts || load_ref_counts), - .is_dirty = !is_clean, - .fullness_hint = hint, - }; - result = enqueue_waiter(&block->next_update_waiters, waiter); - } - - if (result != VDO_SUCCESS) { - waiter->callback(waiter, &result); - return; - } - - launch_write(block); -} - -/**********************************************************************/ -tail_block_offset_t -vdo_get_summarized_tail_block_offset(struct slab_summary_zone *summary_zone, - slab_count_t slab_number) -{ - return summary_zone->entries[slab_number].tail_block_offset; -} - -/**********************************************************************/ -bool vdo_must_load_ref_counts(struct slab_summary_zone *summary_zone, - slab_count_t slab_number) -{ - return summary_zone->entries[slab_number].load_ref_counts; -} - -/**********************************************************************/ -bool vdo_get_summarized_cleanliness(struct slab_summary_zone *summary_zone, - slab_count_t slab_number) -{ - return !summary_zone->entries[slab_number].is_dirty; -} - -/**********************************************************************/ -block_count_t -get_summarized_free_block_count(struct slab_summary_zone *summary_zone, - slab_count_t slab_number) -{ - struct slab_summary_entry *entry = &summary_zone->entries[slab_number]; - return get_approximate_free_blocks(summary_zone->summary, - entry->fullness_hint); -} - -/**********************************************************************/ -void vdo_get_summarized_ref_counts_state(struct slab_summary_zone *summary_zone, - slab_count_t slab_number, - size_t *free_block_hint, bool *is_clean) -{ - struct slab_summary_entry *entry = &summary_zone->entries[slab_number]; - *free_block_hint = entry->fullness_hint; - *is_clean = !entry->is_dirty; -} - -/**********************************************************************/ -void vdo_get_summarized_slab_statuses(struct slab_summary_zone *summary_zone, - slab_count_t slab_count, - struct slab_status *statuses) -{ - slab_count_t i; - for (i = 0; i < slab_count; i++) { - statuses[i] = (struct slab_status){ - .slab_number = 
i, - .is_clean = !summary_zone->entries[i].is_dirty, - .emptiness = summary_zone->entries[i].fullness_hint}; - } -} - -// RESIZE FUNCTIONS - -/**********************************************************************/ -void set_vdo_slab_summary_origin(struct slab_summary *summary, - struct partition *partition) -{ - summary->origin = get_vdo_fixed_layout_partition_offset(partition); -} - -// COMBINING FUNCTIONS (LOAD) - -/** - * Clean up after saving out the combined slab summary. This callback is - * registered in finish_loading_summary() and load_vdo_slab_summary(). - * - * @param completion The extent which was used to write the summary data - **/ -static void finish_combining_zones(struct vdo_completion *completion) -{ - struct slab_summary *summary = completion->parent; - int result = completion->result; - free_vdo_extent(vdo_completion_as_extent(UDS_FORGET(completion))); - finish_vdo_loading_with_result(&summary->zones[0]->state, result); -} - -/**********************************************************************/ -void vdo_slab_summary_combine_zones(struct slab_summary *summary) -{ - // Combine all the old summary data into the portion of the buffer - // corresponding to the first zone. - zone_count_t zone = 0; - if (summary->zones_to_combine > 1) { - slab_count_t entry_number; - for (entry_number = 0; entry_number < MAX_VDO_SLABS; - entry_number++) { - if (zone != 0) { - memcpy(summary->entries + entry_number, - summary->entries + - (zone * MAX_VDO_SLABS) + - entry_number, - sizeof(struct slab_summary_entry)); - } - zone++; - if (zone == summary->zones_to_combine) { - zone = 0; - } - } - } - - // Copy the combined data to each zones's region of the buffer. - for (zone = 1; zone < MAX_VDO_PHYSICAL_ZONES; zone++) { - memcpy(summary->entries + (zone * MAX_VDO_SLABS), - summary->entries, - MAX_VDO_SLABS * sizeof(struct slab_summary_entry)); - } -} - -/** - * Combine the slab summary data from all the previously written zones - * and copy the combined summary to each partition's data region. Then write - * the combined summary back out to disk. This callback is registered in - * load_vdo_slab_summary(). - * - * @param completion The extent which was used to read the summary data - **/ -static void finish_loading_summary(struct vdo_completion *completion) -{ - struct slab_summary *summary = completion->parent; - struct vdo_extent *extent = vdo_completion_as_extent(completion); - - // Combine the zones so each zone is correct for all slabs. - vdo_slab_summary_combine_zones(summary); - - // Write the combined summary back out. 
- extent->completion.callback = finish_combining_zones; - write_vdo_metadata_extent(extent, summary->origin); -} - -/**********************************************************************/ -void load_vdo_slab_summary(struct slab_summary *summary, - const struct admin_state_code *operation, - zone_count_t zones_to_combine, - struct vdo_completion *parent) -{ - struct vdo_extent *extent; - block_count_t blocks; - int result; - - struct slab_summary_zone *zone = summary->zones[0]; - if (!start_vdo_loading(&zone->state, operation, parent, NULL)) { - return; - } - - blocks = summary->blocks_per_zone * MAX_VDO_PHYSICAL_ZONES; - result = create_vdo_extent(parent->vdo, VIO_TYPE_SLAB_SUMMARY, - VIO_PRIORITY_METADATA, blocks, - (char *)summary->entries, &extent); - if (result != VDO_SUCCESS) { - finish_vdo_loading_with_result(&zone->state, result); - return; - } - - if ((operation == VDO_ADMIN_STATE_FORMATTING) || - (operation == VDO_ADMIN_STATE_LOADING_FOR_REBUILD)) { - prepare_vdo_completion(&extent->completion, - finish_combining_zones, - finish_combining_zones, 0, summary); - write_vdo_metadata_extent(extent, summary->origin); - return; - } - - summary->zones_to_combine = zones_to_combine; - prepare_vdo_completion(&extent->completion, finish_loading_summary, - finish_combining_zones, 0, summary); - read_vdo_metadata_extent(extent, summary->origin); -} - -/**********************************************************************/ -struct slab_summary_statistics -get_vdo_slab_summary_statistics(const struct slab_summary *summary) -{ - const struct atomic_slab_summary_statistics *atoms = - &summary->statistics; - return (struct slab_summary_statistics) { - .blocks_written = atomic64_read(&atoms->blocks_written), - }; -} diff --git a/vdo/slabSummary.h b/vdo/slabSummary.h deleted file mode 100644 index c4214d46..00000000 --- a/vdo/slabSummary.h +++ /dev/null @@ -1,256 +0,0 @@ -/* - * Copyright Red Hat - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/sulfur/src/c++/vdo/base/slabSummary.h#14 $ - */ - -#ifndef SLAB_SUMMARY_H -#define SLAB_SUMMARY_H - -#include "completion.h" -#include "fixedLayout.h" -#include "slab.h" -#include "slabSummaryFormat.h" -#include "statistics.h" -#include "types.h" -#include "waitQueue.h" - -/** - * The slab_summary provides hints during load and recovery about the state - * of the slabs in order to avoid the need to read the slab journals in their - * entirety before a VDO can come online. - * - * The information in the summary for each slab includes the rough number of - * free blocks (which is used to prioritize scrubbing), the cleanliness of a - * slab (so that clean slabs containing free space will be used on restart), - * and the location of the tail block of the slab's journal. 
- * - * The slab_summary has its own partition at the end of the volume which is - * sized to allow for a complete copy of the summary for each of up to 16 - * physical zones. - * - * During resize, the slab_summary moves its backing partition and is saved - * once moved; the slab_summary is not permitted to overwrite the previous - * recovery journal space. - * - * The slab_summary does not have its own version information, but relies on - * the VDO volume version number. - **/ - -/** - * A slab status is a very small structure for use in determining the ordering - * of slabs in the scrubbing process. - **/ -struct slab_status { - slab_count_t slab_number; - bool is_clean; - uint8_t emptiness; -}; - -/** - * Create a slab summary. - * - * @param [in] vdo The vdo - * @param [in] partition The partition to hold the summary - * @param [in] thread_config The thread config of the VDO - * @param [in] slab_size_shift The number of bits in the slab - * size - * @param [in] maximum_free_blocks_per_slab The maximum number of free blocks - * a slab can have - * @param [in] read_only_notifier The context for entering - * read-only mode - * @param [out] slab_summary_ptr A pointer to hold the summary - * - * @return VDO_SUCCESS or an error - **/ -int __must_check -make_vdo_slab_summary(struct vdo *vdo, - struct partition *partition, - const struct thread_config *thread_config, - unsigned int slab_size_shift, - block_count_t maximum_free_blocks_per_slab, - struct read_only_notifier *read_only_notifier, - struct slab_summary **slab_summary_ptr); - -/** - * Destroy a slab summary. - * - * @param summary The slab summary to free - **/ -void free_vdo_slab_summary(struct slab_summary *summary); - -/** - * Get the portion of the slab summary for a specified zone. - * - * @param summary The slab summary - * @param zone The zone - * - * @return The portion of the slab summary for the specified zone - **/ -struct slab_summary_zone * __must_check -vdo_get_slab_summary_for_zone(struct slab_summary *summary, zone_count_t zone); - -/** - * Drain a zone of the slab summary. - * - * @param summary_zone The zone to drain - * @param operation The type of drain to perform - * @param parent The object to notify when the suspend is complete - **/ -void drain_vdo_slab_summary_zone(struct slab_summary_zone *summary_zone, - const struct admin_state_code *operation, - struct vdo_completion *parent); - -/** - * Resume a zone of the slab summary. - * - * @param summary_zone The zone to resume - * @param parent The object to notify when the zone is resumed - **/ -void resume_vdo_slab_summary_zone(struct slab_summary_zone *summary_zone, - struct vdo_completion *parent); - -/** - * Update the entry for a slab. - * - * @param summary_zone The slab_summary_zone for the zone of the slab - * @param waiter The waiter that is updating the summary - * @param slab_number The slab number to update - * @param tail_block_offset The offset of slab journal's tail block - * @param load_ref_counts Whether the ref_counts must be loaded from the - * layer on the next load - * @param is_clean Whether the slab is clean - * @param free_blocks The number of free blocks - **/ -void vdo_update_slab_summary_entry(struct slab_summary_zone *summary_zone, - struct waiter *waiter, - slab_count_t slab_number, - tail_block_offset_t tail_block_offset, - bool load_ref_counts, - bool is_clean, - block_count_t free_blocks); - -/** - * Get the stored tail block offset for a slab. 
- * - * @param summary_zone The slab_summary_zone to use - * @param slab_number The slab number to get the offset for - * - * @return The tail block offset for the slab - **/ -tail_block_offset_t __must_check -vdo_get_summarized_tail_block_offset(struct slab_summary_zone *summary_zone, - slab_count_t slab_number); - -/** - * Whether ref_counts must be loaded from the layer. - * - * @param summary_zone The slab_summary_zone to use - * @param slab_number The slab number to get information for - * - * @return Whether ref_counts must be loaded - **/ -bool __must_check vdo_must_load_ref_counts(struct slab_summary_zone *summary_zone, - slab_count_t slab_number); - -/** - * Get the stored cleanliness information for a single slab. - * - * @param summary_zone The slab_summary_zone to use - * @param slab_number The slab number to get information for - * - * @return Whether the slab is clean - **/ -bool __must_check -vdo_get_summarized_cleanliness(struct slab_summary_zone *summary_zone, - slab_count_t slab_number); - -/** - * Get the stored emptiness information for a single slab. - * - * @param summary_zone The slab_summary_zone to use - * @param slab_number The slab number to get information for - * - * @return An approximation to the free blocks in the slab - **/ -block_count_t __must_check -get_summarized_free_block_count(struct slab_summary_zone *summary_zone, - slab_count_t slab_number); - -/** - * Get the stored ref_counts state information for a single slab. Used - * in testing only. - * - * @param [in] summary_zone The slab_summary_zone to use - * @param [in] slab_number The slab number to get information for - * @param [out] free_block_hint The approximate number of free blocks - * @param [out] is_clean Whether the slab is clean - **/ -void vdo_get_summarized_ref_counts_state(struct slab_summary_zone *summary_zone, - slab_count_t slab_number, - size_t *free_block_hint, - bool *is_clean); - -/** - * Get the stored slab statuses for all slabs in a zone. - * - * @param [in] summary_zone The slab_summary_zone to use - * @param [in] slab_count The number of slabs to fetch - * @param [in,out] statuses An array of slab_status structures to populate - **/ -void vdo_get_summarized_slab_statuses(struct slab_summary_zone *summary_zone, - slab_count_t slab_count, - struct slab_status *statuses); - -/** - * Set the origin of the slab summary relative to the physical layer. - * - * @param summary The slab_summary to update - * @param partition The slab summary partition - **/ -void set_vdo_slab_summary_origin(struct slab_summary *summary, - struct partition *partition); - -/** - * Read in all the slab summary data from the slab summary partition, - * combine all the previously used zones into a single zone, and then - * write the combined summary back out to each possible zones' summary - * region. - * - * @param summary The summary to load - * @param operation The type of load to perform - * @param zones_to_combine The number of zones to be combined; if set to 0, - * all of the summary will be initialized as new. - * @param parent The parent of this operation - **/ -void load_vdo_slab_summary(struct slab_summary *summary, - const struct admin_state_code *operation, - zone_count_t zones_to_combine, - struct vdo_completion *parent); - -/** - * Fetch the cumulative statistics for all slab summary zones in a summary. 
- * - * @param summary The summary in question - * - * @return the cumulative slab summary statistics for the summary - **/ -struct slab_summary_statistics __must_check -get_vdo_slab_summary_statistics(const struct slab_summary *summary); - -#endif // SLAB_SUMMARY_H diff --git a/vdo/slabSummaryFormat.h b/vdo/slabSummaryFormat.h deleted file mode 100644 index ded219da..00000000 --- a/vdo/slabSummaryFormat.h +++ /dev/null @@ -1,104 +0,0 @@ -/* - * Copyright Red Hat - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/sulfur/src/c++/vdo/base/slabSummaryFormat.h#6 $ - */ - -#ifndef SLAB_SUMMARY_FORMAT_H -#define SLAB_SUMMARY_FORMAT_H - -#include "constants.h" -#include "types.h" - -/** - * The offset of a slab journal tail block. - **/ -typedef uint8_t tail_block_offset_t; - -enum { - VDO_SLAB_SUMMARY_FULLNESS_HINT_BITS = 6, -}; - -struct slab_summary_entry { - /** Bits 7..0: The offset of the tail block within the slab journal */ - tail_block_offset_t tail_block_offset; - -#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ - /** Bits 13..8: A hint about the fullness of the slab */ - unsigned int fullness_hint : 6; - /** Bit 14: Whether the ref_counts must be loaded from the layer */ - unsigned int load_ref_counts : 1; - /** Bit 15: The believed cleanliness of this slab */ - unsigned int is_dirty : 1; -#else - /** Bit 15: The believed cleanliness of this slab */ - unsigned int is_dirty : 1; - /** Bit 14: Whether the ref_counts must be loaded from the layer */ - unsigned int load_ref_counts : 1; - /** Bits 13..8: A hint about the fullness of the slab */ - unsigned int fullness_hint : 6; -#endif -} __packed; - -// XXX: These methods shouldn't take a block_size parameter. - -/** - * Returns the size on disk of a single zone of the slab_summary. - * - * @param block_size The block size of the physical layer - * - * @return the number of blocks required to store a single zone of the - * slab_summary on disk - **/ -static inline block_count_t __must_check -get_vdo_slab_summary_zone_size(block_size_t block_size) -{ - slab_count_t entries_per_block = - block_size / sizeof(struct slab_summary_entry); - block_count_t blocks_needed = MAX_VDO_SLABS / entries_per_block; - return blocks_needed; -} - -/** - * Returns the size on disk of the slab_summary structure. - * - * @param block_size The block size of the physical layer - * - * @return the blocks required to store the slab_summary on disk - **/ -static inline block_count_t __must_check -get_vdo_slab_summary_size(block_size_t block_size) -{ - return get_vdo_slab_summary_zone_size(block_size) * MAX_VDO_PHYSICAL_ZONES; -} - -/** - * Computes the shift for slab summary hints. 
- * - * @param slab_size_shift Exponent for the number of blocks per slab - * - * @return The hint shift - **/ -static inline uint8_t __must_check -get_vdo_slab_summary_hint_shift(unsigned int slab_size_shift) -{ - return ((slab_size_shift > VDO_SLAB_SUMMARY_FULLNESS_HINT_BITS) ? - (slab_size_shift - VDO_SLAB_SUMMARY_FULLNESS_HINT_BITS) : 0); -} - -#endif // SLAB_SUMMARY_FORMAT_H diff --git a/vdo/slabSummaryInternals.h b/vdo/slabSummaryInternals.h deleted file mode 100644 index 504fe298..00000000 --- a/vdo/slabSummaryInternals.h +++ /dev/null @@ -1,109 +0,0 @@ -/* - * Copyright Red Hat - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/sulfur/src/c++/vdo/base/slabSummaryInternals.h#4 $ - */ - -#ifndef SLAB_SUMMARY_INTERNALS_H -#define SLAB_SUMMARY_INTERNALS_H - -#include "slabSummary.h" - -#include - -#include "adminState.h" -#include "slabSummaryFormat.h" - -struct slab_summary_block { - /** The zone to which this block belongs */ - struct slab_summary_zone *zone; - /** The index of this block in its zone's summary */ - block_count_t index; - /** Whether this block has a write outstanding */ - bool writing; - /** Ring of updates waiting on the outstanding write */ - struct wait_queue current_update_waiters; - /** Ring of updates waiting on the next write */ - struct wait_queue next_update_waiters; - /** The active slab_summary_entry array for this block */ - struct slab_summary_entry *entries; - /** The vio used to write this block */ - struct vio *vio; - /** The packed entries, one block long, backing the vio */ - char *outgoing_entries; -}; - -/** - * The statistics for all the slab summary zones owned by this slab summary. - * These fields are all mutated only by their physical zone threads, but are - * read by other threads when gathering statistics for the entire depot. 
- **/ -struct atomic_slab_summary_statistics { - /** Number of blocks written */ - atomic64_t blocks_written; -}; - -struct slab_summary_zone { - /** The summary of which this is a zone */ - struct slab_summary *summary; - /** The number of this zone */ - zone_count_t zone_number; - /** Count of the number of blocks currently out for writing */ - block_count_t write_count; - /** The state of this zone */ - struct admin_state state; - /** The array (owned by the blocks) of all entries */ - struct slab_summary_entry *entries; - /** The array of slab_summary_blocks */ - struct slab_summary_block summary_blocks[]; -}; - -struct slab_summary { - /** The context for entering read-only mode */ - struct read_only_notifier *read_only_notifier; - /** The statistics for this slab summary */ - struct atomic_slab_summary_statistics statistics; - /** The start of the slab summary partition relative to the layer */ - physical_block_number_t origin; - /** The number of bits to shift to get a 7-bit fullness hint */ - unsigned int hint_shift; - /** The number of blocks (calculated based on MAX_VDO_SLABS) */ - block_count_t blocks_per_zone; - /** The number of slabs per block (calculated from block size) */ - slab_count_t entries_per_block; - /** The entries for all of the zones the partition can hold */ - struct slab_summary_entry *entries; - /** - * The number of zones which were active at the time of the last update - */ - zone_count_t zones_to_combine; - /** The current number of active zones */ - zone_count_t zone_count; - /** The currently active zones */ - struct slab_summary_zone *zones[]; -}; - -/** - * Treating the current entries buffer as the on-disk value of all zones, - * update every zone to the correct values for every slab. - * - * @param summary The summary whose entries should be combined - **/ -void vdo_slab_summary_combine_zones(struct slab_summary *summary); - -#endif // SLAB_SUMMARY_INTERNALS_H diff --git a/vdo/sparse-cache.c b/vdo/sparse-cache.c new file mode 100644 index 00000000..b018042c --- /dev/null +++ b/vdo/sparse-cache.c @@ -0,0 +1,1149 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright Red Hat + */ + +/** + * The sparse chapter index cache is implemented as a simple array of cache + * entries. Since the cache is small (seven chapters by default), searching + * for a specific virtual chapter is implemented as a linear search. The cache + * replacement policy is least-recently-used (LRU). Again, size of the cache + * allows the LRU order to be maintained by shifting entries in an array list. + * + * The most important property of this cache is the absence of synchronization + * for read operations. Safe concurrent access to the cache by the zone + * threads is controlled by the triage queue and the barrier requests it + * issues to the zone queues. The set of cached chapters does not and must not + * change between the carefully coordinated calls to update_sparse_cache() from + * the zone threads. + * + * The critical invariant for that coordination is the cache membership must + * not change between those updates; the calls to sparse_cache_contains() from + * the zone threads must all receive the same results for any virtual chapter + * number. To ensure that critical invariant, state changes such as "that + * virtual chapter is no longer in the volume" and "skip searching that + * chapter because it has had too many cache misses" are represented + * separately from the cache membership information (the virtual chapter + * number). 
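+ * For example (an illustrative elaboration of the invariant above, using an arbitrary chapter number): if one zone thread observes that virtual chapter 37 is a member of the cache, every other zone thread must observe exactly the same membership until the next coordinated update; a chapter that has fallen off the end of the volume is therefore marked invalid in place rather than being removed from the membership.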
+ * + * As a result of this invariant, we have the guarantee that every zone thread + * will call update_sparse_cache() once and exactly once to request a chapter + * that is not in the cache, and the serialization of the barrier requests + * from the triage queue ensures they will all request the same chapter + * number. This means the only synchronization we need can be provided by a + * pair of thread barriers used only in the update_sparse_cache() call, + * providing a critical section where a single zone thread can drive the cache + * update while all the other zone threads are known to be blocked, waiting in + * the second barrier. Outside that critical section, all the zone threads + * implicitly hold a shared lock. Inside it, the "captain" (the thread that + * was uniquely flagged when passing through the first barrier) holds an + * exclusive lock. No other threads may access or modify the cache, except for + * accessing cache statistics and similar queries. + * + * Cache statistics must only be modified by a single thread, conventionally + * the zone zero thread. All fields that might be frequently updated by that + * thread are kept in separate cache-aligned structures so they will not cause + * cache contention via "false sharing" with the fields that are frequently + * accessed by all of the zone threads. + * + * LRU order is kept independently by each zone thread, and each zone uses its + * own list for searching and cache membership queries. The zone zero list is + * used to decide which chapter to evict when the cache is updated, and its + * search list is copied to the other threads at that time. + * + * The virtual chapter number field of the cache entry is the single field + * indicating whether a chapter is a member of the cache or not. The value + * UINT64_MAX is used to represent a null, undefined, or wildcard + * chapter number. When present in the virtual chapter number field of a + * cached_chapter_index, it indicates that the cache entry is dead, and all + * the other fields of that entry (other than immutable pointers to cache + * memory) are undefined and irrelevant. Any cache entry that is not marked as + * dead is fully defined and a member of the cache--sparse_cache_contains() + * must always return true for any virtual chapter number that appears in any + * of the cache entries. + * + * A chapter index that is a member of the cache may be marked for different + * treatment (disabling search) between calls to update_sparse_cache() in two + * different ways. When a chapter falls off the end of the volume, its virtual + * chapter number will be less than the oldest virtual chapter number. Since + * that chapter is no longer part of the volume, there's no point in continuing + * to search that chapter index. Once invalidated, that virtual chapter will + * still be considered a member of the cache, but it will no longer be searched + * for matching chunk names. + * + * The second mechanism for disabling search is the heuristic based on keeping + * track of the number of consecutive search misses in a given chapter index. + * Once that count exceeds a threshold, the skip_search flag will be set to + * true, causing the chapter to be skipped in the fallback search of the + * entire cache, but still allowing it to be found when searching for a hook + * in that specific chapter. Finding a hook will clear the skip_search flag, + * once again allowing the non-hook searches to use the cache entry.
Again, + * regardless of the state of the skip_search flag, the virtual chapter must + * still be considered a member of the cache for sparse_cache_contains(). + **/ + +#include "sparse-cache.h" + +#include "chapter-index.h" +#include "common.h" +#include "config.h" +#include "index.h" +#include "logger.h" +#include "memory-alloc.h" +#include "permassert.h" +#include "uds-threads.h" + +enum { + /** The number of consecutive search misses that will disable searching + */ + SKIP_SEARCH_THRESHOLD = 20000, + + /** a named constant to use when identifying zone zero */ + ZONE_ZERO = 0 +}; + +/** + * These counters are essentially fields of the struct cached_chapter_index, + * but are segregated into this structure because they are frequently modified. + * They are grouped and aligned to keep them on different cache lines from the + * chapter fields that are accessed far more often than they are updated. + **/ +struct __attribute__((aligned(CACHE_LINE_BYTES))) cached_index_counters { + /** the total number of search hits since this chapter was cached */ + uint64_t search_hits; + + /** the total number of search misses since this chapter was cached */ + uint64_t search_misses; + + /** the number of consecutive search misses since the last cache hit */ + uint64_t consecutive_misses; +}; + +/** + * struct cached_chapter_index is the structure for a cache entry, representing + * a single cached chapter index in the sparse chapter index cache. + **/ +struct __attribute__((aligned(CACHE_LINE_BYTES))) cached_chapter_index { + /* + * The virtual chapter number of the cached chapter index. UINT64_MAX + * means this cache entry is unused. Must only be modified in the + * critical section in update_sparse_cache(). + */ + uint64_t virtual_chapter; + + /* The number of index pages in a chapter */ + unsigned int index_pages_count; + + /* + * This flag is mutable between cache updates, but it rarely changes + * and is frequently accessed, so it groups with the immutable fields. + * + * If set, skip the chapter when searching the entire cache. This flag + * is just a performance optimization. If we do not see a recent + * change to it, it will be corrected when we pass through a memory + * barrier while getting the next request from the queue. So we may do + * one extra search of the chapter index, or miss one deduplication + * opportunity. + */ + bool skip_search; + + /* + * These pointers are immutable during the life of the cache. The + * contents of the arrays change when the cache entry is replaced. + */ + + /* pointer to a cache-aligned array of ChapterIndexPages */ + struct delta_index_page *index_pages; + + /* pointer to an array of volume pages containing the index pages */ + struct volume_page *volume_pages; + + /* + * The cache-aligned counters change often and are placed at the end of + * the structure to prevent false sharing with the more stable fields + * above. + */ + + /* counter values updated by the thread servicing zone zero */ + struct cached_index_counters counters; +}; + +/** + * A search_list represents the permutations of the sparse chapter index cache + * entry array. Those permutations express an ordering on the chapter indexes, + * from most recently accessed to least recently accessed, which is the order + * in which the indexes should be searched and the reverse order in which they + * should be evicted from the cache (LRU cache replacement policy).
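+ * + * For example (an illustration with arbitrary slot numbers, not taken from + * the code below): with a capacity of 4, a search list whose entries array + * holds {3, 1, 0, 2} records that cache slot 3 holds the most recently + * accessed chapter index and slot 2 the least recently accessed one; a search + * probes the slots in the order 3, 1, 0, 2, and (assuming all four entries + * are live) slot 2 would be the first to be evicted.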
+ * + * Cache entries that are dead (virtual_chapter == UINT64_MAX) are kept as a + * suffix of the list, avoiding the need to even iterate over them to search, + * and ensuring that dead entries are replaced before any live entries are + * evicted. + * + * The search list is intended to be instantated for each zone thread, + * avoiding any need for synchronization. The structure is allocated on a + * cache boundary to avoid false sharing of memory cache lines between zone + * threads. + **/ +struct search_list { + /** The number of cached chapter indexes and search list entries */ + uint8_t capacity; + + /** The index in the entries array of the first dead cache entry */ + uint8_t first_dead_entry; + + /** The chapter array indexes representing the chapter search order */ + uint8_t entries[]; +}; + +/** + * search_list_iterator captures the fields needed to iterate over the live + * entries in a search list and return the struct cached_chapter_index pointers + * that the search code actually wants to deal with. + **/ +struct search_list_iterator { + /** The search list defining the chapter search iteration order */ + struct search_list *list; + + /** The index of the next entry to return from the search list */ + unsigned int next_entry; + + /** The cached chapters that are referenced by the search list */ + struct cached_chapter_index *chapters; +}; + +/** + * These counter values are essentially fields of the sparse_cache, but are + * segregated into this structure because they are frequently modified. We + * group them and align them to keep them on different cache lines from the + * cache fields that are accessed far more often than they are updated. + **/ +struct sparse_cache_counters { + /** the total number of virtual chapter probes that succeeded */ + uint64_t chapter_hits; + + /** the total number of virtual chapter probes that failed */ + uint64_t chapter_misses; + + /** the total number of cache searches that found a possible match */ + uint64_t search_hits; + + /** the total number of cache searches that found no matches */ + uint64_t search_misses; + + /** the number of cache entries that fell off the end of the volume */ + uint64_t invalidations; + + /** the number of cache entries that were evicted while still valid */ + uint64_t evictions; +} __attribute__((aligned(CACHE_LINE_BYTES))); + +/** + * This is the private structure definition of a sparse_cache. + **/ +struct sparse_cache { + /** the number of cache entries, which is the size of the chapters + * array */ + unsigned int capacity; + + /** the number of zone threads using the cache */ + unsigned int zone_count; + + /** the geometry governing the volume */ + const struct geometry *geometry; + + /** the number of search misses in zone zero that will disable + * searching */ + unsigned int skip_search_threshold; + + /** pointers to the cache-aligned chapter search order for each zone */ + struct search_list *search_lists[MAX_ZONES]; + + /** the thread barriers used to synchronize the zone threads for update + */ + struct barrier begin_cache_update; + struct barrier end_cache_update; + + /** frequently-updated counter fields (cache-aligned) */ + struct sparse_cache_counters counters; + + /** the counted array of chapter index cache entries (cache-aligned) */ + struct cached_chapter_index chapters[]; +}; + +/** + * Initialize a struct cached_chapter_index, allocating the memory for the + * array of ChapterIndexPages and the raw index page data. 
The chapter index + * will be marked as unused (virtual_chapter == UINT64_MAX). + * + * @param chapter the chapter index cache entry to initialize + * @param geometry the geometry governing the volume + **/ +static int __must_check +initialize_cached_chapter_index(struct cached_chapter_index *chapter, + const struct geometry *geometry) +{ + int result; + unsigned int i; + + chapter->virtual_chapter = UINT64_MAX; + chapter->index_pages_count = geometry->index_pages_per_chapter; + + result = UDS_ALLOCATE(chapter->index_pages_count, + struct delta_index_page, + __func__, + &chapter->index_pages); + if (result != UDS_SUCCESS) { + return result; + } + + result = UDS_ALLOCATE(chapter->index_pages_count, + struct volume_page, + "sparse index volume pages", + &chapter->volume_pages); + if (result != UDS_SUCCESS) { + return result; + } + + for (i = 0; i < chapter->index_pages_count; i++) { + result = initialize_volume_page(geometry->bytes_per_page, + &chapter->volume_pages[i]); + if (result != UDS_SUCCESS) { + return result; + } + } + return UDS_SUCCESS; +} + +/** + * Allocate and initialize a new chapter cache search list with the same + * capacity as the cache. The index of each entry in the cache will appear + * exactly once in the array. All the chapters in the cache are assumed to be + * initially dead, so first_dead_entry will be zero and no chapters will be + * returned when the search list is iterated. + * + * @param [in] capacity the number of entries in the search list + * @param [out] list_ptr a pointer in which to return the new search list + **/ +static int __must_check make_search_list(unsigned int capacity, + struct search_list **list_ptr) +{ + struct search_list *list; + unsigned int bytes; + uint8_t i; + int result; + + if (capacity == 0) { + return uds_log_error_strerror(UDS_INVALID_ARGUMENT, + "search list must have entries"); + } + if (capacity > UINT8_MAX) { + return uds_log_error_strerror(UDS_INVALID_ARGUMENT, + "search list capacity must fit in 8 bits"); + } + + /* + * We need three temporary entry arrays for purge_search_list(). + * Allocate them contiguously with the main array. + */ + bytes = sizeof(struct search_list) + (4 * capacity * sizeof(uint8_t)); + result = uds_allocate_cache_aligned(bytes, "search list", &list); + if (result != UDS_SUCCESS) { + return result; + } + + list->capacity = capacity; + list->first_dead_entry = 0; + + /* + * Fill in the indexes of the chapter index cache entries. These will + * be only ever be permuted as the search list is used. + */ + for (i = 0; i < capacity; i++) { + list->entries[i] = i; + } + + *list_ptr = list; + return UDS_SUCCESS; +} + +/** + * Initialize a sparse chapter index cache. + * + * @param cache the sparse cache to initialize + * @param geometry the geometry governing the volume + * @param capacity the number of chapters the cache will hold + * @param zone_count the number of zone threads using the cache + * + * @return UDS_SUCCESS or an error code + **/ +static int __must_check initialize_sparse_cache(struct sparse_cache *cache, + const struct geometry *geometry, + unsigned int capacity, + unsigned int zone_count) +{ + unsigned int i; + int result; + + cache->geometry = geometry; + cache->capacity = capacity; + cache->zone_count = zone_count; + + /* + * Scale down the skip threshold by the number of zones since we count + * the chapter search misses only in zone zero. 
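+	 * For example, with the default SKIP_SEARCH_THRESHOLD of 20000 and
+	 * four zone threads, a chapter stops being searched once zone zero
+	 * has counted 5000 consecutive misses for it.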
+ */ + cache->skip_search_threshold = (SKIP_SEARCH_THRESHOLD / zone_count); + + result = uds_initialize_barrier(&cache->begin_cache_update, zone_count); + if (result != UDS_SUCCESS) { + return result; + } + result = uds_initialize_barrier(&cache->end_cache_update, zone_count); + if (result != UDS_SUCCESS) { + return result; + } + for (i = 0; i < capacity; i++) { + result = initialize_cached_chapter_index(&cache->chapters[i], + geometry); + if (result != UDS_SUCCESS) { + return result; + } + } + + /* Allocate each zone's independent LRU order. */ + for (i = 0; i < zone_count; i++) { + result = make_search_list(capacity, &cache->search_lists[i]); + if (result != UDS_SUCCESS) { + return result; + } + } + return UDS_SUCCESS; +} + +int make_sparse_cache(const struct geometry *geometry, + unsigned int capacity, + unsigned int zone_count, + struct sparse_cache **cache_ptr) +{ + unsigned int bytes = + (sizeof(struct sparse_cache) + + (capacity * sizeof(struct cached_chapter_index))); + + struct sparse_cache *cache; + int result = uds_allocate_cache_aligned(bytes, "sparse cache", &cache); + + if (result != UDS_SUCCESS) { + return result; + } + + result = + initialize_sparse_cache(cache, geometry, capacity, zone_count); + if (result != UDS_SUCCESS) { + free_sparse_cache(cache); + return result; + } + + *cache_ptr = cache; + return UDS_SUCCESS; +} + +size_t get_sparse_cache_memory_size(const struct sparse_cache *cache) +{ + /* + * Count the delta_index_page as cache memory, but ignore all other + * overhead. + */ + size_t page_size = (sizeof(struct delta_index_page) + + cache->geometry->bytes_per_page); + size_t chapter_size = + (page_size * cache->geometry->index_pages_per_chapter); + return (cache->capacity * chapter_size); +} + +/** + * Assign a new value to the skip_search flag of a cached chapter index. + * + * @param chapter the chapter index cache entry to modify + * @param skip_search the new value of the skip_search falg + **/ +static INLINE void set_skip_search(struct cached_chapter_index *chapter, + bool skip_search) +{ + /* + * Explicitly check if the field is set so we don't keep dirtying the + * memory cache line on continued search hits. + */ + if (READ_ONCE(chapter->skip_search) != skip_search) { + WRITE_ONCE(chapter->skip_search, skip_search); + } +} + +/** + * Update counters to reflect a chapter access hit and clear the skip_search + * flag on the chapter, if set. + * + * @param cache the cache to update + * @param chapter the cache entry to update + **/ +static void score_chapter_hit(struct sparse_cache *cache, + struct cached_chapter_index *chapter) +{ + cache->counters.chapter_hits += 1; + set_skip_search(chapter, false); +} + +/** + * Update counters to reflect a chapter access miss. + * + * @param cache the cache to update + **/ +static void score_chapter_miss(struct sparse_cache *cache) +{ + cache->counters.chapter_misses += 1; +} + +/** + * Check if the cache entry that is about to be replaced is already dead, and + * if it's not, add to tally of evicted or invalidated cache entries. 
+ * + * @param zone the zone used to find the oldest chapter + * @param cache the cache to update + * @param chapter the cache entry about to be replaced + **/ +static void score_eviction(struct index_zone *zone, + struct sparse_cache *cache, + struct cached_chapter_index *chapter) +{ + if (chapter->virtual_chapter == UINT64_MAX) { + return; + } + if (chapter->virtual_chapter < zone->oldest_virtual_chapter) { + cache->counters.invalidations += 1; + } else { + cache->counters.evictions += 1; + } +} + +/** + * Update counters to reflect a cache search hit. This bumps the hit + * count, clears the miss count, and clears the skip_search flag. + * + * @param cache the cache to update + * @param chapter the cache entry to update + **/ +static void score_search_hit(struct sparse_cache *cache, + struct cached_chapter_index *chapter) +{ + cache->counters.search_hits += 1; + chapter->counters.search_hits += 1; + chapter->counters.consecutive_misses = 0; + set_skip_search(chapter, false); +} + +/** + * Update counters to reflect a cache search miss. This bumps the consecutive + * miss count, and if it goes over skip_search_threshold, sets the skip_search + * flag on the chapter. + * + * @param cache the cache to update + * @param chapter the cache entry to update + **/ +static void score_search_miss(struct sparse_cache *cache, + struct cached_chapter_index *chapter) +{ + cache->counters.search_misses += 1; + chapter->counters.search_misses += 1; + chapter->counters.consecutive_misses += 1; + if (chapter->counters.consecutive_misses > + cache->skip_search_threshold) { + set_skip_search(chapter, true); + } +} + +/** + * Destroy a cached_chapter_index, freeing the memory allocated for the + * ChapterIndexPages and raw index page data. + * + * @param chapter the chapter index cache entry to destroy + **/ +static void destroy_cached_chapter_index(struct cached_chapter_index *chapter) +{ + if (chapter->volume_pages != NULL) { + unsigned int i; + + for (i = 0; i < chapter->index_pages_count; i++) { + destroy_volume_page(&chapter->volume_pages[i]); + } + } + UDS_FREE(chapter->index_pages); + UDS_FREE(chapter->volume_pages); +} + +void free_sparse_cache(struct sparse_cache *cache) +{ + unsigned int i; + + if (cache == NULL) { + return; + } + + for (i = 0; i < cache->zone_count; i++) { + UDS_FREE(UDS_FORGET(cache->search_lists[i])); + } + + for (i = 0; i < cache->capacity; i++) { + struct cached_chapter_index *chapter = &cache->chapters[i]; + + destroy_cached_chapter_index(chapter); + } + + uds_destroy_barrier(&cache->begin_cache_update); + uds_destroy_barrier(&cache->end_cache_update); + UDS_FREE(cache); +} + + +/** + * Prepare to iterate over the live cache entries a search list. + * + * @param list the list defining the live chapters and the search order + * @param chapters the chapter index entries to return from get_next_chapter() + * + * @return an iterator positioned at the start of the search list + **/ +static INLINE struct search_list_iterator +iterate_search_list(struct search_list *list, + struct cached_chapter_index chapters[]) +{ + struct search_list_iterator iterator = { + .list = list, + .next_entry = 0, + .chapters = chapters, + }; + return iterator; +} + +/** + * Check if the search list iterator has another entry to return. 
+ * + * @param iterator the search list iterator + * + * @return true if get_next_chapter() may be called + **/ +static INLINE bool +has_next_chapter(const struct search_list_iterator *iterator) +{ + return (iterator->next_entry < iterator->list->first_dead_entry); +} + +/** + * Return a pointer to the next live chapter in the search list iteration and + * advance the iterator. This must only be called when has_next_chapter() + * returns true. + * + * @param iterator the search list iterator + * + * @return a pointer to the next live chapter index in the search list order + **/ +static INLINE struct cached_chapter_index * +get_next_chapter(struct search_list_iterator *iterator) +{ + return &iterator->chapters[iterator->list + ->entries[iterator->next_entry++]]; +} + +/** + * Rotate the pointers in a prefix of a search list downwards by one item, + * pushing elements deeper into the list and moving a new chapter to the start + * of the search list. This is the "make most recent" operation on the search + * list. + * + * If the search list provided is [ 0 1 2 3 4 ] and the prefix + * length is 4, then 3 is being moved to the front. + * The search list after the call will be [ 3 0 1 2 4 ] and the + * function will return 3. + * + * @param search_list the chapter index search list to rotate + * @param prefix_length the length of the prefix of the list to rotate + * + * @return the array index of the chapter cache entry that is now at the front + * of the search list + **/ +static INLINE uint8_t rotate_search_list(struct search_list *search_list, + uint8_t prefix_length) +{ + /* Grab the value of the last entry in the list prefix. */ + uint8_t most_recent = search_list->entries[prefix_length - 1]; + + if (prefix_length > 1) { + /* + * Push the first N-1 entries down by one entry, overwriting + * the entry we just grabbed. + */ + memmove(&search_list->entries[1], + &search_list->entries[0], + prefix_length - 1); + + /* + * We now have a hole at the front of the list in which we can + * place the rotated entry. + */ + search_list->entries[0] = most_recent; + } + + /* + * This function is also used to move a dead chapter to the front of + * the list, in which case the suffix of dead chapters was pushed down + * too. + */ + if (search_list->first_dead_entry < prefix_length) { + search_list->first_dead_entry += 1; + } + + return most_recent; +} + +bool sparse_cache_contains(struct sparse_cache *cache, + uint64_t virtual_chapter, + unsigned int zone_number) +{ + /* + * The correctness of the barriers depends on the invariant that + * between calls to update_sparse_cache(), the answers this function + * returns must never vary--the result for a given chapter must be + * identical across zones. That invariant must be maintained even if + * the chapter falls off the end of the volume, or if searching it is + * disabled because of too many search misses. + */ + + /* Get the chapter search order for this zone thread. */ + struct search_list_iterator iterator = + iterate_search_list(cache->search_lists[zone_number], + cache->chapters); + while (has_next_chapter(&iterator)) { + struct cached_chapter_index *chapter = + get_next_chapter(&iterator); + if (virtual_chapter == chapter->virtual_chapter) { + if (zone_number == ZONE_ZERO) { + score_chapter_hit(cache, chapter); + } + + /* Move the chapter to the front of the search list. */ + rotate_search_list(iterator.list, iterator.next_entry); + return true; + } + } + + /* The specified virtual chapter isn't cached. 
*/ + if (zone_number == ZONE_ZERO) { + score_chapter_miss(cache); + } + return false; +} + +/** + * Purge invalid cache entries, marking them as dead and moving them to the + * end of the search list, then push any chapters that have skip_search set + * down so they follow all the remaining live, valid chapters in the search + * list. This effectively sorts the search list into three regions--active, + * skippable, and dead--while maintaining the LRU ordering that already + * existed (a stable sort). + * + * This operation must only be called during the critical section in + * update_sparse_cache() since it effectively changes cache membership. + * + * @param search_list the chapter index search list to purge + * @param chapters the chapter index cache entries + * @param oldest_virtual_chapter the oldest virtual chapter + **/ +static void purge_search_list(struct search_list *search_list, + const struct cached_chapter_index chapters[], + uint64_t oldest_virtual_chapter) +{ + uint8_t *entries, *alive, *skipped, *dead; + unsigned int next_alive, next_skipped, next_dead; + int i; + + if (search_list->first_dead_entry == 0) { + /* There are no live entries in the list to purge. */ + return; + } + + /* + * Partition the previously-alive entries in the list into three + * temporary lists, keeping the current LRU search order within each + * list. The element array was allocated with enough space for all four + * lists. + */ + entries = &search_list->entries[0]; + alive = &entries[search_list->capacity]; + skipped = &alive[search_list->capacity]; + dead = &skipped[search_list->capacity]; + next_alive = next_skipped = next_dead = 0; + + for (i = 0; i < search_list->first_dead_entry; i++) { + uint8_t entry = entries[i]; + const struct cached_chapter_index *chapter = &chapters[entry]; + + if ((chapter->virtual_chapter < oldest_virtual_chapter) || + (chapter->virtual_chapter == UINT64_MAX)) { + dead[next_dead++] = entry; + } else if (chapter->skip_search) { + skipped[next_skipped++] = entry; + } else { + alive[next_alive++] = entry; + } + } + + /* + * Copy the temporary lists back to the search list so we wind up with + * [ alive, alive, skippable, new-dead, new-dead, old-dead, old-dead ] + */ + memcpy(entries, alive, next_alive); + entries += next_alive; + + memcpy(entries, skipped, next_skipped); + entries += next_skipped; + + memcpy(entries, dead, next_dead); + /* The first dead entry is now the start of the copied dead list. */ + search_list->first_dead_entry = (next_alive + next_skipped); +} + +/** + * Cache a chapter index, reading all the index pages from the volume and + * initializing the array of ChapterIndexPages in the cache entry to represent + * them. The virtual_chapter field of the cache entry will be set to UINT64_MAX + * if there is any error since the remaining mutable fields will be in an + * undefined state. + * + * @param chapter the chapter index cache entry to replace + * @param virtual_chapter the virtual chapter number of the index to read + * @param volume the volume containing the chapter index + * + * @return UDS_SUCCESS or an error code + **/ +static int __must_check +cache_chapter_index(struct cached_chapter_index *chapter, + uint64_t virtual_chapter, + const struct volume *volume) +{ + int result; + /* Mark the cached chapter as unused in case the update fails midway. */ + chapter->virtual_chapter = UINT64_MAX; + + /* + * Read all the page data and initialize the entire delta_index_page + * array. (It's not safe for the zone threads to do it lazily--they'll + * race.) 
+ */ + result = read_chapter_index_from_volume(volume, + virtual_chapter, + chapter->volume_pages, + chapter->index_pages); + if (result != UDS_SUCCESS) { + return result; + } + + /* Reset all chapter counter values to zero. */ + chapter->counters.search_hits = 0; + chapter->counters.search_misses = 0; + chapter->counters.consecutive_misses = 0; + + /* Mark the entry as valid--it's now in the cache. */ + chapter->virtual_chapter = virtual_chapter; + chapter->skip_search = false; + + return UDS_SUCCESS; +} + +/** + * Copy the contents of one search list to another. + * + * @param source the list to copy + * @param target the list to replace + **/ +static INLINE void copy_search_list(const struct search_list *source, + struct search_list *target) +{ + *target = *source; + memcpy(target->entries, source->entries, source->capacity); +} + +int update_sparse_cache(struct index_zone *zone, uint64_t virtual_chapter) +{ + int result = UDS_SUCCESS; + const struct uds_index *index = zone->index; + struct sparse_cache *cache = index->volume->sparse_cache; + + /* + * If the chapter is already in the cache, we don't need to do a thing + * except update the search list order, which this check does. + */ + if (sparse_cache_contains(cache, virtual_chapter, zone->id)) { + return UDS_SUCCESS; + } + + /* + * Wait for every zone thread to have reached its corresponding barrier + * request and invoked this function before starting to modify the + * cache. + */ + uds_enter_barrier(&cache->begin_cache_update, NULL); + + /* + * This is the start of the critical section: the zone zero thread is + * captain, effectively holding an exclusive lock on the sparse cache. + * All the other zone threads must do nothing between the two barriers. + * They will wait at the end_cache_update barrier for the captain to + * finish the update. + */ + + if (zone->id == ZONE_ZERO) { + unsigned int z; + /* Purge invalid chapters from the LRU search list. */ + struct search_list *zone_zero_list = + cache->search_lists[ZONE_ZERO]; + purge_search_list(zone_zero_list, + cache->chapters, + zone->oldest_virtual_chapter); + + /* + * First check that the desired chapter is still in the volume. + * If it's not, the hook fell out of the index and there's + * nothing to do for it. + */ + if (virtual_chapter >= index->oldest_virtual_chapter) { + /* + * Evict the least recently used live chapter, or + * replace a dead cache entry, all by rotating the the + * last list entry to the front. + */ + struct cached_chapter_index *victim = + &cache->chapters[rotate_search_list(zone_zero_list, + cache->capacity)]; + + /* + * Check if the victim is already dead, and if it's + * not, add to the tally of evicted or invalidated + * cache entries. + */ + score_eviction(zone, cache, victim); + + /* + * Read the index page bytes and initialize the page + * array. + */ + result = cache_chapter_index(victim, virtual_chapter, + index->volume); + } + + /* + * Copy the new search list state to all the other zone threads + * so they'll get the result of pruning and see the new + * chapter. + */ + for (z = 1; z < cache->zone_count; z++) { + copy_search_list(zone_zero_list, + cache->search_lists[z]); + } + } + + /* + * This is the end of the critical section. All cache invariants must + * have been restored--it will be shared/read-only again beyond the + * barrier. + */ + + uds_enter_barrier(&cache->end_cache_update, NULL); + return result; +} + +/** + * Release the all cached page data for a cached_chapter_index. 
+ * + * @param chapter the chapter index cache entry to release + **/ +static void release_cached_chapter_index(struct cached_chapter_index *chapter) +{ + if (chapter->volume_pages != NULL) { + unsigned int i; + + for (i = 0; i < chapter->index_pages_count; i++) { + release_volume_page(&chapter->volume_pages[i]); + } + } +} + +void invalidate_sparse_cache(struct sparse_cache *cache) +{ + unsigned int i; + + if (cache == NULL) { + return; + } + for (i = 0; i < cache->capacity; i++) { + struct cached_chapter_index *chapter = &cache->chapters[i]; + + chapter->virtual_chapter = UINT64_MAX; + release_cached_chapter_index(chapter); + } +} + +/** + * Check if a cached sparse chapter index should be skipped over in the search + * for a chunk name. Filters out unused, invalid, disabled, and irrelevant + * cache entries. + * + * @param zone the zone doing the check + * @param chapter the cache entry search candidate + * @param virtual_chapter the virtual_chapter containing a hook, or UINT64_MAX + * if searching the whole cache for a non-hook + * + * @return true if the provided chapter index should be skipped + **/ +static INLINE bool +should_skip_chapter_index(const struct index_zone *zone, + const struct cached_chapter_index *chapter, + uint64_t virtual_chapter) +{ + /* + * Don't search unused entries (contents undefined) or invalid entries + * (the chapter is no longer the zone's view of the volume). + */ + if ((chapter->virtual_chapter == UINT64_MAX) || + (chapter->virtual_chapter < zone->oldest_virtual_chapter)) { + return true; + } + + if (virtual_chapter != UINT64_MAX) { + /* + * If the caller specified a virtual chapter, only search the + * cache entry containing that chapter. + */ + return (virtual_chapter != chapter->virtual_chapter); + } else { + /* + * When searching the entire cache, save time by skipping over + * chapters that have had too many consecutive misses. + */ + return READ_ONCE(chapter->skip_search); + } +} + +/** + * Search a single cached sparse chapter index for a chunk name, returning the + * record page number that may contain the name. + * + * @param [in] chapter the cache entry for the chapter to search + * @param [in] geometry the geometry governing the volume + * @param [in] index_page_map the index page number map for the volume + * @param [in] name the chunk name to search for + * @param [out] record_page_ptr the record page number of a match, else + * NO_CHAPTER_INDEX_ENTRY if nothing matched + * + * @return UDS_SUCCESS or an error code + **/ +static int __must_check +search_cached_chapter_index(struct cached_chapter_index *chapter, + const struct geometry *geometry, + const struct index_page_map *index_page_map, + const struct uds_chunk_name *name, + int *record_page_ptr) +{ + /* + * Find the index_page_number in the chapter that would have the chunk + * name. + */ + unsigned int physical_chapter = + map_to_physical_chapter(geometry, chapter->virtual_chapter); + unsigned int index_page_number = + find_index_page_number(index_page_map, name, physical_chapter); + + return search_chapter_index_page(&chapter->index_pages[index_page_number], + geometry, + name, + record_page_ptr); +} + +int search_sparse_cache(struct index_zone *zone, + const struct uds_chunk_name *name, + uint64_t *virtual_chapter_ptr, + int *record_page_ptr) +{ + struct volume *volume = zone->index->volume; + struct sparse_cache *cache = volume->sparse_cache; + unsigned int zone_number = zone->id; + /* + * If the caller did not specify a virtual chapter, search the entire + * cache. 
+ */ + bool search_all = (*virtual_chapter_ptr == UINT64_MAX); + + /* + * Get the chapter search order for this zone thread, searching the + * chapters from most recently hit to least recently hit. + */ + struct search_list_iterator iterator = + iterate_search_list(cache->search_lists[zone_number], + cache->chapters); + while (has_next_chapter(&iterator)) { + int result; + struct cached_chapter_index *chapter = + get_next_chapter(&iterator); + + /* + * Skip chapters no longer cached, or that have too many search + * misses. + */ + if (should_skip_chapter_index(zone, chapter, + *virtual_chapter_ptr)) { + continue; + } + + result = search_cached_chapter_index(chapter, + cache->geometry, + volume->index_page_map, + name, + record_page_ptr); + if (result != UDS_SUCCESS) { + return result; + } + + /* Did we find an index entry for the name? */ + if (*record_page_ptr != NO_CHAPTER_INDEX_ENTRY) { + if (zone_number == ZONE_ZERO) { + score_search_hit(cache, chapter); + } + + /* Move the chapter to the front of the search list. */ + rotate_search_list(iterator.list, iterator.next_entry); + + /* + * Return a matching entry as soon as it is found. It + * might be a false collision that has a true match in + * another chapter, but that's a very rare case and not + * worth the extra search cost or complexity. + */ + *virtual_chapter_ptr = chapter->virtual_chapter; + return UDS_SUCCESS; + } + + if (zone_number == ZONE_ZERO) { + score_search_miss(cache, chapter); + } + + if (!search_all) { + /* + * We just searched the virtual chapter the caller + * specified and there was no match, so we're done. + */ + break; + } + } + + /* The name was not found in the cache. */ + *record_page_ptr = NO_CHAPTER_INDEX_ENTRY; + return UDS_SUCCESS; +} diff --git a/uds/sparseCache.h b/vdo/sparse-cache.h similarity index 82% rename from uds/sparseCache.h rename to vdo/sparse-cache.h index 4732c3e7..4eaaa141 100644 --- a/uds/sparseCache.h +++ b/vdo/sparse-cache.h @@ -1,31 +1,13 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ /* * Copyright Red Hat - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/uds-releases/krusty/src/uds/sparseCache.h#12 $ */ #ifndef SPARSE_CACHE_H #define SPARSE_CACHE_H -#include "cacheCounters.h" #include "geometry.h" -#include "indexZone.h" -#include "typeDefs.h" +#include "type-defs.h" /** * sparse_cache is the cache of entire chapter indexes from sparse chapters @@ -41,8 +23,8 @@ **/ struct sparse_cache; -// Bare declaration to avoid include dependency loops. -struct uds_index; +/* Bare declaration to avoid include dependency loops. */ +struct index_zone; /** * Allocate and initialize a sparse chapter index cache. 
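The two-barrier "captain" scheme that sparse_cache_contains() and
update_sparse_cache() rely on can be modeled in isolation. The standalone
sketch below is illustrative only and not part of this patch: it uses POSIX
pthread barriers and invented names such as zone_update(), zone_thread(), and
NUM_ZONES in place of the uds barrier type and the real zone threads. It shows
the shape of the critical section: every zone thread arrives at the first
barrier with the same chapter number, zone zero alone modifies the shared
state, and all threads leave through the second barrier before resuming
shared, read-only access.

/* Illustrative model only -- not part of the dm-vdo sources. */
#include <pthread.h>
#include <stdio.h>

#define NUM_ZONES 4

static pthread_barrier_t begin_update;
static pthread_barrier_t end_update;
static unsigned long shared_cache_state;	/* stands in for the sparse cache */

/* Every zone thread calls this with the same chapter number. */
static void zone_update(unsigned int zone_id, unsigned long chapter)
{
	pthread_barrier_wait(&begin_update);

	if (zone_id == 0) {
		/*
		 * Zone zero is the "captain": it alone modifies the shared
		 * state while every other zone thread waits in the second
		 * barrier below.
		 */
		shared_cache_state = chapter;
	}

	/* Leaving the second barrier restores shared, read-only access. */
	pthread_barrier_wait(&end_update);
}

struct zone {
	unsigned int id;
};

static void *zone_thread(void *arg)
{
	struct zone *zone = arg;

	zone_update(zone->id, 42);
	return NULL;
}

int main(void)
{
	pthread_t threads[NUM_ZONES];
	struct zone zones[NUM_ZONES];
	unsigned int z;

	pthread_barrier_init(&begin_update, NULL, NUM_ZONES);
	pthread_barrier_init(&end_update, NULL, NUM_ZONES);

	for (z = 0; z < NUM_ZONES; z++) {
		zones[z].id = z;
		pthread_create(&threads[z], NULL, zone_thread, &zones[z]);
	}
	for (z = 0; z < NUM_ZONES; z++)
		pthread_join(&threads[z], NULL);

	pthread_barrier_destroy(&begin_update);
	pthread_barrier_destroy(&end_update);
	printf("cache now holds chapter %lu\n", shared_cache_state);
	return 0;
}

In the real code, the work the captain does between the two barriers is
purge_search_list(), eviction scoring, cache_chapter_index(), and copying the
zone zero search list to the other zones, as seen in update_sparse_cache().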
diff --git a/vdo/statistics.h b/vdo/statistics.h index a0e98ff9..b922c8a9 100644 --- a/vdo/statistics.h +++ b/vdo/statistics.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ /* * Copyright Red Hat * diff --git a/vdo/statusCodes.c b/vdo/status-codes.c similarity index 71% rename from vdo/statusCodes.c rename to vdo/status-codes.c index 574f8fb4..cdd2fe14 100644 --- a/vdo/statusCodes.c +++ b/vdo/status-codes.c @@ -1,30 +1,14 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * Copyright Red Hat - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/sulfur/src/c++/vdo/base/statusCodes.c#18 $ */ -#include "statusCodes.h" +#include "status-codes.h" + #include "errors.h" #include "permassert.h" -#include "threadOnce.h" -#include "uds.h" +#include "uds-threads.h" const struct error_info vdo_status_list[] = { { "VDO_NOT_IMPLEMENTED", "Not implemented" }, @@ -70,15 +54,15 @@ const struct error_info vdo_status_list[] = { { "VDO_CANT_ADD_SYSFS_NODE", "Failed to add sysfs node" }, }; -static once_state_t vdo_status_codes_registered = ONCE_STATE_INITIALIZER; +static atomic_t vdo_status_codes_registered = ATOMIC_INIT(0); static int status_code_registration_result; -/**********************************************************************/ static void do_status_code_registration(void) { int result; + STATIC_ASSERT((VDO_STATUS_CODE_LAST - VDO_STATUS_CODE_BASE) == - COUNT_OF(vdo_status_list)); + ARRAY_SIZE(vdo_status_list)); result = register_error_block("VDO Status", VDO_STATUS_CODE_BASE, @@ -100,28 +84,45 @@ static void do_status_code_registration(void) (result == UDS_SUCCESS) ? VDO_SUCCESS : result; } -/**********************************************************************/ -int register_vdo_status_codes(void) +/** + * vdo_register_status_codes() - Register the VDO status codes if + * needed. + * Return: A success or error code. + */ +int vdo_register_status_codes(void) { perform_once(&vdo_status_codes_registered, do_status_code_registration); return status_code_registration_result; } -/**********************************************************************/ -int map_to_system_error(int error) +/** + * vdo_map_to_system_error() - Given an error code, return a value we + * can return to the OS. + * @error: The error code to convert. + * + * The input error code may be a system-generated value (such as + * -EIO), an errno macro used in our code (such as EIO), or a UDS or + * VDO status code; the result must be something the rest of the OS + * can consume (negative errno values such as -EIO, in the case of the + * kernel). + * + * Return: A system error code value. 
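+ *
+ * For example, VDO_NO_SPACE is mapped to -ENOSPC, while unrecognized
+ * internal status codes are logged and mapped to -EIO.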
+ */ +int vdo_map_to_system_error(int error) { - char error_name[80], error_message[ERRBUF_SIZE]; + char error_name[UDS_MAX_ERROR_NAME_SIZE]; + char error_message[UDS_MAX_ERROR_MESSAGE_SIZE]; - // 0 is success, negative a system error code + /* 0 is success, negative a system error code */ if (likely(error <= 0)) { return error; } if (error < 1024) { - // errno macro used without negating - may be a minor bug + /* errno macro used without negating - may be a minor bug */ return -error; } - // VDO or UDS error + /* VDO or UDS error */ switch (error) { case VDO_NO_SPACE: return -ENOSPC; @@ -131,9 +132,9 @@ int map_to_system_error(int error) uds_log_info("%s: mapping internal status code %d (%s: %s) to EIO", __func__, error, - string_error_name(error, - error_name, - sizeof(error_name)), + uds_string_error_name(error, + error_name, + sizeof(error_name)), uds_string_error(error, error_message, sizeof(error_message))); diff --git a/vdo/status-codes.h b/vdo/status-codes.h new file mode 100644 index 00000000..189b88dc --- /dev/null +++ b/vdo/status-codes.h @@ -0,0 +1,114 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright Red Hat + */ + +#ifndef STATUS_CODES_H +#define STATUS_CODES_H + +#include "errors.h" + +enum { + UDS_BLOCK_SIZE = UDS_ERROR_CODE_BLOCK_END - UDS_ERROR_CODE_BASE, + VDO_BLOCK_START = UDS_ERROR_CODE_BLOCK_END, + VDO_BLOCK_END = VDO_BLOCK_START + UDS_BLOCK_SIZE, + PRP_BLOCK_START = VDO_BLOCK_END, + PRP_BLOCK_END = PRP_BLOCK_START + UDS_BLOCK_SIZE, +}; + +/* + * VDO-specific status codes. + */ +enum vdo_status_codes { + /* successful result */ + VDO_SUCCESS, + /* base of all VDO errors */ + VDO_STATUS_CODE_BASE = VDO_BLOCK_START, + /* we haven't written this yet */ + VDO_NOT_IMPLEMENTED = VDO_STATUS_CODE_BASE, + /* input out of range */ + VDO_OUT_OF_RANGE, + /* an invalid reference count would result */ + VDO_REF_COUNT_INVALID, + /* a free block could not be allocated */ + VDO_NO_SPACE, + /* unexpected EOF on block read */ + VDO_UNEXPECTED_EOF, + /* improper or missing configuration option */ + VDO_BAD_CONFIGURATION, + /* socket opening or binding problem */ + VDO_SOCKET_ERROR, + /* read or write on non-aligned offset */ + VDO_BAD_ALIGNMENT, + /* prior operation still in progress */ + VDO_COMPONENT_BUSY, + /* page contents incorrect or corrupt data */ + VDO_BAD_PAGE, + /* unsupported version of some component */ + VDO_UNSUPPORTED_VERSION, + /* component id mismatch in decoder */ + VDO_INCORRECT_COMPONENT, + /* parameters have conflicting values */ + VDO_PARAMETER_MISMATCH, + /* the block size is too small */ + VDO_BLOCK_SIZE_TOO_SMALL, + /* no partition exists with a given id */ + VDO_UNKNOWN_PARTITION, + /* a partition already exists with a given id */ + VDO_PARTITION_EXISTS, + /* the VDO is not in read-only mode */ + VDO_NOT_READ_ONLY, + /* physical block growth of too few blocks */ + VDO_INCREMENT_TOO_SMALL, + /* incorrect checksum */ + VDO_CHECKSUM_MISMATCH, + /* the recovery journal is full */ + VDO_RECOVERY_JOURNAL_FULL, + /* a lock is held incorrectly */ + VDO_LOCK_ERROR, + /* the VDO is in read-only mode */ + VDO_READ_ONLY, + /* the VDO is shutting down */ + VDO_SHUTTING_DOWN, + /* the recovery journal has corrupt entries */ + VDO_CORRUPT_JOURNAL, + /* exceeds maximum number of slabs supported */ + VDO_TOO_MANY_SLABS, + /* a compressed block fragment is invalid */ + VDO_INVALID_FRAGMENT, + /* action is unsupported while rebuilding */ + VDO_RETRY_AFTER_REBUILD, + /* the extended command is not known */ + VDO_UNKNOWN_COMMAND, + /* bad extended command parameters 
*/ + VDO_COMMAND_ERROR, + /* cannot determine sizes to fit */ + VDO_CANNOT_DETERMINE_SIZE, + /* a block map entry is invalid */ + VDO_BAD_MAPPING, + /* read cache has no free slots */ + VDO_READ_CACHE_BUSY, + /* bio_add_page failed */ + VDO_BIO_CREATION_FAILED, + /* bad magic number */ + VDO_BAD_MAGIC, + /* bad nonce */ + VDO_BAD_NONCE, + /* sequence number overflow */ + VDO_JOURNAL_OVERFLOW, + /* the VDO is not in a state to perform an admin operation */ + VDO_INVALID_ADMIN_STATE, + /* failure adding a sysfs node */ + VDO_CANT_ADD_SYSFS_NODE, + /* one more than last error code */ + VDO_STATUS_CODE_LAST, + VDO_STATUS_CODE_BLOCK_END = VDO_BLOCK_END +}; + +extern const struct error_info vdo_status_list[]; + +int vdo_register_status_codes(void); + +int vdo_map_to_system_error(int error); + +#endif /* STATUS_CODES_H */ diff --git a/vdo/statusCodes.h b/vdo/statusCodes.h deleted file mode 100644 index 2c191425..00000000 --- a/vdo/statusCodes.h +++ /dev/null @@ -1,146 +0,0 @@ -/* - * Copyright Red Hat - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/sulfur/src/c++/vdo/base/statusCodes.h#12 $ - */ - -#ifndef STATUS_CODES_H -#define STATUS_CODES_H - -#include "errors.h" - -enum { - UDS_BLOCK_SIZE = UDS_ERROR_CODE_BLOCK_END - UDS_ERROR_CODE_BASE, - VDO_BLOCK_START = UDS_ERROR_CODE_BLOCK_END, - VDO_BLOCK_END = VDO_BLOCK_START + UDS_BLOCK_SIZE, - PRP_BLOCK_START = VDO_BLOCK_END, - PRP_BLOCK_END = PRP_BLOCK_START + UDS_BLOCK_SIZE, -}; - -/** - * VDO-specific status codes. 
- **/ -enum vdo_status_codes { - /** successful result */ - VDO_SUCCESS = 0, - /** base of all VDO errors */ - VDO_STATUS_CODE_BASE = VDO_BLOCK_START, - /** we haven't written this yet */ - VDO_NOT_IMPLEMENTED = VDO_STATUS_CODE_BASE, - /** input out of range */ - VDO_OUT_OF_RANGE, - /** an invalid reference count would result */ - VDO_REF_COUNT_INVALID, - /** a free block could not be allocated */ - VDO_NO_SPACE, - /** unexpected EOF on block read */ - VDO_UNEXPECTED_EOF, - /** improper or missing configuration option */ - VDO_BAD_CONFIGURATION, - /** socket opening or binding problem */ - VDO_SOCKET_ERROR, - /** read or write on non-aligned offset */ - VDO_BAD_ALIGNMENT, - /** prior operation still in progress */ - VDO_COMPONENT_BUSY, - /** page contents incorrect or corrupt data */ - VDO_BAD_PAGE, - /** unsupported version of some component */ - VDO_UNSUPPORTED_VERSION, - /** component id mismatch in decoder */ - VDO_INCORRECT_COMPONENT, - /** parameters have conflicting values */ - VDO_PARAMETER_MISMATCH, - /** the block size is too small */ - VDO_BLOCK_SIZE_TOO_SMALL, - /** no partition exists with a given id */ - VDO_UNKNOWN_PARTITION, - /** a partition already exists with a given id */ - VDO_PARTITION_EXISTS, - /** the VDO is not in read-only mode */ - VDO_NOT_READ_ONLY, - /** physical block growth of too few blocks */ - VDO_INCREMENT_TOO_SMALL, - /** incorrect checksum */ - VDO_CHECKSUM_MISMATCH, - /** the recovery journal is full */ - VDO_RECOVERY_JOURNAL_FULL, - /** a lock is held incorrectly */ - VDO_LOCK_ERROR, - /** the VDO is in read-only mode */ - VDO_READ_ONLY, - /** the VDO is shutting down */ - VDO_SHUTTING_DOWN, - /** the recovery journal has corrupt entries */ - VDO_CORRUPT_JOURNAL, - /** exceeds maximum number of slabs supported */ - VDO_TOO_MANY_SLABS, - /** a compressed block fragment is invalid */ - VDO_INVALID_FRAGMENT, - /** action is unsupported while rebuilding */ - VDO_RETRY_AFTER_REBUILD, - /** the extended command is not known */ - VDO_UNKNOWN_COMMAND, - /** bad extended command parameters */ - VDO_COMMAND_ERROR, - /** cannot determine sizes to fit */ - VDO_CANNOT_DETERMINE_SIZE, - /** a block map entry is invalid */ - VDO_BAD_MAPPING, - /** read cache has no free slots */ - VDO_READ_CACHE_BUSY, - /** bio_add_page failed */ - VDO_BIO_CREATION_FAILED, - /** bad magic number */ - VDO_BAD_MAGIC, - /** bad nonce */ - VDO_BAD_NONCE, - /** sequence number overflow */ - VDO_JOURNAL_OVERFLOW, - /** the VDO is not in a state to perform an admin operation */ - VDO_INVALID_ADMIN_STATE, - /** failure adding a sysfs node */ - VDO_CANT_ADD_SYSFS_NODE, - /** one more than last error code */ - VDO_STATUS_CODE_LAST, - VDO_STATUS_CODE_BLOCK_END = VDO_BLOCK_END -}; - -extern const struct error_info vdo_status_list[]; - -/** - * Register the VDO status codes if needed. - * - * @return a success or error code - **/ -int register_vdo_status_codes(void); - -/** - * Given an error code, return a value we can return to the OS. The - * input error code may be a system-generated value (such as -EIO), an - * errno macro used in our code (such as EIO), or a UDS or VDO status - * code; the result must be something the rest of the OS can consume - * (negative errno values such as -EIO, in the case of the kernel). 
- * - * @param error the error code to convert - * - * @return a system error code value - **/ -int map_to_system_error(int error); - -#endif // STATUS_CODES_H diff --git a/vdo/string-utils.c b/vdo/string-utils.c new file mode 100644 index 00000000..8a7a077c --- /dev/null +++ b/vdo/string-utils.c @@ -0,0 +1,89 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright Red Hat + */ + +#include "string-utils.h" + +#include "errors.h" +#include "logger.h" +#include "memory-alloc.h" +#include "permassert.h" +#include "uds.h" + +int uds_wrap_vsnprintf(const char *what, + char *buf, + size_t buf_size, + int error, + const char *fmt, + va_list ap, + size_t *needed) +{ + int n; + + if (buf == NULL) { + static char nobuf[1]; + + buf = nobuf; + buf_size = 0; + } + n = vsnprintf(buf, buf_size, fmt, ap); + if (n < 0) { + return uds_log_error_strerror(UDS_UNEXPECTED_RESULT, + "%s: vsnprintf failed", what); + } + if (needed) { + *needed = n; + } + if (((size_t) n >= buf_size) && (buf != NULL) && + (error != UDS_SUCCESS)) { + return uds_log_error_strerror(error, + "%s: string too long", what); + } + return UDS_SUCCESS; +} + +int uds_fixed_sprintf(const char *what, + char *buf, + size_t buf_size, + int error, + const char *fmt, + ...) +{ + va_list args; + int result; + + if (buf == NULL) { + return UDS_INVALID_ARGUMENT; + } + va_start(args, fmt); + result = uds_wrap_vsnprintf(what, buf, buf_size, error, fmt, args, + NULL); + va_end(args); + return result; +} + +char *uds_v_append_to_buffer(char *buffer, char *buf_end, const char *fmt, + va_list args) +{ + size_t n = vsnprintf(buffer, buf_end - buffer, fmt, args); + + if (n >= (size_t)(buf_end - buffer)) { + buffer = buf_end; + } else { + buffer += n; + } + return buffer; +} + +char *uds_append_to_buffer(char *buffer, char *buf_end, const char *fmt, ...) +{ + va_list ap; + char *pos; + + va_start(ap, fmt); + pos = uds_v_append_to_buffer(buffer, buf_end, fmt, ap); + va_end(ap); + return pos; +} + diff --git a/uds/stringUtils.h b/vdo/string-utils.h similarity index 50% rename from uds/stringUtils.h rename to vdo/string-utils.h index f863c1d7..c3aed67a 100644 --- a/uds/stringUtils.h +++ b/vdo/string-utils.h @@ -1,22 +1,6 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ /* * Copyright Red Hat - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/uds-releases/krusty/src/uds/stringUtils.h#15 $ */ #ifndef STRING_UTILS_H @@ -26,7 +10,7 @@ #include #include "compiler.h" -#include "typeDefs.h" +#include "type-defs.h" /** * Convert a boolean value to its corresponding "true" or "false" string. @@ -40,21 +24,6 @@ static INLINE const char *uds_bool_to_string(bool value) return (value ? "true" : "false"); } -/** - * Allocate a string built according to format (our version of asprintf). 
- * - * @param [in] what A description of what is being allocated, for error - * logging; if NULL doesn't log anything. - * @param [out] strp The pointer in which to store the allocated string. - * @param [in] fmt The sprintf format parameter. - * - * @return UDS_SUCCESS, or the appropriately translated asprintf error - **/ -int __must_check uds_alloc_sprintf(const char *what, - char **strp, - const char *fmt, ...) - __printf(3, 4); - /** * Write a printf-style string into a fixed-size buffer, returning * errors if it would not fit. (our version of snprintf) @@ -136,70 +105,5 @@ char *uds_v_append_to_buffer(char *buffer, char *buf_end, const char *fmt, va_list args) __printf(3, 0); -/** - * Our version of strtok_r, since some platforma apparently don't define it. - * - * @param str On first call, the string to tokenize. On subsequent - * calls, NULL. - * @param delims The set of delimiter characters. - * @param state_ptr The address of a variable which holds the state of - * the tokenization between calls to uds_next_token. - * - * @return the next token if any, or NULL - **/ -char *uds_next_token(char *str, const char *delims, char **state_ptr); - -/** - * Parse a string representing a decimal uint64_t. - * - * @param str The string. - * @param num Where to put the number. - * - * @return UDS_SUCCESS or the error UDS_INVALID_ARGUMENT if the string - * is not in the correct format. - **/ -int __must_check uds_parse_uint64(const char *str, uint64_t *num); - -/** - * Attempt to convert a string to an integer (base 10) - * - * @param nptr Pointer to string to convert - * @param num The resulting integer - * - * @return UDS_SUCCESS or an error code - **/ -int __must_check uds_string_to_signed_int(const char *nptr, int *num); - -/** - * Attempt to convert a string to a long integer (base 10) - * - * @param nptr Pointer to string to convert - * @param num The resulting long integer - * - * @return UDS_SUCCESS or an error code - **/ -int __must_check uds_string_to_signed_long(const char *nptr, long *num); - -/** - * Attempt to convert a string to an unsigned integer (base 10). - * - * @param nptr Pointer to string to convert - * @param num The resulting unsigned integer - * - * @return UDS_SUCCESS or an error code - **/ -int __must_check -uds_string_to_unsigned_int(const char *nptr, unsigned int *num); - -/** - * Attempt to convert a string to an unsigned long integer (base 10). - * - * @param nptr Pointer to string to convert - * @param num The resulting long unsigned integer - * - * @return UDS_SUCCESS or an error code - **/ -int __must_check -uds_string_to_unsigned_long(const char *nptr, unsigned long *num); #endif /* STRING_UTILS_H */ diff --git a/vdo/super-block-codec.c b/vdo/super-block-codec.c new file mode 100644 index 00000000..a4d54c61 --- /dev/null +++ b/vdo/super-block-codec.c @@ -0,0 +1,201 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright Red Hat + */ + +#include "super-block-codec.h" + +#include "buffer.h" +#include "logger.h" +#include "memory-alloc.h" +#include "permassert.h" + +#include "constants.h" +#include "header.h" +#include "status-codes.h" +#include "vdo.h" + +enum { + SUPER_BLOCK_FIXED_SIZE = VDO_ENCODED_HEADER_SIZE + sizeof(uint32_t), + MAX_COMPONENT_DATA_SIZE = VDO_SECTOR_SIZE - SUPER_BLOCK_FIXED_SIZE, +}; + +static const struct header SUPER_BLOCK_HEADER_12_0 = { + .id = VDO_SUPER_BLOCK, + .version = { + .major_version = 12, + .minor_version = 0, + }, + + /* This is the minimum size, if the super block contains no components. 
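+	 * That minimum is just the trailing uint32_t checksum, since
+	 * SUPER_BLOCK_FIXED_SIZE is the encoded header plus that checksum.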
*/ + .size = SUPER_BLOCK_FIXED_SIZE - VDO_ENCODED_HEADER_SIZE, +}; + +/** + * vdo_initialize_super_block_codec() - Initialize a super block codec. + * @codec: The codec to initialize. + * + * Return: VDO_SUCCESS or an error. + */ +int vdo_initialize_super_block_codec(struct super_block_codec *codec) +{ + int result = make_buffer(MAX_COMPONENT_DATA_SIZE, + &codec->component_buffer); + if (result != UDS_SUCCESS) { + return result; + } + + result = UDS_ALLOCATE(VDO_BLOCK_SIZE, char, "encoded super block", + (char **) &codec->encoded_super_block); + if (result != UDS_SUCCESS) { + return result; + } + + /* + * Even though the buffer is a full block, to avoid the potential + * corruption from a torn write, the entire encoding must fit in the + * first sector. + */ + return wrap_buffer(codec->encoded_super_block, + VDO_SECTOR_SIZE, + 0, + &codec->block_buffer); +} + +/** + * vdo_destroy_super_block_codec() - Free resources in a super block codec. + * @codec: The codec to clean up. + */ +void vdo_destroy_super_block_codec(struct super_block_codec *codec) +{ + free_buffer(UDS_FORGET(codec->block_buffer)); + free_buffer(UDS_FORGET(codec->component_buffer)); + UDS_FREE(codec->encoded_super_block); +} + +/** + * vdo_encode_super_block() - Encode a super block into its on-disk + * representation. + * @codec: The super block codec. + * + * Return: VDO_SUCCESS or an error. + */ +int vdo_encode_super_block(struct super_block_codec *codec) +{ + size_t component_data_size; + uint32_t checksum; + struct header header = SUPER_BLOCK_HEADER_12_0; + struct buffer *buffer = codec->block_buffer; + int result = reset_buffer_end(buffer, 0); + + if (result != VDO_SUCCESS) { + return result; + } + + component_data_size = content_length(codec->component_buffer); + + /* Encode the header. */ + header.size += component_data_size; + result = vdo_encode_header(&header, buffer); + if (result != UDS_SUCCESS) { + return result; + } + + /* Copy the already-encoded component data. */ + result = put_bytes(buffer, component_data_size, + get_buffer_contents(codec->component_buffer)); + if (result != UDS_SUCCESS) { + return result; + } + + /* Compute and encode the checksum. */ + checksum = vdo_crc32(codec->encoded_super_block, + content_length(buffer)); + result = put_uint32_le_into_buffer(buffer, checksum); + if (result != UDS_SUCCESS) { + return result; + } + + return UDS_SUCCESS; +} + +/** + * vdo_decode_super_block() - Decode a super block from its on-disk + * representation. + * @codec: The super block to decode. + * + * Return: VDO_SUCCESS or an error. + */ +int vdo_decode_super_block(struct super_block_codec *codec) +{ + struct header header; + int result; + size_t component_data_size; + uint32_t checksum, saved_checksum; + + /* Reset the block buffer to start decoding the entire first sector. */ + struct buffer *buffer = codec->block_buffer; + + clear_buffer(buffer); + + /* Decode and validate the header. */ + result = vdo_decode_header(buffer, &header); + if (result != VDO_SUCCESS) { + return result; + } + + result = vdo_validate_header(&SUPER_BLOCK_HEADER_12_0, &header, false, + __func__); + if (result != VDO_SUCCESS) { + return result; + } + + if (header.size > content_length(buffer)) { + /* + * We can't check release version or checksum until we know the + * content size, so we have to assume a version mismatch on + * unexpected values. 
+ */ + return uds_log_error_strerror(VDO_UNSUPPORTED_VERSION, + "super block contents too large: %zu", + header.size); + } + + /* Restrict the buffer to the actual payload bytes that remain. */ + result = + reset_buffer_end(buffer, uncompacted_amount(buffer) + header.size); + if (result != VDO_SUCCESS) { + return result; + } + + /* The component data is all the rest, except for the checksum. */ + component_data_size = content_length(buffer) - sizeof(uint32_t); + result = put_buffer(codec->component_buffer, buffer, + component_data_size); + if (result != VDO_SUCCESS) { + return result; + } + + /* + * Checksum everything up to but not including the saved checksum + * itself. + */ + checksum = vdo_crc32(codec->encoded_super_block, + uncompacted_amount(buffer)); + + /* Decode and verify the saved checksum. */ + result = get_uint32_le_from_buffer(buffer, &saved_checksum); + if (result != VDO_SUCCESS) { + return result; + } + + result = ASSERT(content_length(buffer) == 0, + "must have decoded entire superblock payload"); + if (result != VDO_SUCCESS) { + return result; + } + + return ((checksum != saved_checksum) ? VDO_CHECKSUM_MISMATCH + : VDO_SUCCESS); +} + diff --git a/vdo/super-block-codec.h b/vdo/super-block-codec.h new file mode 100644 index 00000000..68857cd4 --- /dev/null +++ b/vdo/super-block-codec.h @@ -0,0 +1,38 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright Red Hat + */ + +#ifndef SUPER_BLOCK_CODEC_H +#define SUPER_BLOCK_CODEC_H + +#include "buffer.h" + +#include "header.h" +#include "types.h" + +/* + * The machinery for encoding and decoding super blocks. + */ +struct super_block_codec { + /* The buffer for encoding and decoding component data */ + struct buffer *component_buffer; + /* + * A sector-sized buffer wrapping the first sector of + * encoded_super_block, for encoding and decoding the entire super + * block. + */ + struct buffer *block_buffer; + /* A 1-block buffer holding the encoded on-disk super block */ + byte *encoded_super_block; +}; + +int __must_check vdo_initialize_super_block_codec(struct super_block_codec *codec); + +void vdo_destroy_super_block_codec(struct super_block_codec *codec); + +int __must_check vdo_encode_super_block(struct super_block_codec *codec); + +int __must_check vdo_decode_super_block(struct super_block_codec *codec); + +#endif /* SUPER_BLOCK_CODEC_H */ diff --git a/vdo/super-block.c b/vdo/super-block.c new file mode 100644 index 00000000..dbca0d20 --- /dev/null +++ b/vdo/super-block.c @@ -0,0 +1,260 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright Red Hat + */ + +#include "super-block.h" + +#include + +#include "logger.h" +#include "memory-alloc.h" +#include "permassert.h" + +#include "completion.h" +#include "io-submitter.h" +#include "kernel-types.h" +#include "status-codes.h" +#include "super-block-codec.h" +#include "types.h" +#include "vio.h" + +struct vdo_super_block { + /* The parent for asynchronous load and save operations */ + struct vdo_completion *parent; + /* The vio for reading and writing the super block to disk */ + struct vio *vio; + /* The super block codec */ + struct super_block_codec codec; + /* Whether this super block may not be written */ + bool unwriteable; +}; + +/** + * allocate_super_block() - Allocate a super block. + * @vdo: The vdo containing the super block on disk. + * @super_block_ptr: A pointer to hold the new super block. + * + * Callers must free the allocated super block even on error. + * + * Return: VDO_SUCCESS or an error. 
+ */ +static int __must_check +allocate_super_block(struct vdo *vdo, + struct vdo_super_block **super_block_ptr) +{ + struct vdo_super_block *super_block; + char *buffer; + int result = UDS_ALLOCATE(1, struct vdo_super_block, __func__, + super_block_ptr); + if (result != UDS_SUCCESS) { + return result; + } + + super_block = *super_block_ptr; + result = vdo_initialize_super_block_codec(&super_block->codec); + if (result != UDS_SUCCESS) { + return result; + } + + buffer = (char *) super_block->codec.encoded_super_block; + return create_metadata_vio(vdo, + VIO_TYPE_SUPER_BLOCK, + VIO_PRIORITY_METADATA, + super_block, + buffer, + &super_block->vio); +} + + +/** + * vdo_free_super_block() - Free a super block. + * @super_block: The super block to free. + */ +void vdo_free_super_block(struct vdo_super_block *super_block) +{ + if (super_block == NULL) { + return; + } + + free_vio(UDS_FORGET(super_block->vio)); + vdo_destroy_super_block_codec(&super_block->codec); + UDS_FREE(super_block); +} + +/** + * finish_super_block_parent() - Finish the parent of a super block + * load or save operation. + * @completion: The super block vio. + * + * This callback is registered in vdo_save_super_block() and + * vdo_load_super_block(). + */ +static void finish_super_block_parent(struct vdo_completion *completion) +{ + struct vdo_super_block *super_block = completion->parent; + struct vdo_completion *parent = super_block->parent; + + super_block->parent = NULL; + vdo_finish_completion(parent, completion->result); +} + +/** + * handle_save_error() - Log a super block save error. + * @completion: The super block vio. + * + * This error handler is registered in vdo_save_super_block(). + */ +static void handle_save_error(struct vdo_completion *completion) +{ + struct vdo_super_block *super_block = completion->parent; + + record_metadata_io_error(as_vio(completion)); + uds_log_error_strerror(completion->result, "super block save failed"); + /* + * Mark the super block as unwritable so that we won't attempt to write + * it again. This avoids the case where a growth attempt fails writing + * the super block with the new size, but the subsequent attempt to + * write out the read-only state succeeds. In this case, writes which + * happened just before the suspend would not be visible if the VDO is + * restarted without rebuilding, but, after a read-only rebuild, the + * effects of those writes would reappear. + */ + super_block->unwriteable = true; + completion->callback(completion); +} + +static void super_block_write_endio(struct bio *bio) +{ + struct vio *vio = bio->bi_private; + struct vdo_super_block *super_block = vio_as_completion(vio)->parent; + struct vdo_completion *parent = super_block->parent; + + continue_vio_after_io(vio, + finish_super_block_parent, + parent->callback_thread_id); +} + +/** + * vdo_save_super_block() - Save a super block. + * @super_block: The super block to save. + * @super_block_offset: The location at which to write the super block. + * @parent: The object to notify when the save is complete. 
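+ *
+ * Illustrative usage (a sketch; the super_block field and offset below are
+ * assumed caller state):
+ *
+ *	vdo_save_super_block(vdo->super_block, offset, parent);
+ *
+ * The save is asynchronous: @parent is finished with VDO_READ_ONLY if an
+ * earlier save failed, with VDO_COMPONENT_BUSY if a save is already in
+ * progress, and otherwise with the result of encoding and writing the
+ * super block.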
+ */ +void vdo_save_super_block(struct vdo_super_block *super_block, + physical_block_number_t super_block_offset, + struct vdo_completion *parent) +{ + int result; + + if (super_block->unwriteable) { + vdo_finish_completion(parent, VDO_READ_ONLY); + return; + } + + if (super_block->parent != NULL) { + vdo_finish_completion(parent, VDO_COMPONENT_BUSY); + return; + } + + result = vdo_encode_super_block(&super_block->codec); + if (result != VDO_SUCCESS) { + vdo_finish_completion(parent, result); + return; + } + + super_block->parent = parent; + super_block->vio->completion.callback_thread_id = + parent->callback_thread_id; + submit_metadata_vio(super_block->vio, + super_block_offset, + super_block_write_endio, + handle_save_error, + REQ_OP_WRITE | REQ_PREFLUSH | REQ_FUA); +} + +/** + * finish_reading_super_block() - Continue after loading the super block. + * @completion: The super block vio. + * + * This callback is registered in vdo_load_super_block(). + */ +static void finish_reading_super_block(struct vdo_completion *completion) +{ + struct vdo_super_block *super_block = completion->parent; + struct vdo_completion *parent = super_block->parent; + + super_block->parent = NULL; + vdo_finish_completion(parent, + vdo_decode_super_block(&super_block->codec)); +} + +/** + * handle_super_block_read_error() - Handle an error reading the super block. + * @completion: The super block vio. + * + * This error handler is registered in vdo_load_super_block(). + */ +static void handle_super_block_read_error(struct vdo_completion *completion) +{ + record_metadata_io_error(as_vio(completion)); + finish_reading_super_block(completion); +} + +static void read_super_block_endio(struct bio *bio) +{ + struct vio *vio = bio->bi_private; + struct vdo_super_block *super_block = vio_as_completion(vio)->parent; + struct vdo_completion *parent = super_block->parent; + + continue_vio_after_io(vio, + finish_reading_super_block, + parent->callback_thread_id); +} + +/** + * vdo_load_super_block() - Allocate a super block and read its contents from + * storage. + * @vdo: The vdo containing the super block on disk. + * @parent: The completion to finish after loading the super block. + * @super_block_offset: The location from which to read the super block. + * @super_block_ptr: A pointer to hold the super block. + * + * If a load error occurs before the super block's own completion can be + * allocated, the parent will be finished with the error. + */ +void vdo_load_super_block(struct vdo *vdo, + struct vdo_completion *parent, + physical_block_number_t super_block_offset, + struct vdo_super_block **super_block_ptr) +{ + struct vdo_super_block *super_block = NULL; + int result = allocate_super_block(vdo, &super_block); + + if (result != VDO_SUCCESS) { + vdo_free_super_block(super_block); + vdo_finish_completion(parent, result); + return; + } + + *super_block_ptr = super_block; + + super_block->parent = parent; + submit_metadata_vio(super_block->vio, + super_block_offset, + read_super_block_endio, + handle_super_block_read_error, + REQ_OP_READ); +} + +/** + * vdo_get_super_block_codec() - Get the super block codec from a super block. + * @super_block: The super block from which to get the component data. + * + * Return: The codec. 
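+ *
+ * For example (an illustrative sketch), once vdo_load_super_block() has
+ * finished its parent successfully, a caller might decode its saved state
+ * from the codec's component buffer:
+ *
+ *	struct super_block_codec *codec =
+ *		vdo_get_super_block_codec(super_block);
+ *	result = decode_component_states(codec->component_buffer);
+ *
+ * where decode_component_states() stands for a hypothetical caller-side
+ * decoder.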
+ */ +struct super_block_codec * +vdo_get_super_block_codec(struct vdo_super_block *super_block) +{ + return &super_block->codec; +} diff --git a/vdo/super-block.h b/vdo/super-block.h new file mode 100644 index 00000000..021c3149 --- /dev/null +++ b/vdo/super-block.h @@ -0,0 +1,28 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright Red Hat + */ + +#ifndef SUPER_BLOCK_H +#define SUPER_BLOCK_H + +#include "kernel-types.h" +#include "types.h" + +struct vdo_super_block; + +void vdo_free_super_block(struct vdo_super_block *super_block); + +void vdo_save_super_block(struct vdo_super_block *super_block, + physical_block_number_t super_block_offset, + struct vdo_completion *parent); + +void vdo_load_super_block(struct vdo *vdo, + struct vdo_completion *parent, + physical_block_number_t super_block_offset, + struct vdo_super_block **super_block_ptr); + +struct super_block_codec * __must_check +vdo_get_super_block_codec(struct vdo_super_block *super_block); + +#endif /* SUPER_BLOCK_H */ diff --git a/vdo/superBlock.c b/vdo/superBlock.c deleted file mode 100644 index 0a8e6223..00000000 --- a/vdo/superBlock.c +++ /dev/null @@ -1,223 +0,0 @@ -/* - * Copyright Red Hat - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/sulfur/src/c++/vdo/base/superBlock.c#19 $ - */ - -#include "superBlock.h" - -#include "logger.h" -#include "memoryAlloc.h" -#include "permassert.h" - -#include "completion.h" -#include "statusCodes.h" -#include "superBlockCodec.h" -#include "types.h" -#include "vio.h" - -struct vdo_super_block { - /** The parent for asynchronous load and save operations */ - struct vdo_completion *parent; - /** The vio for reading and writing the super block to disk */ - struct vio *vio; - /** The super block codec */ - struct super_block_codec codec; - /** Whether this super block may not be written */ - bool unwriteable; -}; - -/** - * Allocate a super block. Callers must free the allocated super block even - * on error. 
- * - * @param [in] vdo The vdo containing the super block on disk - * @param [out] super_block_ptr A pointer to hold the new super block - * - * @return VDO_SUCCESS or an error - **/ -static int __must_check -allocate_super_block(struct vdo *vdo, - struct vdo_super_block **super_block_ptr) -{ - struct vdo_super_block *super_block; - char *buffer; - int result = UDS_ALLOCATE(1, struct vdo_super_block, __func__, - super_block_ptr); - if (result != UDS_SUCCESS) { - return result; - } - - super_block = *super_block_ptr; - result = initialize_vdo_super_block_codec(&super_block->codec); - if (result != UDS_SUCCESS) { - return result; - } - - buffer = (char *) super_block->codec.encoded_super_block; - return create_metadata_vio(vdo, - VIO_TYPE_SUPER_BLOCK, - VIO_PRIORITY_METADATA, - super_block, - buffer, - &super_block->vio); -} - -/**********************************************************************/ -int make_vdo_super_block(struct vdo *vdo, - struct vdo_super_block **super_block_ptr) -{ - struct vdo_super_block *super_block; - int result = allocate_super_block(vdo, &super_block); - if (result != VDO_SUCCESS) { - free_vdo_super_block(super_block); - return result; - } - - *super_block_ptr = super_block; - return VDO_SUCCESS; -} - -/**********************************************************************/ -void free_vdo_super_block(struct vdo_super_block *super_block) -{ - if (super_block == NULL) { - return; - } - - free_vio(UDS_FORGET(super_block->vio)); - destroy_vdo_super_block_codec(&super_block->codec); - UDS_FREE(super_block); -} - -/** - * Finish the parent of a super block load or save operation. This - * callback is registered in save_vdo_super_block() and - * load_vdo_super_block(). - * - * @param completion The super block vio - **/ -static void finish_super_block_parent(struct vdo_completion *completion) -{ - struct vdo_super_block *super_block = completion->parent; - struct vdo_completion *parent = super_block->parent; - super_block->parent = NULL; - finish_vdo_completion(parent, completion->result); -} - -/** - * Log a super block save error. This error handler is registered in - * save_vdo_super_block(). - * - * @param completion The super block vio - **/ -static void handle_save_error(struct vdo_completion *completion) -{ - uds_log_error_strerror(completion->result, "super block save failed"); - /* - * Mark the super block as unwritable so that we won't attempt to write - * it again. This avoids the case where a growth attempt fails writing - * the super block with the new size, but the subsequent attempt to - * write out the read-only state succeeds. In this case, writes which - * happened just before the suspend would not be visible if the VDO is - * restarted without rebuilding, but, after a read-only rebuild, the - * effects of those writes would reappear. 
- */ - ((struct vdo_super_block *) completion->parent)->unwriteable = true; - completion->callback(completion); -} - -/**********************************************************************/ -void save_vdo_super_block(struct vdo_super_block *super_block, - physical_block_number_t super_block_offset, - struct vdo_completion *parent) -{ - int result; - - if (super_block->unwriteable) { - finish_vdo_completion(parent, VDO_READ_ONLY); - return; - } - - if (super_block->parent != NULL) { - finish_vdo_completion(parent, VDO_COMPONENT_BUSY); - return; - } - - result = encode_vdo_super_block(&super_block->codec); - if (result != VDO_SUCCESS) { - finish_vdo_completion(parent, result); - return; - } - - super_block->parent = parent; - super_block->vio->completion.callback_thread_id = - parent->callback_thread_id; - launch_write_metadata_vio_with_flush(super_block->vio, - super_block_offset, - finish_super_block_parent, - handle_save_error, - true, true); -} - -/** - * Continue after loading the super block. This callback is registered - * in load_vdo_super_block(). - * - * @param completion The super block vio - **/ -static void finish_reading_super_block(struct vdo_completion *completion) -{ - struct vdo_super_block *super_block = completion->parent; - struct vdo_completion *parent = super_block->parent; - super_block->parent = NULL; - finish_vdo_completion(parent, - decode_vdo_super_block(&super_block->codec)); -} - -/**********************************************************************/ -void load_vdo_super_block(struct vdo *vdo, - struct vdo_completion *parent, - physical_block_number_t super_block_offset, - struct vdo_super_block **super_block_ptr) -{ - struct vdo_super_block *super_block = NULL; - int result = allocate_super_block(vdo, &super_block); - if (result != VDO_SUCCESS) { - free_vdo_super_block(super_block); - finish_vdo_completion(parent, result); - return; - } - - *super_block_ptr = super_block; - - super_block->parent = parent; - super_block->vio->completion.callback_thread_id = - parent->callback_thread_id; - launch_read_metadata_vio(super_block->vio, - super_block_offset, - finish_reading_super_block, - finish_super_block_parent); -} - -/**********************************************************************/ -struct super_block_codec * -get_vdo_super_block_codec(struct vdo_super_block *super_block) -{ - return &super_block->codec; -} diff --git a/vdo/superBlock.h b/vdo/superBlock.h deleted file mode 100644 index 83c99099..00000000 --- a/vdo/superBlock.h +++ /dev/null @@ -1,85 +0,0 @@ -/* - * Copyright Red Hat - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/sulfur/src/c++/vdo/base/superBlock.h#10 $ - */ - -#ifndef SUPER_BLOCK_H -#define SUPER_BLOCK_H - -#include "types.h" - -struct vdo_super_block; - -/** - * Make a new super block. 
- * - * @param [in] vdo The vdo containing the super block on disk - * @param [out] super_block_ptr A pointer to hold the new super block - * - * @return VDO_SUCCESS or an error - **/ -int __must_check make_vdo_super_block(struct vdo *vdo, - struct vdo_super_block **super_block_ptr); - -/** - * Free a super block. - * - * @param super_block The super block to free - **/ -void free_vdo_super_block(struct vdo_super_block *super_block); - -/** - * Save a super block. - * - * @param super_block The super block to save - * @param super_block_offset The location at which to write the super block - * @param parent The object to notify when the save is complete - **/ -void save_vdo_super_block(struct vdo_super_block *super_block, - physical_block_number_t super_block_offset, - struct vdo_completion *parent); - -/** - * Allocate a super block and read its contents from storage. If a load error - * occurs before the super block's own completion can be allocated, the parent - * will be finished with the error. - * - * @param [in] vdo The vdo containing the super block on disk - * @param [in] parent The completion to finish after loading the - * super block - * @param [in] super_block_offset The location from which to read the super - * block - * @param [out] super_block_ptr A pointer to hold the super block - **/ -void load_vdo_super_block(struct vdo *vdo, - struct vdo_completion *parent, - physical_block_number_t super_block_offset, - struct vdo_super_block **super_block_ptr); - -/** - * Get the super block codec from a super block. - * - * @param super_block The super block from which to get the component data - * - * @return the codec - **/ -struct super_block_codec * __must_check -get_vdo_super_block_codec(struct vdo_super_block *super_block); - -#endif /* SUPER_BLOCK_H */ diff --git a/vdo/superBlockCodec.c b/vdo/superBlockCodec.c deleted file mode 100644 index df91d0f0..00000000 --- a/vdo/superBlockCodec.c +++ /dev/null @@ -1,198 +0,0 @@ -/* - * Copyright Red Hat - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/sulfur/src/c++/vdo/base/superBlockCodec.c#20 $ - */ - -#include "superBlockCodec.h" - -#include "buffer.h" -#include "logger.h" -#include "memoryAlloc.h" -#include "permassert.h" - -#include "checksum.h" -#include "constants.h" -#include "header.h" -#include "statusCodes.h" - -enum { - SUPER_BLOCK_FIXED_SIZE = VDO_ENCODED_HEADER_SIZE + VDO_CHECKSUM_SIZE, - MAX_COMPONENT_DATA_SIZE = VDO_SECTOR_SIZE - SUPER_BLOCK_FIXED_SIZE, -}; - -static const struct header SUPER_BLOCK_HEADER_12_0 = { - .id = VDO_SUPER_BLOCK, - .version = - { - .major_version = 12, - .minor_version = 0, - }, - - // This is the minimum size, if the super block contains no components. 
- .size = SUPER_BLOCK_FIXED_SIZE - VDO_ENCODED_HEADER_SIZE, -}; - -/**********************************************************************/ -int initialize_vdo_super_block_codec(struct super_block_codec *codec) -{ - int result = make_buffer(MAX_COMPONENT_DATA_SIZE, - &codec->component_buffer); - if (result != UDS_SUCCESS) { - return result; - } - - result = UDS_ALLOCATE(VDO_BLOCK_SIZE, char, "encoded super block", - (char **) &codec->encoded_super_block); - if (result != UDS_SUCCESS) { - return result; - } - - // Even though the buffer is a full block, to avoid the potential - // corruption from a torn write, the entire encoding must fit in the - // first sector. - return wrap_buffer(codec->encoded_super_block, - VDO_SECTOR_SIZE, - 0, - &codec->block_buffer); -} - -/**********************************************************************/ -void destroy_vdo_super_block_codec(struct super_block_codec *codec) -{ - free_buffer(UDS_FORGET(codec->block_buffer)); - free_buffer(UDS_FORGET(codec->component_buffer)); - UDS_FREE(codec->encoded_super_block); -} - -/**********************************************************************/ -int encode_vdo_super_block(struct super_block_codec *codec) -{ - size_t component_data_size; - struct header header = SUPER_BLOCK_HEADER_12_0; - crc32_checksum_t checksum; - struct buffer *buffer = codec->block_buffer; - int result = reset_buffer_end(buffer, 0); - if (result != VDO_SUCCESS) { - return result; - } - - component_data_size = content_length(codec->component_buffer); - - // Encode the header. - header.size += component_data_size; - result = encode_vdo_header(&header, buffer); - if (result != UDS_SUCCESS) { - return result; - } - - // Copy the already-encoded component data. - result = put_bytes(buffer, component_data_size, - get_buffer_contents(codec->component_buffer)); - if (result != UDS_SUCCESS) { - return result; - } - - // Compute and encode the checksum. - checksum = vdo_update_crc32(VDO_INITIAL_CHECKSUM, - codec->encoded_super_block, - content_length(buffer)); - result = put_uint32_le_into_buffer(buffer, checksum); - if (result != UDS_SUCCESS) { - return result; - } - - return UDS_SUCCESS; -} - -/**********************************************************************/ -int decode_vdo_super_block(struct super_block_codec *codec) -{ - struct header header; - int result; - size_t component_data_size; - crc32_checksum_t checksum, saved_checksum; - - // Reset the block buffer to start decoding the entire first sector. - struct buffer *buffer = codec->block_buffer; - clear_buffer(buffer); - - // Decode and validate the header. - result = decode_vdo_header(buffer, &header); - if (result != VDO_SUCCESS) { - return result; - } - - result = validate_vdo_header(&SUPER_BLOCK_HEADER_12_0, &header, false, - __func__); - if (result != VDO_SUCCESS) { - return result; - } - - if (header.size > content_length(buffer)) { - // We can't check release version or checksum until we know the - // content size, so we have to assume a version mismatch on - // unexpected values. - return uds_log_error_strerror(VDO_UNSUPPORTED_VERSION, - "super block contents too large: %zu", - header.size); - } - - // Restrict the buffer to the actual payload bytes that remain. - result = - reset_buffer_end(buffer, uncompacted_amount(buffer) + header.size); - if (result != VDO_SUCCESS) { - return result; - } - - // The component data is all the rest, except for the checksum. 
- component_data_size = - content_length(buffer) - sizeof(crc32_checksum_t); - result = put_buffer(codec->component_buffer, buffer, - component_data_size); - if (result != VDO_SUCCESS) { - return result; - } - - // Checksum everything up to but not including the saved checksum - // itself. - checksum = vdo_update_crc32(VDO_INITIAL_CHECKSUM, - codec->encoded_super_block, - uncompacted_amount(buffer)); - - // Decode and verify the saved checksum. - result = get_uint32_le_from_buffer(buffer, &saved_checksum); - if (result != VDO_SUCCESS) { - return result; - } - - result = ASSERT(content_length(buffer) == 0, - "must have decoded entire superblock payload"); - if (result != VDO_SUCCESS) { - return result; - } - - return ((checksum != saved_checksum) ? VDO_CHECKSUM_MISMATCH - : VDO_SUCCESS); -} - -/**********************************************************************/ -size_t get_vdo_super_block_fixed_size(void) -{ - return SUPER_BLOCK_FIXED_SIZE; -} diff --git a/vdo/superBlockCodec.h b/vdo/superBlockCodec.h deleted file mode 100644 index cbdc0132..00000000 --- a/vdo/superBlockCodec.h +++ /dev/null @@ -1,88 +0,0 @@ -/* - * Copyright Red Hat - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/sulfur/src/c++/vdo/base/superBlockCodec.h#8 $ - */ - -#ifndef SUPER_BLOCK_CODEC_H -#define SUPER_BLOCK_CODEC_H - -#include "buffer.h" - -#include "header.h" -#include "types.h" - -/* - * The machinery for encoding and decoding super blocks. - */ -struct super_block_codec { - /** The buffer for encoding and decoding component data */ - struct buffer *component_buffer; - /** - * A sector-sized buffer wrapping the first sector of - * encoded_super_block, for encoding and decoding the entire super - * block. - **/ - struct buffer *block_buffer; - /** A 1-block buffer holding the encoded on-disk super block */ - byte *encoded_super_block; -}; - -/** - * Initialize a super block codec. - * - * @param codec The codec to initialize - * - * @return VDO_SUCCESS or an error - **/ -int __must_check initialize_vdo_super_block_codec(struct super_block_codec *codec); - -/** - * Free resources in a super block codec. - * - * @param codec The codec to clean up - **/ -void destroy_vdo_super_block_codec(struct super_block_codec *codec); - -/** - * Encode a super block into its on-disk representation. - * - * @param codec The super block codec - * - * @return VDO_SUCCESS or an error - **/ -int __must_check encode_vdo_super_block(struct super_block_codec *codec); - -/** - * Decode a super block from its on-disk representation. - * - * @param codec The super block to decode - * - * @return VDO_SUCCESS or an error - **/ -int __must_check decode_vdo_super_block(struct super_block_codec *codec); - -/** - * Get the encoded size of the fixed (non-component data) portion of a super - * block (this is for unit testing). 
- * - * @return The encoded size of the fixed portion of the super block - **/ -size_t __must_check get_vdo_super_block_fixed_size(void); - -#endif // SUPER_BLOCK_CODEC_H diff --git a/vdo/sync-completion.c b/vdo/sync-completion.c new file mode 100644 index 00000000..da2e6f16 --- /dev/null +++ b/vdo/sync-completion.c @@ -0,0 +1,80 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright Red Hat + */ + +#include "sync-completion.h" + +#include + +#include "completion.h" + +struct sync_completion { + struct vdo_completion vdo_completion; + struct completion completion; + vdo_action *action; +}; + +/** + * as_sync_completion() - Convert a vdo_completion to a sync completion. + * @completion: The completion to convert. + * + * Return: The completion as a sync completion. + */ +static inline struct sync_completion * __must_check +as_sync_completion(struct vdo_completion *completion) +{ + vdo_assert_completion_type(completion->type, VDO_SYNC_COMPLETION); + return container_of(completion, + struct sync_completion, + vdo_completion); +} + +/** + * complete_synchronous_action() - A vdo_action to signal the waiting thread + * that a synchronous action is complete. + * @completion: The sync completion. + * + * This callback is registered in run_synchrnous_action(). + */ +static void complete_synchronous_action(struct vdo_completion *completion) +{ + complete(&(as_sync_completion(completion)->completion)); +} + +/** + * run_synchronous_action() - A vdo_action to perform a synchronous action + * registered in vdo_perform_synchronous_action(). + * @completion: The sync completion. + */ +static void run_synchronous_action(struct vdo_completion *completion) +{ + completion->callback = complete_synchronous_action; + as_sync_completion(completion)->action(completion); +} + +/** + * vdo_perform_synchronous_action() - Launch an action on a VDO thread and + * wait for it to complete. + * @vdo: The vdo. + * @action: The callback to launch. + * @thread_id: The thread on which to run the action. + * @parent: The parent of the sync completion (may be NULL). + */ +int vdo_perform_synchronous_action(struct vdo *vdo, + vdo_action *action, + thread_id_t thread_id, + void *parent) +{ + struct sync_completion sync; + + vdo_initialize_completion(&sync.vdo_completion, vdo, VDO_SYNC_COMPLETION); + init_completion(&sync.completion); + sync.action = action; + vdo_launch_completion_callback_with_parent(&sync.vdo_completion, + run_synchronous_action, + thread_id, + parent); + wait_for_completion(&sync.completion); + return sync.vdo_completion.result; +} diff --git a/vdo/sync-completion.h b/vdo/sync-completion.h new file mode 100644 index 00000000..c8a52a42 --- /dev/null +++ b/vdo/sync-completion.h @@ -0,0 +1,17 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright Red Hat + */ + +#ifndef SYNC_COMPLETION_H +#define SYNC_COMPLETION_H + +#include "completion.h" +#include "types.h" + +int vdo_perform_synchronous_action(struct vdo *vdo, + vdo_action * action, + thread_id_t thread_id, + void *parent); + +#endif /* SYNC_COMPLETION_H */ diff --git a/vdo/syncCompletion.c b/vdo/syncCompletion.c deleted file mode 100644 index c6e33ef7..00000000 --- a/vdo/syncCompletion.c +++ /dev/null @@ -1,90 +0,0 @@ -/* - * Copyright Red Hat - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. 
- * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/sulfur/src/c++/vdo/base/syncCompletion.c#8 $ - */ - -#include "syncCompletion.h" - -#include - -#include "completion.h" - -struct sync_completion { - struct vdo_completion vdo_completion; - struct completion completion; - vdo_action *action; -}; - -/** - * Convert a vdo_completion to a sync completion. - * - * @param completion The completion to convert - * - * @return The completion as a sync completion. - **/ -static inline struct sync_completion * __must_check -as_sync_completion(struct vdo_completion *completion) -{ - assert_vdo_completion_type(completion->type, VDO_SYNC_COMPLETION); - return container_of(completion, - struct sync_completion, - vdo_completion); -} - -/** - * A vdo_action to signal the waiting thread that a synchronous action is - * complete. This callback is registered in run_synchrnous_action(). - * - * @param completion The sync completion - **/ -static void complete_synchronous_action(struct vdo_completion *completion) -{ - complete(&(as_sync_completion(completion)->completion)); -} - -/** - * A vdo_action to perform a synchronous action registered in - * perform_synchronous_vdo_action(). - * - * @param completion The sync completion - **/ -static void run_synchronous_action(struct vdo_completion *completion) -{ - completion->callback = complete_synchronous_action; - as_sync_completion(completion)->action(completion); -} - -/**********************************************************************/ -int perform_synchronous_vdo_action(struct vdo *vdo, - vdo_action *action, - thread_id_t thread_id, - void *parent) -{ - struct sync_completion sync; - - initialize_vdo_completion(&sync.vdo_completion, vdo, VDO_SYNC_COMPLETION); - init_completion(&sync.completion); - sync.action = action; - launch_vdo_completion_callback_with_parent(&sync.vdo_completion, - run_synchronous_action, - thread_id, - parent); - wait_for_completion(&sync.completion); - return sync.vdo_completion.result; -} diff --git a/vdo/syncCompletion.h b/vdo/syncCompletion.h deleted file mode 100644 index 1c9093a5..00000000 --- a/vdo/syncCompletion.h +++ /dev/null @@ -1,41 +0,0 @@ -/* - * Copyright Red Hat - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. 
- * - * $Id: //eng/vdo-releases/sulfur/src/c++/vdo/base/syncCompletion.h#7 $ - */ - -#ifndef SYNC_COMPLETION_H -#define SYNC_COMPLETION_H - -#include "completion.h" -#include "types.h" - -/** - * Launch an action on a VDO thread and wait for it to complete. - * - * @param vdo The vdo - * @param action The callback to launch - * @param thread_id The thread on which to run the action - * @param parent The parent of the sync completion (may be NULL) - **/ -int perform_synchronous_vdo_action(struct vdo *vdo, - vdo_action *action, - thread_id_t thread_id, - void *parent); - -#endif // SYNC_COMPLETION_H diff --git a/vdo/sysfs.c b/vdo/sysfs.c index 732ffe85..415f51ab 100644 --- a/vdo/sysfs.c +++ b/vdo/sysfs.c @@ -1,60 +1,29 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * Copyright Red Hat - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/sulfur/src/c++/vdo/kernel/sysfs.c#13 $ */ -#include "sysfs.h" - #include -#include "dedupeIndex.h" -#include "dmvdo.h" #include "logger.h" -#include "vdoInit.h" - -static char *status_strings[] = { - "UNINITIALIZED", - "READY", - "SHUTTING DOWN", -}; -/**********************************************************************/ -static int vdo_status_show(char *buf, - const struct kernel_param *kp) -{ - return sprintf(buf, "%s\n", status_strings[vdo_module_status]); -} +#include "constants.h" +#include "dedupe-index.h" +#include "vdo.h" -/**********************************************************************/ static int vdo_log_level_show(char *buf, const struct kernel_param *kp) { return sprintf(buf, "%s\n", uds_log_priority_to_string(get_uds_log_level())); } -/**********************************************************************/ static int vdo_log_level_store(const char *buf, const struct kernel_param *kp) { static char internal_buf[11]; int n = strlen(buf); + if (n > 10) { return -EINVAL; } @@ -69,34 +38,30 @@ static int vdo_log_level_store(const char *buf, } -/**********************************************************************/ static int vdo_dedupe_timeout_interval_store(const char *buf, const struct kernel_param *kp) { int result = param_set_uint(buf, kp); + if (result != 0) { return result; } - set_vdo_dedupe_index_timeout_interval(*(uint *)kp->arg); + vdo_set_dedupe_index_timeout_interval(*(uint *)kp->arg); return 0; } -/**********************************************************************/ static int vdo_min_dedupe_timer_interval_store(const char *buf, const struct kernel_param *kp) { int result = param_set_uint(buf, kp); + if (result != 0) { return result; } - set_vdo_dedupe_index_min_timer_interval(*(uint *)kp->arg); + vdo_set_dedupe_index_min_timer_interval(*(uint *)kp->arg); return 0; } -static const struct kernel_param_ops status_ops = { - .get = vdo_status_show, -}; - static const struct kernel_param_ops log_level_ops = { .set = 
vdo_log_level_store, .get = vdo_log_level_show, @@ -113,8 +78,6 @@ static const struct kernel_param_ops dedupe_timer_ops = { .get = param_get_uint, }; -module_param_cb(status, &status_ops, NULL, 0444); - module_param_cb(log_level, &log_level_ops, NULL, 0644); diff --git a/vdo/thread-cond-var.c b/vdo/thread-cond-var.c new file mode 100644 index 00000000..3f8d7f2a --- /dev/null +++ b/vdo/thread-cond-var.c @@ -0,0 +1,56 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright Red Hat + */ + +#include "errors.h" +#include "time-utils.h" +#include "uds-threads.h" + +int uds_init_cond(struct cond_var *cv) +{ + cv->event_count = NULL; + return make_event_count(&cv->event_count); +} + +int uds_signal_cond(struct cond_var *cv) +{ + event_count_broadcast(cv->event_count); + return UDS_SUCCESS; +} + +int uds_broadcast_cond(struct cond_var *cv) +{ + event_count_broadcast(cv->event_count); + return UDS_SUCCESS; +} + +int uds_wait_cond(struct cond_var *cv, struct mutex *mutex) +{ + event_token_t token = event_count_prepare(cv->event_count); + + uds_unlock_mutex(mutex); + event_count_wait(cv->event_count, token, NULL); + uds_lock_mutex(mutex); + return UDS_SUCCESS; +} + +int uds_timed_wait_cond(struct cond_var *cv, + struct mutex *mutex, + ktime_t timeout) +{ + bool happened; + event_token_t token = event_count_prepare(cv->event_count); + + uds_unlock_mutex(mutex); + happened = event_count_wait(cv->event_count, token, &timeout); + uds_lock_mutex(mutex); + return happened ? UDS_SUCCESS : ETIMEDOUT; +} + +int uds_destroy_cond(struct cond_var *cv) +{ + free_event_count(cv->event_count); + cv->event_count = NULL; + return UDS_SUCCESS; +} diff --git a/vdo/thread-config.c b/vdo/thread-config.c new file mode 100644 index 00000000..cf430eb0 --- /dev/null +++ b/vdo/thread-config.c @@ -0,0 +1,280 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright Red Hat + */ + +#include "thread-config.h" + + +#include "logger.h" +#include "memory-alloc.h" +#include "permassert.h" + +#include "device-config.h" +#include "kernel-types.h" +#include "status-codes.h" +#include "types.h" + +static int allocate_thread_config(zone_count_t logical_zone_count, + zone_count_t physical_zone_count, + zone_count_t hash_zone_count, + zone_count_t bio_thread_count, + struct thread_config **config_ptr) +{ + struct thread_config *config; + int result = + UDS_ALLOCATE(1, struct thread_config, "thread config", &config); + if (result != VDO_SUCCESS) { + return result; + } + + result = UDS_ALLOCATE(logical_zone_count, + thread_id_t, + "logical thread array", + &config->logical_threads); + if (result != VDO_SUCCESS) { + vdo_free_thread_config(config); + return result; + } + + result = UDS_ALLOCATE(physical_zone_count, + thread_id_t, + "physical thread array", + &config->physical_threads); + if (result != VDO_SUCCESS) { + vdo_free_thread_config(config); + return result; + } + + result = UDS_ALLOCATE(hash_zone_count, + thread_id_t, + "hash thread array", + &config->hash_zone_threads); + if (result != VDO_SUCCESS) { + vdo_free_thread_config(config); + return result; + } + + result = UDS_ALLOCATE(bio_thread_count, + thread_id_t, + "bio thread array", + &config->bio_threads); + if (result != VDO_SUCCESS) { + vdo_free_thread_config(config); + return result; + } + + config->logical_zone_count = logical_zone_count; + config->physical_zone_count = physical_zone_count; + config->hash_zone_count = hash_zone_count; + config->bio_thread_count = bio_thread_count; + + *config_ptr = config; + return VDO_SUCCESS; +} + +static void 
assign_thread_ids(struct thread_config *config, + thread_id_t thread_ids[], + zone_count_t count) +{ + zone_count_t zone; + + for (zone = 0; zone < count; zone++) { + thread_ids[zone] = config->thread_count++; + } +} + +/** + * vdo_make_thread_config() - Make a thread configuration. + * @counts: The counts of each type of thread. + * @config_ptr: A pointer to hold the new thread configuration. + * + * If the logical, physical, and hash zone counts are all 0, a single + * thread will be shared by all three plus the packer and recovery + * journal. Otherwise, there must be at least one of each type, and + * each will have its own thread, as will the packer and recovery + * journal. + * + * Return: VDO_SUCCESS or an error. + */ +int vdo_make_thread_config(struct thread_count_config counts, + struct thread_config **config_ptr) +{ + int result; + struct thread_config *config; + + if ((counts.logical_zones + + counts.physical_zones + + counts.hash_zones) == 0) { + result = allocate_thread_config(1, + 1, + 1, + counts.bio_threads, + &config); + if (result != VDO_SUCCESS) { + return result; + } + + config->logical_threads[0] = config->thread_count; + config->physical_threads[0] = config->thread_count; + config->hash_zone_threads[0] = config->thread_count++; + } else { + result = allocate_thread_config(counts.logical_zones, + counts.physical_zones, + counts.hash_zones, + counts.bio_threads, + &config); + if (result != VDO_SUCCESS) { + return result; + } + + config->admin_thread = config->thread_count; + config->journal_thread = config->thread_count++; + config->packer_thread = config->thread_count++; + assign_thread_ids(config, + config->logical_threads, + counts.logical_zones); + assign_thread_ids(config, + config->physical_threads, + counts.physical_zones); + assign_thread_ids(config, + config->hash_zone_threads, + counts.hash_zones); + } + + config->dedupe_thread = config->thread_count++; + config->bio_ack_thread = ((counts.bio_ack_threads > 0) ? + config->thread_count++ + : VDO_INVALID_THREAD_ID); + config->cpu_thread = config->thread_count++; + assign_thread_ids(config, config->bio_threads, counts.bio_threads); + + *config_ptr = config; + return VDO_SUCCESS; +} + +/** + * vdo_free_thread_config() - Destroy a thread configuration. + * @config: The thread configuration to destroy. + */ +void vdo_free_thread_config(struct thread_config *config) +{ + if (config == NULL) { + return; + } + + UDS_FREE(UDS_FORGET(config->logical_threads)); + UDS_FREE(UDS_FORGET(config->physical_threads)); + UDS_FREE(UDS_FORGET(config->hash_zone_threads)); + UDS_FREE(UDS_FORGET(config->bio_threads)); + UDS_FREE(config); +} + +static bool get_zone_thread_name(const thread_id_t thread_ids[], + zone_count_t count, + thread_id_t id, + const char *prefix, + char *buffer, + size_t buffer_length) +{ + if (id >= thread_ids[0]) { + thread_id_t index = id - thread_ids[0]; + + if (index < count) { + snprintf(buffer, buffer_length, "%s%d", prefix, index); + return true; + } + } + return false; +} + +/** + * vdo_get_thread_name() - Format the name of the worker thread + * desired to support a given work queue. + * @thread_config: The thread configuration. + * @thread_id: The thread id. + * @buffer: Where to put the formatted name. + * @buffer_length: Size of the output buffer. + * + * The physical layer may add a prefix identifying the product; the + * output from this function should just identify the thread. 
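+ *
+ * Illustrative example (a sketch with an assumed 16-byte name buffer):
+ *
+ *	char name[16];
+ *	vdo_get_thread_name(thread_config, thread_id, name, sizeof(name));
+ *
+ * This yields "reqQ" for the single-thread configuration, and otherwise
+ * names such as "journalQ", "packerQ", "dedupeQ", "ackQ", "cpuQ", "logQ0",
+ * "physQ1", or "bioQ2", falling back to "reqQ<id>" for an unexpected
+ * thread id.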
+ */ +void vdo_get_thread_name(const struct thread_config *thread_config, + thread_id_t thread_id, + char *buffer, + size_t buffer_length) +{ + if (thread_id == thread_config->journal_thread) { + if (thread_config->packer_thread == thread_id) { + /* + * This is the "single thread" config where one thread + * is used for the journal, packer, logical, physical, + * and hash zones. In that case, it is known as the + * "request queue." + */ + snprintf(buffer, buffer_length, "reqQ"); + return; + } + + snprintf(buffer, buffer_length, "journalQ"); + return; + } else if (thread_id == thread_config->admin_thread) { + /* + * Theoretically this could be different from the journal + * thread. + */ + snprintf(buffer, buffer_length, "adminQ"); + return; + } else if (thread_id == thread_config->packer_thread) { + snprintf(buffer, buffer_length, "packerQ"); + return; + } else if (thread_id == thread_config->dedupe_thread) { + snprintf(buffer, buffer_length, "dedupeQ"); + return; + } else if (thread_id == thread_config->bio_ack_thread) { + snprintf(buffer, buffer_length, "ackQ"); + return; + } else if (thread_id == thread_config->cpu_thread) { + snprintf(buffer, buffer_length, "cpuQ"); + return; + } + + if (get_zone_thread_name(thread_config->logical_threads, + thread_config->logical_zone_count, + thread_id, + "logQ", + buffer, + buffer_length)) { + return; + } + + if (get_zone_thread_name(thread_config->physical_threads, + thread_config->physical_zone_count, + thread_id, + "physQ", + buffer, + buffer_length)) { + return; + } + + if (get_zone_thread_name(thread_config->hash_zone_threads, + thread_config->hash_zone_count, + thread_id, + "hashQ", + buffer, + buffer_length)) { + return; + } + + if (get_zone_thread_name(thread_config->bio_threads, + thread_config->bio_thread_count, + thread_id, + "bioQ", + buffer, + buffer_length)) { + return; + } + + /* Some sort of misconfiguration? */ + snprintf(buffer, buffer_length, "reqQ%d", thread_id); +} diff --git a/vdo/thread-config.h b/vdo/thread-config.h new file mode 100644 index 00000000..da14b267 --- /dev/null +++ b/vdo/thread-config.h @@ -0,0 +1,92 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright Red Hat + */ + +#ifndef THREAD_CONFIG_H +#define THREAD_CONFIG_H + +#include "permassert.h" + +#include "kernel-types.h" +#include "types.h" + +struct thread_config { + zone_count_t logical_zone_count; + zone_count_t physical_zone_count; + zone_count_t hash_zone_count; + thread_count_t bio_thread_count; + thread_count_t thread_count; + thread_id_t admin_thread; + thread_id_t journal_thread; + thread_id_t packer_thread; + thread_id_t dedupe_thread; + thread_id_t bio_ack_thread; + thread_id_t cpu_thread; + thread_id_t *logical_threads; + thread_id_t *physical_threads; + thread_id_t *hash_zone_threads; + thread_id_t *bio_threads; +}; + +int __must_check +vdo_make_thread_config(struct thread_count_config counts, + struct thread_config **config_ptr); + +void vdo_free_thread_config(struct thread_config *config); + +/** + * vdo_get_logical_zone_thread() - Get the thread id for a given logical zone. + * @thread_config: The thread config. + * @logical_zone: The number of the logical zone. + * + * Return: The thread id for the given zone. 
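+ *
+ * For example (illustrative only), work destined for logical zone 2 runs
+ * on the thread returned by
+ *
+ *	vdo_get_logical_zone_thread(thread_config, 2);
+ *
+ * which is simply thread_config->logical_threads[2].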
+ */ +static inline thread_id_t __must_check +vdo_get_logical_zone_thread(const struct thread_config *thread_config, + zone_count_t logical_zone) +{ + ASSERT_LOG_ONLY((logical_zone <= thread_config->logical_zone_count), + "logical zone valid"); + return thread_config->logical_threads[logical_zone]; +} + +/** + * vdo_get_physical_zone_thread() - Get the thread id for a given physical + * zone. + * @thread_config: The thread config. + * @physical_zone: The number of the physical zone. + * + * Return: The thread id for the given zone. + */ +static inline thread_id_t __must_check +vdo_get_physical_zone_thread(const struct thread_config *thread_config, + zone_count_t physical_zone) +{ + ASSERT_LOG_ONLY((physical_zone <= thread_config->physical_zone_count), + "physical zone valid"); + return thread_config->physical_threads[physical_zone]; +} + +/** + * vdo_get_hash_zone_thread() - Get the thread id for a given hash zone. + * @thread_config: The thread config. + * @hash_zone: The number of the hash zone. + * + * Return: The thread id for the given zone. + */ +static inline thread_id_t __must_check +vdo_get_hash_zone_thread(const struct thread_config *thread_config, + zone_count_t hash_zone) +{ + ASSERT_LOG_ONLY((hash_zone <= thread_config->hash_zone_count), + "hash zone valid"); + return thread_config->hash_zone_threads[hash_zone]; +} + +void vdo_get_thread_name(const struct thread_config *thread_config, + thread_id_t thread_id, + char *buffer, + size_t buffer_length); + +#endif /* THREAD_CONFIG_H */ diff --git a/vdo/thread-device.c b/vdo/thread-device.c new file mode 100644 index 00000000..35db517f --- /dev/null +++ b/vdo/thread-device.c @@ -0,0 +1,41 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright Red Hat + */ + +#include "thread-device.h" + +#include "thread-registry.h" + +/* A registry of threads associated with device id numbers. */ +static struct thread_registry device_id_thread_registry; + +/* + * Associate the current thread with a device id number for logging. + * + * A registered thread must be unregistered via + * unregister_thread_device_id. + */ +void uds_register_thread_device_id(struct registered_thread *new_thread, + unsigned int *id_ptr) +{ + uds_register_thread(&device_id_thread_registry, new_thread, id_ptr); +} + +void uds_unregister_thread_device_id(void) +{ + uds_unregister_thread(&device_id_thread_registry); +} + +int uds_get_thread_device_id(void) +{ + const unsigned int *pointer = + uds_lookup_thread(&device_id_thread_registry); + + return pointer ? 
*pointer : -1; +} + +void uds_initialize_thread_device_registry(void) +{ + uds_initialize_thread_registry(&device_id_thread_registry); +} diff --git a/vdo/thread-device.h b/vdo/thread-device.h new file mode 100644 index 00000000..264876f4 --- /dev/null +++ b/vdo/thread-device.h @@ -0,0 +1,20 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright Red Hat + */ + +#ifndef UDS_THREAD_DEVICE_H +#define UDS_THREAD_DEVICE_H + +#include "thread-registry.h" + +void uds_register_thread_device_id(struct registered_thread *new_thread, + unsigned int *id_ptr); + +void uds_unregister_thread_device_id(void); + +int uds_get_thread_device_id(void); + +void uds_initialize_thread_device_registry(void); + +#endif /* UDS_THREAD_DEVICE_H */ diff --git a/uds/threadRegistry.c b/vdo/thread-registry.c similarity index 57% rename from uds/threadRegistry.c rename to vdo/thread-registry.c index fa666bc4..5483694b 100644 --- a/uds/threadRegistry.c +++ b/vdo/thread-registry.c @@ -1,25 +1,9 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * Copyright Red Hat - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/uds-releases/krusty/kernelLinux/uds/threadRegistry.c#10 $ */ -#include "threadRegistry.h" +#include "thread-registry.h" #include @@ -27,18 +11,17 @@ /* * We need to be careful when using other facilities that may use - * threadRegistry functions in their normal operation. For example, + * thread registry functions in their normal operation. For example, * we do not want to invoke the logger while holding a lock. */ -/**********************************************************************/ void uds_initialize_thread_registry(struct thread_registry *registry) { INIT_LIST_HEAD(®istry->links); spin_lock_init(®istry->lock); } -/**********************************************************************/ +/* Register the current thread and associate it with a data pointer. */ void uds_register_thread(struct thread_registry *registry, struct registered_thread *new_thread, const void *pointer) @@ -53,8 +36,10 @@ void uds_register_thread(struct thread_registry *registry, spin_lock(®istry->lock); list_for_each_entry(thread, ®istry->links, links) { if (thread->task == current) { - // This should not have been there. - // We'll complain after releasing the lock. + /* + * This should not have been there. + * We'll complain after releasing the lock. + */ list_del_rcu(&thread->links); found_it = true; break; @@ -65,13 +50,12 @@ void uds_register_thread(struct thread_registry *registry, ASSERT_LOG_ONLY(!found_it, "new thread not already in registry"); if (found_it) { - // Ensure no RCU iterators see it before re-initializing. + /* Ensure no RCU iterators see it before re-initializing. 
*/ synchronize_rcu(); INIT_LIST_HEAD(&thread->links); } } -/**********************************************************************/ void uds_unregister_thread(struct thread_registry *registry) { struct registered_thread *thread; @@ -89,13 +73,12 @@ void uds_unregister_thread(struct thread_registry *registry) ASSERT_LOG_ONLY(found_it, "thread found in registry"); if (found_it) { - // Ensure no RCU iterators see it before re-initializing. + /* Ensure no RCU iterators see it before re-initializing. */ synchronize_rcu(); INIT_LIST_HEAD(&thread->links); } } -/**********************************************************************/ const void *uds_lookup_thread(struct thread_registry *registry) { struct registered_thread *thread; diff --git a/vdo/thread-registry.h b/vdo/thread-registry.h new file mode 100644 index 00000000..6fb78e44 --- /dev/null +++ b/vdo/thread-registry.h @@ -0,0 +1,33 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright Red Hat + */ + +#ifndef THREAD_REGISTRY_H +#define THREAD_REGISTRY_H + +#include +#include + +struct thread_registry { + struct list_head links; + spinlock_t lock; +}; + +struct registered_thread { + struct list_head links; + const void *pointer; + struct task_struct *task; +}; + +void uds_initialize_thread_registry(struct thread_registry *registry); + +void uds_register_thread(struct thread_registry *registry, + struct registered_thread *new_thread, + const void *pointer); + +void uds_unregister_thread(struct thread_registry *registry); + +const void *uds_lookup_thread(struct thread_registry *registry); + +#endif /* THREAD_REGISTRY_H */ diff --git a/vdo/threadConfig.c b/vdo/threadConfig.c deleted file mode 100644 index fbde440f..00000000 --- a/vdo/threadConfig.c +++ /dev/null @@ -1,243 +0,0 @@ -/* - * Copyright Red Hat - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. 
- * - * $Id: //eng/vdo-releases/sulfur/src/c++/vdo/base/threadConfig.c#20 $ - */ - -#include "threadConfig.h" - -#include "logger.h" -#include "memoryAlloc.h" -#include "permassert.h" - -#include "constants.h" -#include "statusCodes.h" -#include "types.h" - -/**********************************************************************/ -static int allocate_thread_config(zone_count_t logical_zone_count, - zone_count_t physical_zone_count, - zone_count_t hash_zone_count, - zone_count_t base_thread_count, - struct thread_config **config_ptr) -{ - struct thread_config *config; - int result = - UDS_ALLOCATE(1, struct thread_config, "thread config", &config); - if (result != VDO_SUCCESS) { - return result; - } - - result = UDS_ALLOCATE(logical_zone_count, - thread_id_t, - "logical thread array", - &config->logical_threads); - if (result != VDO_SUCCESS) { - free_vdo_thread_config(config); - return result; - } - - result = UDS_ALLOCATE(physical_zone_count, - thread_id_t, - "physical thread array", - &config->physical_threads); - if (result != VDO_SUCCESS) { - free_vdo_thread_config(config); - return result; - } - - result = UDS_ALLOCATE(hash_zone_count, - thread_id_t, - "hash thread array", - &config->hash_zone_threads); - if (result != VDO_SUCCESS) { - free_vdo_thread_config(config); - return result; - } - - config->logical_zone_count = logical_zone_count; - config->physical_zone_count = physical_zone_count; - config->hash_zone_count = hash_zone_count; - config->base_thread_count = base_thread_count; - - *config_ptr = config; - return VDO_SUCCESS; -} - -/**********************************************************************/ -static void -assign_thread_ids(thread_id_t thread_ids[], zone_count_t count, - thread_id_t *id_ptr) -{ - zone_count_t zone; - for (zone = 0; zone < count; zone++) { - thread_ids[zone] = (*id_ptr)++; - } -} - -/**********************************************************************/ -int make_vdo_thread_config(zone_count_t logical_zone_count, - zone_count_t physical_zone_count, - zone_count_t hash_zone_count, - struct thread_config **config_ptr) -{ - struct thread_config *config; - thread_count_t total; - int result; - thread_id_t id = 0; - - if ((logical_zone_count == 0) && (physical_zone_count == 0) && - (hash_zone_count == 0)) { - return vdo_make_one_thread_config(config_ptr); - } - - if (physical_zone_count > MAX_VDO_PHYSICAL_ZONES) { - return uds_log_error_strerror(VDO_BAD_CONFIGURATION, - "Physical zone count %u exceeds maximum (%u)", - physical_zone_count, - MAX_VDO_PHYSICAL_ZONES); - } - - if (logical_zone_count > MAX_VDO_LOGICAL_ZONES) { - return uds_log_error_strerror(VDO_BAD_CONFIGURATION, - "Logical zone count %u exceeds maximum (%u)", - logical_zone_count, - MAX_VDO_LOGICAL_ZONES); - } - - total = logical_zone_count + physical_zone_count + hash_zone_count + 2; - result = allocate_thread_config(logical_zone_count, - physical_zone_count, - hash_zone_count, - total, - &config); - if (result != VDO_SUCCESS) { - return result; - } - - config->admin_thread = id; - config->journal_thread = id++; - config->packer_thread = id++; - assign_thread_ids(config->logical_threads, logical_zone_count, &id); - assign_thread_ids(config->physical_threads, physical_zone_count, &id); - assign_thread_ids(config->hash_zone_threads, hash_zone_count, &id); - - ASSERT_LOG_ONLY(id == total, "correct number of thread IDs assigned"); - - *config_ptr = config; - return VDO_SUCCESS; -} - -/**********************************************************************/ -int vdo_make_one_thread_config(struct 
thread_config **config_ptr) -{ - struct thread_config *config; - int result = allocate_thread_config(1, 1, 1, 1, &config); - if (result != VDO_SUCCESS) { - return result; - } - - config->logical_threads[0] = 0; - config->physical_threads[0] = 0; - config->hash_zone_threads[0] = 0; - *config_ptr = config; - return VDO_SUCCESS; -} - -/**********************************************************************/ -void free_vdo_thread_config(struct thread_config *config) -{ - if (config == NULL) { - return; - } - - UDS_FREE(UDS_FORGET(config->logical_threads)); - UDS_FREE(UDS_FORGET(config->physical_threads)); - UDS_FREE(UDS_FORGET(config->hash_zone_threads)); - UDS_FREE(config); -} - -/**********************************************************************/ -static bool get_zone_thread_name(const thread_id_t thread_ids[], - zone_count_t count, - thread_id_t id, - const char *prefix, - char *buffer, - size_t buffer_length) -{ - if (id >= thread_ids[0]) { - thread_id_t index = id - thread_ids[0]; - if (index < count) { - snprintf(buffer, buffer_length, "%s%d", prefix, index); - return true; - } - } - return false; -} - -/**********************************************************************/ -void vdo_get_thread_name(const struct thread_config *thread_config, - thread_id_t thread_id, - char *buffer, - size_t buffer_length) -{ - if (thread_config->base_thread_count == 1) { - // Historically this was the "request queue" thread. - snprintf(buffer, buffer_length, "reqQ"); - return; - } - if (thread_id == thread_config->journal_thread) { - snprintf(buffer, buffer_length, "journalQ"); - return; - } else if (thread_id == thread_config->admin_thread) { - // Theoretically this could be different from the journal - // thread. - snprintf(buffer, buffer_length, "adminQ"); - return; - } else if (thread_id == thread_config->packer_thread) { - snprintf(buffer, buffer_length, "packerQ"); - return; - } - if (get_zone_thread_name(thread_config->logical_threads, - thread_config->logical_zone_count, - thread_id, - "logQ", - buffer, - buffer_length)) { - return; - } - if (get_zone_thread_name(thread_config->physical_threads, - thread_config->physical_zone_count, - thread_id, - "physQ", - buffer, - buffer_length)) { - return; - } - if (get_zone_thread_name(thread_config->hash_zone_threads, - thread_config->hash_zone_count, - thread_id, - "hashQ", - buffer, - buffer_length)) { - return; - } - - // Some sort of misconfiguration? - snprintf(buffer, buffer_length, "reqQ%d", thread_id); -} diff --git a/vdo/threadConfig.h b/vdo/threadConfig.h deleted file mode 100644 index 9b570117..00000000 --- a/vdo/threadConfig.h +++ /dev/null @@ -1,143 +0,0 @@ -/* - * Copyright Red Hat - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. 
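For reference while reviewing the deletion above: make_vdo_thread_config() numbered base-code threads sequentially, with the admin and journal threads sharing ID 0, the packer next, and then one ID per logical, physical, and hash zone. A worked example of the interface being removed, with zone counts chosen arbitrarily::

    struct thread_config *config;
    char name[16];

    /* 3 logical zones, 2 physical zones, 1 hash zone. */
    if (make_vdo_thread_config(3, 2, 1, &config) != VDO_SUCCESS) {
            return;
    }

    /*
     * base_thread_count = 3 + 2 + 1 + 2 = 8:
     *   ID 0    -> "journalQ" (also the admin thread)
     *   ID 1    -> "packerQ"
     *   IDs 2-4 -> "logQ0" .. "logQ2"
     *   IDs 5-6 -> "physQ0" .. "physQ1"
     *   ID 7    -> "hashQ0"
     */
    vdo_get_thread_name(config, 2, name, sizeof(name)); /* yields "logQ0" */

    free_vdo_thread_config(config);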
- * - * $Id: //eng/vdo-releases/sulfur/src/c++/vdo/base/threadConfig.h#13 $ - */ - -#ifndef THREAD_CONFIG_H -#define THREAD_CONFIG_H - -#include "permassert.h" - -#include "types.h" - -struct thread_config { - zone_count_t logical_zone_count; - zone_count_t physical_zone_count; - zone_count_t hash_zone_count; - thread_count_t base_thread_count; - thread_id_t admin_thread; - thread_id_t journal_thread; - thread_id_t packer_thread; - thread_id_t *logical_threads; - thread_id_t *physical_threads; - thread_id_t *hash_zone_threads; -}; - -/** - * Make a thread configuration. If both the logical zone count and the - * physical zone count are set to 0, a one thread configuration will be - * made. - * - * @param [in] logical_zone_count The number of logical zones - * @param [in] physical_zone_count The number of physical zones - * @param [in] hash_zone_count The number of hash zones - * @param [out] config_ptr A pointer to hold the new thread - * configuration - * - * @return VDO_SUCCESS or an error - **/ -int __must_check make_vdo_thread_config(zone_count_t logical_zone_count, - zone_count_t physical_zone_count, - zone_count_t hash_zone_count, - struct thread_config **config_ptr); - -/** - * Make a thread configuration that uses only one thread. - * - * @param [out] config_ptr A pointer to hold the new thread configuration - * - * @return VDO_SUCCESS or an error - **/ -int __must_check vdo_make_one_thread_config(struct thread_config **config_ptr); - -/** - * Destroy a thread configuration. - * - * @param config The thread configuration to destroy - **/ -void free_vdo_thread_config(struct thread_config *config); - -/** - * Get the thread id for a given logical zone. - * - * @param thread_config the thread config - * @param logical_zone the number of the logical zone - * - * @return the thread id for the given zone - **/ -static inline thread_id_t __must_check -vdo_get_logical_zone_thread(const struct thread_config *thread_config, - zone_count_t logical_zone) -{ - ASSERT_LOG_ONLY((logical_zone <= thread_config->logical_zone_count), - "logical zone valid"); - return thread_config->logical_threads[logical_zone]; -} - -/** - * Get the thread id for a given physical zone. - * - * @param thread_config the thread config - * @param physical_zone the number of the physical zone - * - * @return the thread id for the given zone - **/ -static inline thread_id_t __must_check -vdo_get_physical_zone_thread(const struct thread_config *thread_config, - zone_count_t physical_zone) -{ - ASSERT_LOG_ONLY((physical_zone <= thread_config->physical_zone_count), - "physical zone valid"); - return thread_config->physical_threads[physical_zone]; -} - -/** - * Get the thread id for a given hash zone. - * - * @param thread_config the thread config - * @param hash_zone the number of the hash zone - * - * @return the thread id for the given zone - **/ -static inline thread_id_t __must_check -vdo_get_hash_zone_thread(const struct thread_config *thread_config, - zone_count_t hash_zone) -{ - ASSERT_LOG_ONLY((hash_zone <= thread_config->hash_zone_count), - "hash zone valid"); - return thread_config->hash_zone_threads[hash_zone]; -} - -/** - * Format the name of the worker thread desired to support a given - * work queue. The physical layer may add a prefix identifying the - * product; the output from this function should just identify the - * thread. 
- * - * @param thread_config The thread configuration - * @param thread_id The thread id - * @param buffer Where to put the formatted name - * @param buffer_length Size of the output buffer - **/ -void vdo_get_thread_name(const struct thread_config *thread_config, - thread_id_t thread_id, - char *buffer, - size_t buffer_length); - -#endif /* THREAD_CONFIG_H */ diff --git a/vdo/time-utils.c b/vdo/time-utils.c new file mode 100644 index 00000000..bbb37f00 --- /dev/null +++ b/vdo/time-utils.c @@ -0,0 +1,20 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright Red Hat + */ + +#include "permassert.h" +#include "string-utils.h" +#include "time-utils.h" + +#include +#include + + +int64_t current_time_us(void) +{ + return current_time_ns(CLOCK_REALTIME) / NSEC_PER_USEC; +} + + + diff --git a/uds/timeUtils.h b/vdo/time-utils.h similarity index 63% rename from uds/timeUtils.h rename to vdo/time-utils.h index e6b32815..d58a21e4 100644 --- a/uds/timeUtils.h +++ b/vdo/time-utils.h @@ -1,34 +1,18 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ /* * Copyright Red Hat - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/uds-releases/krusty/src/uds/timeUtils.h#17 $ */ #ifndef TIME_UTILS_H #define TIME_UTILS_H #include "compiler.h" -#include "typeDefs.h" +#include "type-defs.h" #include #include -// Some constants that are defined in kernel headers. +/* Some constants that are defined in kernel headers. */ /** * Return the current nanosecond time according to the specified clock @@ -40,7 +24,7 @@ **/ static INLINE ktime_t current_time_ns(clockid_t clock) { - // clock is always a constant, so gcc reduces this to a single call + /* clock is always a constant, so gcc reduces this to a single call */ return clock == CLOCK_MONOTONIC ? ktime_get_ns() : ktime_get_real_ns(); } diff --git a/vdo/type-defs.h b/vdo/type-defs.h new file mode 100644 index 00000000..b19ed944 --- /dev/null +++ b/vdo/type-defs.h @@ -0,0 +1,26 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright Red Hat + */ + +#ifndef TYPE_DEFS_H +#define TYPE_DEFS_H + +/* + * General system type definitions. 
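The time helpers carried over above are thin wrappers around the kernel clock interfaces. A minimal sketch of the intended use, with arbitrary variable names::

    ktime_t start, elapsed_ns;

    start = current_time_ns(CLOCK_MONOTONIC);
    /* ... work being timed ... */
    elapsed_ns = current_time_ns(CLOCK_MONOTONIC) - start;

    /*
     * current_time_us() uses CLOCK_REALTIME, so it yields a wall-clock
     * timestamp in microseconds rather than a monotonic interval.
     */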
+ */ + +#include +#include +#include + +typedef unsigned char byte; + +#define CHAR_BIT 8 + +#define INT64_MAX (9223372036854775807L) +#define UCHAR_MAX ((unsigned char)~0ul) +#define UINT8_MAX ((uint8_t)~0ul) +#define UINT16_MAX ((uint16_t)~0ul) +#define UINT64_MAX ((uint64_t)~0ul) +#endif /* TYPE_DEFS_H */ diff --git a/vdo/types.h b/vdo/types.h index 01f2448d..ad943526 100644 --- a/vdo/types.h +++ b/vdo/types.h @@ -1,357 +1,159 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ /* * Copyright Red Hat - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/sulfur/src/c++/vdo/base/types.h#16 $ */ #ifndef TYPES_H #define TYPES_H -#include "blockMappingState.h" -#include "common.h" +#include "compiler.h" +#include "type-defs.h" +#include "block-mapping-state.h" -/** - * A size type in blocks. - **/ +/* A size type in blocks. */ typedef uint64_t block_count_t; -/** - * The size of a block. - **/ +/* The size of a block. */ typedef uint16_t block_size_t; -/** - * A count of compressed fragments - **/ -typedef uint8_t compressed_fragment_count_t; - -/** - * A height within a tree. - **/ +/* A height within a tree. */ typedef uint8_t height_t; -/** - * The logical block number as used by the consumer. - **/ +/* The logical block number as used by the consumer. */ typedef uint64_t logical_block_number_t; -/** - * The type of the nonce used to identify instances of VDO. - **/ +/* The type of the nonce used to identify instances of VDO. */ typedef uint64_t nonce_t; -/** - * A size in pages. - **/ +/* A size in pages. */ typedef uint32_t page_count_t; -/** - * A page number. - **/ +/* A page number. */ typedef uint32_t page_number_t; -/** - * The size of a page. Must be evenly divisible by block size. - **/ -typedef uint32_t page_size_t; - -/** +/* * The physical (well, less logical) block number at which the block is found * on the underlying device. - **/ + */ typedef uint64_t physical_block_number_t; -/** +/* * A release version number. These numbers are used to make the numbering * space for component versions independent across release branches. * * Really an enum, but we have to specify the size for encoding; see - * releaseVersions.h for the enumeration values. - **/ + * release_versions.h for the enumeration values. + */ typedef uint32_t release_version_number_t; -/** - * A count of tree roots. - **/ +/* A count of tree roots. */ typedef uint8_t root_count_t; -/** - * A number of sectors. - **/ +/* A number of sectors. */ typedef uint8_t sector_count_t; -/** - * A sequence number. - **/ +/* A sequence number. */ typedef uint64_t sequence_number_t; -/** - * The offset of a block within a slab. - **/ +/* The offset of a block within a slab. */ typedef uint32_t slab_block_number; -/** - * A size type in slabs. - **/ +/* A size type in slabs. 
*/ typedef uint16_t slab_count_t; -/** - * A slot in a bin or block map page. - **/ +/* A slot in a bin or block map page. */ typedef uint16_t slot_number_t; -/** - * A number of vios. - **/ -typedef uint16_t vio_count_t; - -/** - * A VDO thread configuration. - **/ -struct thread_config; - -/** - * A thread counter - **/ -typedef uint8_t thread_count_t; - -/** - * A thread ID - * - * Base-code threads are numbered sequentially starting from 0. - **/ -typedef uint8_t thread_id_t; - -/** - * The thread ID returned when the current base code thread ID cannot be found - * or is otherwise undefined. - **/ -static const thread_id_t VDO_INVALID_THREAD_ID = (thread_id_t) -1; - -/** - * A zone counter - **/ +/* A zone counter */ typedef uint8_t zone_count_t; -/** - * The type of request a vio is performing - **/ -enum vio_operation { - VIO_UNSPECIFIED_OPERATION = 0, - VIO_READ = 1, - VIO_WRITE = 2, - VIO_READ_MODIFY_WRITE = VIO_READ | VIO_WRITE, - VIO_READ_WRITE_MASK = VIO_READ_MODIFY_WRITE, - VIO_FLUSH_BEFORE = 4, - VIO_FLUSH_AFTER = 8, -} __packed; +/* + * The following enums are persisted on storage, so the values must be + * preserved. + */ -/** - * vio types for statistics and instrumentation. - **/ -enum vio_type { - VIO_TYPE_UNINITIALIZED = 0, - VIO_TYPE_DATA, - VIO_TYPE_BLOCK_ALLOCATOR, - VIO_TYPE_BLOCK_MAP, - VIO_TYPE_BLOCK_MAP_INTERIOR, - VIO_TYPE_COMPRESSED_BLOCK, - VIO_TYPE_PARTITION_COPY, - VIO_TYPE_RECOVERY_JOURNAL, - VIO_TYPE_SLAB_JOURNAL, - VIO_TYPE_SLAB_SUMMARY, - VIO_TYPE_SUPER_BLOCK, - VIO_TYPE_TEST, -} __packed; +/* The current operating mode of the VDO. */ +enum vdo_state { + VDO_DIRTY = 0, + VDO_NEW = 1, + VDO_CLEAN = 2, + VDO_READ_ONLY_MODE = 3, + VDO_FORCE_REBUILD = 4, + VDO_RECOVERING = 5, + VDO_REPLAYING = 6, + VDO_REBUILD_FOR_UPGRADE = 7, + + /* Keep VDO_STATE_COUNT at the bottom. */ + VDO_STATE_COUNT +}; -/** +/* * The current operation on a physical block (from the point of view of the * recovery journal, slab journals, and reference counts. - **/ + */ enum journal_operation { - DATA_DECREMENT = 0, - DATA_INCREMENT = 1, - BLOCK_MAP_DECREMENT = 2, - BLOCK_MAP_INCREMENT = 3, + VDO_JOURNAL_DATA_DECREMENT = 0, + VDO_JOURNAL_DATA_INCREMENT = 1, + VDO_JOURNAL_BLOCK_MAP_DECREMENT = 2, + VDO_JOURNAL_BLOCK_MAP_INCREMENT = 3, } __packed; -/** - * Partition IDs are encoded in the volume layout in the super block. - **/ +/* Partition IDs encoded in the volume layout in the super block. */ enum partition_id { - BLOCK_MAP_PARTITION = 0, - BLOCK_ALLOCATOR_PARTITION = 1, - RECOVERY_JOURNAL_PARTITION = 2, - SLAB_SUMMARY_PARTITION = 3, + VDO_BLOCK_MAP_PARTITION = 0, + VDO_BLOCK_ALLOCATOR_PARTITION = 1, + VDO_RECOVERY_JOURNAL_PARTITION = 2, + VDO_SLAB_SUMMARY_PARTITION = 3, } __packed; -/** - * Check whether a vio_type is for servicing an external data request. 
- * - * @param type The vio_type to check - **/ -static inline bool is_vdo_data_vio_type(enum vio_type type) -{ - return (type == VIO_TYPE_DATA); -} - -/** - * Check whether a vio_type is for compressed block writes - * - * @param type The vio_type to check - **/ -static inline bool is_vdo_compressed_write_vio_type(enum vio_type type) -{ - return (type == VIO_TYPE_COMPRESSED_BLOCK); -} - -/** - * Check whether a vio_type is for metadata - * - * @param type The vio_type to check - **/ -static inline bool is_vdo_metadata_vio_type(enum vio_type type) -{ - return ((type != VIO_TYPE_UNINITIALIZED) && - !is_vdo_data_vio_type(type) && - !is_vdo_compressed_write_vio_type(type)); -} - -/** - * Priority levels for asynchronous I/O operations performed on a vio. - **/ -enum vio_priority { - VIO_PRIORITY_LOW = 0, - VIO_PRIORITY_DATA = VIO_PRIORITY_LOW, - VIO_PRIORITY_COMPRESSED_DATA = VIO_PRIORITY_DATA, - VIO_PRIORITY_METADATA, - VIO_PRIORITY_HIGH, -} __packed; - -/** - * Metadata types for the vdo. - **/ +/* Metadata types for the vdo. */ enum vdo_metadata_type { VDO_METADATA_RECOVERY_JOURNAL = 1, VDO_METADATA_SLAB_JOURNAL, } __packed; -enum vdo_zone_type { - ZONE_TYPE_ADMIN, - ZONE_TYPE_JOURNAL, - ZONE_TYPE_LOGICAL, - ZONE_TYPE_PHYSICAL, -}; - -/** - * A position in the block map where a block map entry is stored. - **/ +/* A position in the block map where a block map entry is stored. */ struct block_map_slot { physical_block_number_t pbn; slot_number_t slot; }; -/** +struct data_location { + physical_block_number_t pbn; + enum block_mapping_state state; +}; + +/* * The configuration of a single slab derived from the configured block size * and slab size. - **/ + */ struct slab_config { - /** total number of blocks in the slab */ + /* total number of blocks in the slab */ block_count_t slab_blocks; - /** number of blocks available for data */ + /* number of blocks available for data */ block_count_t data_blocks; - /** number of blocks for reference counts */ + /* number of blocks for reference counts */ block_count_t reference_count_blocks; - /** number of blocks for the slab journal */ + /* number of blocks for the slab journal */ block_count_t slab_journal_blocks; - /** + /* * Number of blocks after which the slab journal starts pushing out a * reference_block for each new entry it receives. - **/ + */ block_count_t slab_journal_flushing_threshold; - /** + /* * Number of blocks after which the slab journal pushes out all * reference_blocks and makes all vios wait. - **/ + */ block_count_t slab_journal_blocking_threshold; - /** + /* * Number of blocks after which the slab must be scrubbed before coming * online. 
- **/ + */ block_count_t slab_journal_scrubbing_threshold; } __packed; -/** - * Forward declarations of abstract types - **/ -struct action_manager; -struct allocating_vio; -struct allocation_selector; -struct block_allocator; -struct block_map; -struct block_map_tree_zone; -struct block_map_zone; -struct data_vio; -struct flusher; -struct forest; -struct hash_lock; -struct hash_zone; -struct index_config; -struct input_bin; -struct io_submitter; -struct lbn_lock; -struct lock_counter; -struct logical_zone; -struct logical_zones; -struct pbn_lock; -typedef struct physicalLayer PhysicalLayer; -struct physical_zone; -struct recovery_journal; -struct read_only_notifier; -struct ref_counts; -struct vdo_slab; -struct slab_depot; -struct slab_journal; -struct slab_journal_entry; -struct slab_scrubber; -struct slab_summary; -struct slab_summary_zone; -struct vdo; -struct vdo_completion; struct vdo_config; -struct vdo_extent; -struct vdo_flush; -struct vdo_layout; -struct vdo_statistics; -struct vdo_work_item; -struct vio; -struct vio_pool; - -struct data_location { - physical_block_number_t pbn; - enum block_mapping_state state; -}; - -struct zoned_pbn { - physical_block_number_t pbn; - enum block_mapping_state state; - struct physical_zone *zone; -}; -#endif // TYPES_H +#endif /* TYPES_H */ diff --git a/uds/sysfs.c b/vdo/uds-sysfs.c similarity index 54% rename from uds/sysfs.c rename to vdo/uds-sysfs.c index abc250fa..0bfb8281 100644 --- a/uds/sysfs.c +++ b/vdo/uds-sysfs.c @@ -1,47 +1,34 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * Copyright Red Hat - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. 
- * - * $Id: //eng/uds-releases/krusty/kernelLinux/uds/sysfs.c#18 $ */ -#include "sysfs.h" +#include "uds-sysfs.h" #include #include #include #include "logger.h" -#include "memoryAlloc.h" -#include "stringUtils.h" +#include "memory-alloc.h" +#include "string-utils.h" #include "uds.h" +#define UDS_SYSFS_NAME "uds" + static struct { - struct kobject kobj; // /sys/uds - struct kobject parameter_kobj; // /sys/uds/parameter - // These flags are used to ensure a clean shutdown - bool flag; // /sys/uds - bool parameter_flag; // /sys/uds/parameter + struct kobject kobj; /* /sys/uds */ + struct kobject parameter_kobj; /* /sys/uds/parameter */ + + /* These flags are used to ensure a clean shutdown */ + bool flag; /* /sys/uds */ + bool parameter_flag; /* /sys/uds/parameter */ } object_root; -/**********************************************************************/ static char *buffer_to_string(const char *buf, size_t length) { char *string; + if (UDS_ALLOCATE(length + 1, char, __func__, &string) != UDS_SUCCESS) { return NULL; } @@ -53,25 +40,22 @@ static char *buffer_to_string(const char *buf, size_t length) return string; } -/**********************************************************************/ -// This is the code for a directory in the /sys/ tree that -// contains no regular files (only subdirectories). -/**********************************************************************/ +/* + * This is the code for a directory in the /sys/ tree that + * contains no regular files (only subdirectories). + */ -/**********************************************************************/ static void empty_release(struct kobject *kobj) { - // Many of our sysfs share this release function that does nothing. + /* Many of our sysfs share this release function that does nothing. */ } -/**********************************************************************/ static ssize_t empty_show(struct kobject *kobj, struct attribute *attr, char *buf) { return 0; } -/**********************************************************************/ static ssize_t empty_store(struct kobject *kobj, struct attribute *attr, const char *buf, @@ -88,20 +72,19 @@ static struct sysfs_ops empty_ops = { static struct attribute *empty_attrs[] = { NULL, }; +ATTRIBUTE_GROUPS(empty); static struct kobj_type empty_object_type = { .release = empty_release, .sysfs_ops = &empty_ops, - .default_attrs = empty_attrs, + .default_groups = empty_groups, }; -/**********************************************************************/ -// This is the the code for the /sys//parameter directory. -// -//
/log_level UDS_LOG_LEVEL -// -/**********************************************************************/ +/* + * This is the code for the /sys//parameter directory. + * /log_level UDS_LOG_LEVEL + */ struct parameter_attribute { struct attribute attr; @@ -109,7 +92,6 @@ struct parameter_attribute { void (*store_string)(const char *); }; -/**********************************************************************/ static ssize_t parameter_show(struct kobject *kobj, struct attribute *attr, char *buf) { @@ -122,7 +104,6 @@ parameter_show(struct kobject *kobj, struct attribute *attr, char *buf) } } -/**********************************************************************/ static ssize_t parameter_store(struct kobject *kobj, struct attribute *attr, const char *buf, @@ -143,21 +124,18 @@ static ssize_t parameter_store(struct kobject *kobj, return length; } -/**********************************************************************/ static const char *parameter_show_log_level(void) { return uds_log_priority_to_string(get_uds_log_level()); } -/**********************************************************************/ static void parameter_store_log_level(const char *string) { set_uds_log_level(uds_log_string_to_priority(string)); } -/**********************************************************************/ static struct parameter_attribute log_level_attr = { .attr = { .name = "log_level", .mode = 0600 }, @@ -169,6 +147,7 @@ static struct attribute *parameter_attrs[] = { &log_level_attr.attr, NULL, }; +ATTRIBUTE_GROUPS(parameter); static struct sysfs_ops parameter_ops = { .show = parameter_show, @@ -178,16 +157,16 @@ static struct sysfs_ops parameter_ops = { static struct kobj_type parameter_object_type = { .release = empty_release, .sysfs_ops = ¶meter_ops, - .default_attrs = parameter_attrs, + .default_groups = parameter_groups, }; -/**********************************************************************/ -int init_uds_sysfs(void) +int uds_init_sysfs(void) { int result; + memset(&object_root, 0, sizeof(object_root)); kobject_init(&object_root.kobj, &empty_object_type); - result = kobject_add(&object_root.kobj, NULL, THIS_MODULE->name); + result = kobject_add(&object_root.kobj, NULL, UDS_SYSFS_NAME); if (result == 0) { object_root.flag = true; kobject_init(&object_root.parameter_kobj, @@ -200,13 +179,12 @@ int init_uds_sysfs(void) } } if (result != 0) { - put_uds_sysfs(); + uds_put_sysfs(); } return result; } -/**********************************************************************/ -void put_uds_sysfs(void) +void uds_put_sysfs(void) { if (object_root.parameter_flag) { kobject_put(&object_root.parameter_kobj); diff --git a/vdo/uds-sysfs.h b/vdo/uds-sysfs.h new file mode 100644 index 00000000..1e4c409e --- /dev/null +++ b/vdo/uds-sysfs.h @@ -0,0 +1,23 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright Red Hat + */ + +#ifndef UDS_SYSFS_H +#define UDS_SYSFS_H + +/** + * Called when the module is loaded to initialize the /sys/\ + * tree. + * + * @return 0 on success, or non-zero on error + **/ +int uds_init_sysfs(void); + +/** + * Called when the module is being unloaded to terminate the + * /sys/\ tree. 
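The renamed uds_init_sysfs()/uds_put_sysfs() pair is expected to bracket module load and unload. A hedged sketch of that pairing; the module init and exit functions shown here are hypothetical and not part of this patch::

    static int __init example_module_init(void)
    {
            int result = uds_init_sysfs();

            if (result != 0) {
                    return result;
            }

            /* ... remaining module initialization ... */
            return 0;
    }

    static void __exit example_module_exit(void)
    {
            /* Tear down the /sys/uds tree created at load time. */
            uds_put_sysfs();
    }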
+ **/ +void uds_put_sysfs(void); + +#endif /* UDS_SYSFS_H */ diff --git a/uds/threadsLinuxKernel.c b/vdo/uds-threads.c similarity index 65% rename from uds/threadsLinuxKernel.c rename to vdo/uds-threads.c index 2e404710..95504b15 100644 --- a/uds/threadsLinuxKernel.c +++ b/vdo/uds-threads.c @@ -1,56 +1,66 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * Copyright Red Hat - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/uds-releases/krusty/kernelLinux/uds/threadsLinuxKernel.c#24 $ */ +#include "uds-threads.h" + #include #include #include +#include #include "errors.h" -#include "memoryAlloc.h" +#include "memory-alloc.h" #include "logger.h" -#include "uds-threads.h" static struct hlist_head kernel_thread_list; static struct mutex kernel_thread_mutex; -static once_state_t kernel_thread_once; +static atomic_t kernel_thread_once = ATOMIC_INIT(0); struct thread { - void (*thread_func)(void *); + void (*thread_func)(void *thread_data); void *thread_data; struct hlist_node thread_links; struct task_struct *thread_task; struct completion thread_done; }; -/**********************************************************************/ +enum { + ONCE_NOT_DONE = 0, + ONCE_IN_PROGRESS = 1, + ONCE_COMPLETE = 2, +}; + +void perform_once(atomic_t *once, void (*function)(void)) +{ + for (;;) { + switch (atomic_cmpxchg(once, ONCE_NOT_DONE, ONCE_IN_PROGRESS)) { + case ONCE_NOT_DONE: + function(); + atomic_set_release(once, ONCE_COMPLETE); + return; + case ONCE_IN_PROGRESS: + uds_yield_scheduler(); + break; + case ONCE_COMPLETE: + return; + default: + return; + } + } +} + static void kernel_thread_init(void) { mutex_init(&kernel_thread_mutex); } -/**********************************************************************/ static int thread_starter(void *arg) { struct registered_thread allocating_thread; struct thread *kt = arg; + kt->thread_task = current; perform_once(&kernel_thread_once, kernel_thread_init); mutex_lock(&kernel_thread_mutex); @@ -63,7 +73,6 @@ static int thread_starter(void *arg) return 0; } -/**********************************************************************/ int uds_create_thread(void (*thread_func)(void *), void *thread_data, const char *name, @@ -116,7 +125,7 @@ int uds_create_thread(void (*thread_func)(void *), *new_thread = kt; return UDS_SUCCESS; } -/**********************************************************************/ + int uds_join_threads(struct thread *kt) { while (wait_for_completion_interruptible(&kt->thread_done) != 0) { @@ -128,27 +137,15 @@ int uds_join_threads(struct thread *kt) return UDS_SUCCESS; } -/**********************************************************************/ -void uds_apply_to_threads(void apply_func(void *, struct task_struct *), - void *argument) -{ - struct thread *kt; - perform_once(&kernel_thread_once, kernel_thread_init); - mutex_lock(&kernel_thread_mutex); - 
hlist_for_each_entry (kt, &kernel_thread_list, thread_links) { - apply_func(argument, kt->thread_task); - } - mutex_unlock(&kernel_thread_mutex); -} -/**********************************************************************/ void uds_thread_exit(void) { struct thread *kt; struct completion *completion = NULL; + perform_once(&kernel_thread_once, kernel_thread_init); mutex_lock(&kernel_thread_mutex); - hlist_for_each_entry (kt, &kernel_thread_list, thread_links) { + hlist_for_each_entry(kt, &kernel_thread_list, thread_links) { if (kt->thread_task == current) { completion = &kt->thread_done; break; @@ -156,25 +153,35 @@ void uds_thread_exit(void) } mutex_unlock(&kernel_thread_mutex); uds_unregister_allocating_thread(); + +/* + * Temporary workaround for LINUX_VERSION_CODE <= KERNEL_VERSION(5,17,0). + * We have two kernels, both claiming to be version 5.17.0, that have + * different APIs. The only way to distinguish the two is to check for + * the definition of a macro that was added as part of the change that + * implemented kthread_complete_and_exit. + */ +#ifndef module_put_and_kthread_exit complete_and_exit(completion, 1); +#else + kthread_complete_and_exit(completion, 1); +#endif } -/**********************************************************************/ pid_t uds_get_thread_id(void) { return current->pid; } -/**********************************************************************/ unsigned int uds_get_num_cores(void) { return num_online_cpus(); } -/**********************************************************************/ int uds_initialize_barrier(struct barrier *barrier, unsigned int thread_count) { int result = uds_initialize_semaphore(&barrier->mutex, 1); + if (result != UDS_SUCCESS) { return result; } @@ -183,33 +190,34 @@ int uds_initialize_barrier(struct barrier *barrier, unsigned int thread_count) return uds_initialize_semaphore(&barrier->wait, 0); } -/**********************************************************************/ int uds_destroy_barrier(struct barrier *barrier) { int result = uds_destroy_semaphore(&barrier->mutex); + if (result != UDS_SUCCESS) { return result; } return uds_destroy_semaphore(&barrier->wait); } -/**********************************************************************/ int uds_enter_barrier(struct barrier *barrier, bool *winner) { bool last_thread; + uds_acquire_semaphore(&barrier->mutex); last_thread = ++barrier->arrived == barrier->thread_count; if (last_thread) { - // This is the last thread to arrive, so wake up the others + /* This is the last thread to arrive, so wake up the others */ int i; + for (i = 1; i < barrier->thread_count; i++) { uds_release_semaphore(&barrier->wait); } - // Then reinitialize for the next cycle + /* Then reinitialize for the next cycle */ barrier->arrived = 0; uds_release_semaphore(&barrier->mutex); } else { - // This is NOT the last thread to arrive, so just wait + /* This is NOT the last thread to arrive, so just wait */ uds_release_semaphore(&barrier->mutex); uds_acquire_semaphore(&barrier->wait); } @@ -219,7 +227,6 @@ int uds_enter_barrier(struct barrier *barrier, bool *winner) return UDS_SUCCESS; } -/**********************************************************************/ int uds_yield_scheduler(void) { yield(); diff --git a/uds/uds-threads.h b/vdo/uds-threads.h similarity index 82% rename from uds/uds-threads.h rename to vdo/uds-threads.h index 8e1a5892..f3596c6c 100644 --- a/uds/uds-threads.h +++ b/vdo/uds-threads.h @@ -1,37 +1,21 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ /* * Copyright Red Hat - * - * This program is 
free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/uds-releases/krusty/src/uds/uds-threads.h#9 $ */ #ifndef UDS_THREADS_H #define UDS_THREADS_H -#include "compiler.h" -#include "errors.h" -#include "threadOnce.h" -#include "timeUtils.h" - +#include #include #include #include #include -#include "util/eventCount.h" +#include "event-count.h" + +#include "compiler.h" +#include "errors.h" +#include "time-utils.h" struct cond_var { struct event_count *event_count; @@ -39,21 +23,12 @@ struct cond_var { struct thread; struct barrier { - struct semaphore mutex; // Mutex for this barrier object - struct semaphore wait; // Semaphore for threads waiting at the barrier - int arrived; // Number of threads which have arrived - int thread_count; // Total number of threads using this barrier + struct semaphore mutex; /* Mutex for this barrier object */ + struct semaphore wait; /* Semaphore for threads waiting at the barrier */ + int arrived; /* Number of threads which have arrived */ + int thread_count; /* Total number of threads using this barrier */ }; -/** - * Apply a function to every thread that we have created. - * - * @param apply_func The function to apply - * @param argument The first argument to apply_func - * - **/ -void uds_apply_to_threads(void apply_func(void *, struct task_struct *), - void *argument); /** * Create a thread, logging any cause of failure. @@ -88,6 +63,21 @@ unsigned int uds_get_num_cores(void); pid_t __must_check uds_get_thread_id(void); +/** + * Thread safe once only initialization. + * + * @param once_state pointer to object to record that initialization + * has been performed + * @param init_function called if once_state does not indicate + * initialization has been performed + * + * @note Generally the following declaration of once_state is performed in + * at file scope: + * + * static atomic_t once_state = ATOMIC_INIT(0); + **/ +void perform_once(atomic_t *once_state, void (*init_function) (void)); + /** * Wait for termination of another thread. * @@ -277,8 +267,10 @@ static INLINE int uds_destroy_semaphore(struct semaphore *semaphore) **/ static INLINE void uds_acquire_semaphore(struct semaphore *semaphore) { - // Do not use down(semaphore). Instead use down_interruptible so that - // we do not get 120 second stall messages in kern.log. + /* + * Do not use down(semaphore). Instead use down_interruptible so that + * we do not get 120 second stall messages in kern.log. + */ while (down_interruptible(semaphore) != 0) { /* * If we're called from a user-mode process (e.g., "dmsetup @@ -290,7 +282,7 @@ static INLINE void uds_acquire_semaphore(struct semaphore *semaphore) * still keep consuming CPU time slices and swamp other threads * trying to do computational work. 
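perform_once() above replaces the old once_state_t mechanism with a bare atomic_t. A minimal usage sketch following the file-scope declaration pattern described in its comment; the function names are illustrative::

    /* File-scope state recording whether initialization has run. */
    static atomic_t example_once = ATOMIC_INIT(0);

    static void example_initialize(void)
    {
            /* One-time setup, e.g. mutex_init() on a shared lock. */
    }

    static void example_entry_point(void)
    {
            /*
             * The first caller runs example_initialize(); concurrent
             * callers yield until it completes; later callers return
             * immediately.
             */
            perform_once(&example_once, example_initialize);

            /* ... the rest of the work ... */
    }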
[VDO-4980] */ - msleep(1); + fsleep(1000); } } @@ -313,7 +305,7 @@ __must_check uds_attempt_semaphore(struct semaphore *semaphore, ktime_t timeout) { if (timeout <= 0) { - // No timeout, just try to grab the semaphore. + /* No timeout, just try to grab the semaphore. */ return down_trylock(semaphore) == 0; } else { unsigned int jiffies = nsecs_to_jiffies(timeout); diff --git a/uds/uds.h b/vdo/uds.h similarity index 62% rename from uds/uds.h rename to vdo/uds.h index e056efd5..ad46c168 100644 --- a/uds/uds.h +++ b/vdo/uds.h @@ -1,22 +1,6 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ /* * Copyright Red Hat - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/uds-releases/krusty/src/uds/uds.h#37 $ */ /** @@ -31,9 +15,10 @@ #ifndef UDS_H #define UDS_H +#include + #include "compiler.h" -#include "uds-platform.h" -#include "util/funnelQueue.h" +#include "funnel-queue.h" /** * Valid request types. @@ -62,10 +47,15 @@ enum uds_request_type { /** * Request type for operations that query mappings in the UDS - * index. When a mapping is found, the recency of the mapping - * is updated unless it's the no-update call. + * index. The recency of the mapping is updated. **/ UDS_QUERY, + + /** + * Request type for operations that query mappings in the UDS + * index without updating the recency of the mapping. + **/ + UDS_QUERY_NO_UPDATE, }; /** @@ -99,27 +89,26 @@ enum { /** * Type representing memory configuration which is either a positive - * integer number of gigabytes or one of the three special constants + * integer number of gigabytes or one of the six special constants * for configurations which are smaller than 1 gigabyte. **/ -typedef unsigned int uds_memory_config_size_t; +typedef int uds_memory_config_size_t; -extern const uds_memory_config_size_t UDS_MEMORY_CONFIG_256MB; -extern const uds_memory_config_size_t UDS_MEMORY_CONFIG_512MB; -extern const uds_memory_config_size_t UDS_MEMORY_CONFIG_768MB; -/** - * The maximum configurable amount of memory. - **/ -extern const uds_memory_config_size_t UDS_MEMORY_CONFIG_MAX; - -/** - * Memory size constants for volumes that have one less chapter - */ -extern const uds_memory_config_size_t UDS_MEMORY_CONFIG_REDUCED_256MB; -extern const uds_memory_config_size_t UDS_MEMORY_CONFIG_REDUCED_512MB; -extern const uds_memory_config_size_t UDS_MEMORY_CONFIG_REDUCED_768MB; -extern const uds_memory_config_size_t UDS_MEMORY_CONFIG_REDUCED; -extern const uds_memory_config_size_t UDS_MEMORY_CONFIG_REDUCED_MAX; +enum { + /* The maximum configurable amount of memory. 
*/ + UDS_MEMORY_CONFIG_MAX = 1024, + /* Flag indicating volume has one less chapter than usual */ + UDS_MEMORY_CONFIG_REDUCED = 0x1000, + /* Flag indicating volume has one less chapter than usual */ + UDS_MEMORY_CONFIG_REDUCED_MAX = 1024 + UDS_MEMORY_CONFIG_REDUCED, + /* Special values indicating sizes less than 1 GB */ + UDS_MEMORY_CONFIG_256MB = -256, + UDS_MEMORY_CONFIG_512MB = -512, + UDS_MEMORY_CONFIG_768MB = -768, + UDS_MEMORY_CONFIG_REDUCED_256MB = -1280, + UDS_MEMORY_CONFIG_REDUCED_512MB = -1536, + UDS_MEMORY_CONFIG_REDUCED_768MB = -1792, +}; /** The name (hash) of a chunk. */ struct uds_chunk_name { @@ -142,25 +131,24 @@ struct uds_index_session; /** * The data used to configure a new index. **/ -struct uds_configuration; -typedef uint64_t uds_nonce_t; - -/** - * The data used to configure a new index session. - **/ struct uds_parameters { - // Tne number of threads used to process index requests. - int zone_count; - // The number of threads used to read volume pages. - int read_threads; - // The number of chapters to write between checkpoints. - int checkpoint_frequency; + /** String describing the storage device */ + const char *name; + /** The maximum allowable size of the index on storage */ + size_t size; + /** The offset where the index should start */ + off_t offset; + /** The maximum memory allocation, in GB */ + uds_memory_config_size_t memory_size; + /** Whether the index should include sparse chapters */ + bool sparse; + /** A 64-bit nonce to validate the index */ + uint64_t nonce; + /** The number of threads used to process index requests */ + unsigned int zone_count; + /** The number of threads used to read volume pages */ + unsigned int read_threads; }; -#define UDS_PARAMETERS_INITIALIZER { \ - .zone_count = 0, \ - .read_threads = 2, \ - .checkpoint_frequency = 0, \ - } /** * Index statistics @@ -227,15 +215,19 @@ struct uds_index; * The block's general location in the index. **/ enum uds_index_region { - /* the location isn't known yet */ + /* no location information has been determined */ UDS_LOCATION_UNKNOWN = 0, + /* the index page entry has been found */ + UDS_LOCATION_INDEX_PAGE_LOOKUP, + /* the record page entry has been found */ + UDS_LOCATION_RECORD_PAGE_LOOKUP, /* the block is not in the index */ UDS_LOCATION_UNAVAILABLE, - /* if the block was found in the open chapter */ + /* the block was found in the open chapter */ UDS_LOCATION_IN_OPEN_CHAPTER, - /* if the block was found in the dense part of the index */ + /* the block was found in the dense part of the index */ UDS_LOCATION_IN_DENSE, - /* if the block was found in the sparse part of the index */ + /* the block was found in the sparse part of the index */ UDS_LOCATION_IN_SPARSE } __packed; @@ -255,8 +247,6 @@ enum uds_zone_message_type { struct uds_zone_message { /** The type of message, determining how it will be processed */ enum uds_zone_message_type type; - /** The index to which the message is directed */ - struct uds_index *index; /** The virtual chapter number to which the message applies */ uint64_t virtual_chapter; }; @@ -294,7 +284,7 @@ struct uds_request { /* * The new metadata to associate with the name of the block (sometimes * called the duplicate address). Set before starting a #UDS_POST or - * #UDS_QUERY operation. Unchanged at time of callback. + * #UDS_UPDATE operation. Unchanged at time of callback. 
*/ struct uds_chunk_data new_metadata; /* @@ -310,8 +300,9 @@ struct uds_request { */ struct uds_index_session *session; /* - * The operation type, which is one of #UDS_DELETE, #UDS_POST, - * #UDS_QUERY or #UDS_UPDATE. Set before starting an operation. + * The operation type, which is one of #UDS_POST, #UDS_UPDATE, + * #UDS_DELETE, #UDS_QUERY or #UDS_QUERY_NO_UPDATE. + * Set before starting an operation. * Unchanged at time of callback. */ enum uds_request_type type; @@ -325,12 +316,6 @@ struct uds_request { * Set before the callback. */ bool found; - /* - * If true, move the entry to the end of the deduplication window. - * Set before starting a #UDS_QUERY operation. - * Unchanged at time of callback. - */ - bool update; /* * The remainder of this structure consists of fields used within the @@ -353,101 +338,24 @@ struct uds_request { bool unbatched; /** If true, attempt to handle this request before newer requests */ bool requeued; + /** The virtual chapter containing the record */ + uint64_t virtual_chapter; /** The location of this chunk name in the index */ enum uds_index_region location; }; -/** - * Initializes an index configuration. - * - * @param [out] conf The new configuration - * @param [in] mem_gb The maximum memory allocation, in GB - * - * @return Either #UDS_SUCCESS or an error code - **/ -int __must_check uds_initialize_configuration(struct uds_configuration **conf, - uds_memory_config_size_t mem_gb); - -/** - * Sets or clears an index configuration's sparse indexing settings. - * - * @param [in,out] conf The configuration to change - * @param [in] sparse If true, request a sparse - * index; if false, request - * a default index. - * - **/ -void uds_configuration_set_sparse(struct uds_configuration *conf, bool sparse); - -/** - * Tests whether an index configuration specifies sparse indexing. - * - * @param [in] conf The configuration to check - * - * @return Returns true if the configuration - * is sparse, or false if not - **/ -bool __must_check uds_configuration_get_sparse(struct uds_configuration *conf); - -/** - * Sets an index configuration's nonce. - * - * @param [in,out] conf The configuration to change - * @param [in] nonce The 64 bit nonce. - * - **/ -void uds_configuration_set_nonce(struct uds_configuration *conf, - uds_nonce_t nonce); - -/** - * Gets an index configuration's nonce. - * - * @param [in] conf The configuration to check - * - * @return The 64 bit nonce. - **/ -uds_nonce_t __must_check -uds_configuration_get_nonce(struct uds_configuration *conf); - -/** - * Fetches a configuration's maximum memory allocation. - * - * @param [in] conf The configuration to check - * - * @return The amount of memory allocated, in GB - **/ -uds_memory_config_size_t __must_check -uds_configuration_get_memory(struct uds_configuration *conf); - -/** - * Fetches a configuration's chapters per volume value. - * - * @param [in] conf The configuration to check - * - * @return The number of chapters per volume - **/ -unsigned int __must_check -uds_configuration_get_chapters_per_volume(struct uds_configuration *conf); - -/** - * Frees memory used by a configuration. - * - * @param [in,out] conf The configuration for which memory is being freed - **/ -void uds_free_configuration(struct uds_configuration *conf); - /** * Compute the size required to store the index on persistent storage. This * size is valid for any index stored in a single file or on a single block * device. This size should be used when configuring a block device on which * to store an index. 
* - * @param [in] config A uds_configuration for an index. + * @param [in] parameters Parameters for an index. * @param [out] index_size The number of bytes required to store the index. * * @return UDS_SUCCESS or an error code. **/ -int __must_check uds_compute_index_size(const struct uds_configuration *config, +int __must_check uds_compute_index_size(const struct uds_parameters *parameters, uint64_t *index_size); /** @@ -464,43 +372,21 @@ int __must_check uds_compute_index_size(const struct uds_configuration *config, **/ int __must_check uds_create_index_session(struct uds_index_session **session); -/** - * Fetches the UDS library version. - * - * @return The library version - **/ -const char * __must_check uds_get_version(void); - -/** - * The name argument to #uds_open_index is a text string that names the index. - * The name should have the form "path", where path is the name of the block - * device. The path should not contain white space. The names can optionally - * contain size and/or offset options which give the number of bytes in the - * index and the byte offset to the start of the index. For example, the name - * "/dev/sda8 offset=409600 size=2048000000" is an index that is stored in - * 2040000000 bytes of /dev/sda8 starting at byte 409600. - **/ - /** * Opens an index with an existing session. This operation will fail if the * index session is suspended, or if there is already an open index. * * The index should be closed with #uds_close_index. * - * @param open_type The type of open, which is one of #UDS_LOAD, #UDS_CREATE, - * or #UDS_NO_REBUILD. - * @param name The name of the index - * @param params The index session parameters. If NULL, the default - * session parameters will be used. - * @param conf The index configuration - * @param session The index session + * @param open_type The type of open, which is one of #UDS_LOAD, #UDS_CREATE, + * or #UDS_NO_REBUILD. + * @param parameters The index parameters + * @param session The index session * * @return Either #UDS_SUCCESS or an error code **/ int __must_check uds_open_index(enum uds_open_index_type open_type, - const char *name, - const struct uds_parameters *params, - struct uds_configuration *conf, + const struct uds_parameters *parameters, struct uds_index_session *session); /** @@ -565,15 +451,16 @@ int __must_check uds_close_index(struct uds_index_session *session); int uds_destroy_index_session(struct uds_index_session *session); /** - * Returns the configuration for the given index session. + * Returns the parameters for the given index session. The caller is + * responsible for freeing the returned structure. * - * @param [in] session The session - * @param [out] conf The index configuration + * @param [in] session The session + * @param [out] parameters A copy of the index parameters * * @return Either #UDS_SUCCESS or an error code **/ -int __must_check uds_get_index_configuration(struct uds_index_session *session, - struct uds_configuration **conf); +int __must_check uds_get_index_parameters(struct uds_index_session *session, + struct uds_parameters **parameters); /** * Fetches index statistics for the given index session. @@ -586,37 +473,16 @@ int __must_check uds_get_index_configuration(struct uds_index_session *session, int __must_check uds_get_index_stats(struct uds_index_session *session, struct uds_index_stats *stats); -/** - * Convert an error code to a string. 
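With the configuration-object API removed, an index is now described entirely by struct uds_parameters. A sketch of the resulting session lifecycle; the device path and nonce are placeholders, and return-value checks are abbreviated::

    struct uds_index_session *session;
    struct uds_index_stats stats;
    struct uds_parameters parameters = {
            .name = "/dev/sda8",                 /* placeholder device */
            .memory_size = UDS_MEMORY_CONFIG_256MB,
            .nonce = 0x0123456789abcdefULL,      /* placeholder nonce */
    };
    int result;

    result = uds_create_index_session(&session);
    if (result != UDS_SUCCESS) {
            return result;
    }

    result = uds_open_index(UDS_CREATE, &parameters, session);
    if (result == UDS_SUCCESS) {
            result = uds_get_index_stats(session, &stats);
            uds_close_index(session);
    }

    uds_destroy_index_session(session);
    return result;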
- * - * @param errnum The error code - * @param buf The buffer to hold the error string - * @param buflen The length of the buffer - * - * @return A pointer to buf - **/ -const char * __must_check uds_string_error(int errnum, - char *buf, - size_t buflen); - -/** - * Suggested buffer size for uds_string_error. - **/ -enum { UDS_STRING_ERROR_BUFSIZE = 128 }; - /** @{ */ /** @name Deduplication */ /** - * Start a UDS index chunk operation. The request type field must - * be set to the type of operation. This is an asynchronous interface to the - * block-oriented UDS API. The callback is invoked upon completion. - * - * The #UDS_DELETE operation type deletes the mapping for a particular block. - * #UDS_DELETE is typically used when UDS provides invalid advice. + * Start a UDS index chunk operation. The request type field must + * be set to the type of operation. This is an asynchronous interface to the + * block-oriented UDS API. The callback is invoked upon completion. * * The #UDS_POST operation type indexes a block name and associates it with a - * particular address. The caller provides the block's name. UDS then checks + * particular address. The caller provides the block's name. UDS then checks * this name against its index. *
    *
  • If the block is new, it is stored in the index.
  • @@ -624,20 +490,29 @@ enum { UDS_STRING_ERROR_BUFSIZE = 128 }; * canonical block address via the callback. *
* + * The #UDS_UPDATE operation type updates the mapping for a particular block. + * #UDS_UPDATE is typically used if the callback function provides invalid + * advice. + * + * The #UDS_DELETE operation type deletes the mapping for a particular block. + * #UDS_DELETE is typically used when UDS provides invalid advice. + * * The #UDS_QUERY operation type checks to see if a block name exists in the - * index. The caller provides the block's name. UDS then checks - * this name against its index. + * index. The caller provides the block's name. UDS then checks this name + * against its index. *
    *
  • If the block is new, no action is taken.
  • - *
  • If the block is a duplicate of an indexed block, UDS returns the - * canonical block address via the callback. If the update - * field is set, the entry is moved to the end of the deduplication - * window.
+ * canonical block address via the callback and the entry is moved to + * the end of the deduplication window. * - * The #UDS_UPDATE operation type updates the mapping for a particular block. - * #UDS_UPDATE is typically used if the callback function provides invalid - * advice. + * The #UDS_QUERY_NO_UPDATE operation type checks to see if a block name exists + * in the index. The caller provides the block's name. UDS then checks this + * name against its index. + *
    + *
  • If the block is new, no action is taken.
  • + *
  • If the block is a duplicate of an indexed block, UDS returns the + * canonical block address via the callback. * * @param [in] request The operation. The type, * chunk_name, new_metadata, diff --git a/vdo/vdo-component-states.c b/vdo/vdo-component-states.c new file mode 100644 index 00000000..49b76daf --- /dev/null +++ b/vdo/vdo-component-states.c @@ -0,0 +1,233 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright Red Hat + */ + +#include "vdo-component-states.h" + +#include "logger.h" +#include "memory-alloc.h" +#include "permassert.h" + +#include "block-map-format.h" +#include "constants.h" +#include "num-utils.h" +#include "recovery-journal-format.h" +#include "slab-depot-format.h" +#include "status-codes.h" +#include "types.h" +#include "vdo-component.h" +#include "vdo-layout.h" + +const struct version_number VDO_VOLUME_VERSION_67_0 = { + .major_version = 67, + .minor_version = 0, +}; + +/** + * vdo_destroy_component_states() - Clean up any allocations in a + * vdo_component_states. + * @states: The component states to destroy. + */ +void vdo_destroy_component_states(struct vdo_component_states *states) +{ + if (states == NULL) { + return; + } + + vdo_free_fixed_layout(UDS_FORGET(states->layout)); +} + +/** + * decode_components() - Decode the components now that we know the component + * data is a version we understand. + * @buffer: The buffer being decoded. + * @states: An object to hold the successfully decoded state. + * + * Return: VDO_SUCCESS or an error. + */ +static int __must_check +decode_components(struct buffer *buffer, struct vdo_component_states *states) +{ + int result = vdo_decode_component(buffer, &states->vdo); + + if (result != VDO_SUCCESS) { + return result; + } + + result = vdo_decode_fixed_layout(buffer, &states->layout); + if (result != VDO_SUCCESS) { + return result; + } + + result = vdo_decode_recovery_journal_state_7_0(buffer, + &states->recovery_journal); + if (result != VDO_SUCCESS) { + return result; + } + + result = vdo_decode_slab_depot_state_2_0(buffer, &states->slab_depot); + if (result != VDO_SUCCESS) { + return result; + } + + result = vdo_decode_block_map_state_2_0(buffer, &states->block_map); + if (result != VDO_SUCCESS) { + return result; + } + + ASSERT_LOG_ONLY((content_length(buffer) == 0), + "All decoded component data was used"); + return VDO_SUCCESS; +} + +/** + * vdo_decode_component_states() - Decode the payload of a super block. + * @buffer: The buffer containing the encoded super block contents. + * @expected_release_version: The required release version. + * @states: A pointer to hold the decoded states. + * + * Return: VDO_SUCCESS or an error. + */ +int vdo_decode_component_states(struct buffer *buffer, + release_version_number_t expected_release_version, + struct vdo_component_states *states) +{ + /* Check the release version against the one from the geometry. 
*/ + int result = get_uint32_le_from_buffer(buffer, + &states->release_version); + if (result != VDO_SUCCESS) { + return result; + } + + if (states->release_version != expected_release_version) { + return uds_log_error_strerror(VDO_UNSUPPORTED_VERSION, + "Geometry release version %u does not match super block release version %u", + expected_release_version, + states->release_version); + } + + /* Check the VDO volume version */ + result = vdo_decode_version_number(buffer, &states->volume_version); + if (result != VDO_SUCCESS) { + return result; + } + + result = vdo_validate_version(VDO_VOLUME_VERSION_67_0, + states->volume_version, + "volume"); + if (result != VDO_SUCCESS) { + return result; + } + + result = decode_components(buffer, states); + if (result != VDO_SUCCESS) { + vdo_free_fixed_layout(UDS_FORGET(states->layout)); + return result; + } + + return VDO_SUCCESS; +} + +/** + * vdo_validate_component_states() - Validate the decoded super block + * configuration. + * @states: The state decoded from the super block. + * @geometry_nonce: The nonce from the geometry block. + * @physical_size: The minimum block count of the underlying storage. + * @logical_size: The expected logical size of the VDO, or 0 if the + * logical size may be unspecified. + * + * Return: VDO_SUCCESS or an error if the configuration is invalid. + */ +int vdo_validate_component_states(struct vdo_component_states *states, + nonce_t geometry_nonce, + block_count_t physical_size, + block_count_t logical_size) +{ + if (geometry_nonce != states->vdo.nonce) { + return uds_log_error_strerror(VDO_BAD_NONCE, + "Geometry nonce %llu does not match superblock nonce %llu", + (unsigned long long) geometry_nonce, + (unsigned long long) states->vdo.nonce); + } + + return vdo_validate_config(&states->vdo.config, + physical_size, + logical_size); +} + +/** + * get_component_data_size() - Get the component data size of a vdo. + * @layout: The layout of the vdo. + * + * Return: The component data size of the vdo. + */ +static size_t __must_check get_component_data_size(struct fixed_layout *layout) +{ + return (sizeof(release_version_number_t) + + sizeof(struct packed_version_number) + + vdo_get_component_encoded_size() + + vdo_get_fixed_layout_encoded_size(layout) + + vdo_get_recovery_journal_encoded_size() + + vdo_get_slab_depot_encoded_size() + + vdo_get_block_map_encoded_size()); +} + +/** + * vdo_encode_component_states() - Encode the state of all vdo components for + * writing in the super block. + * @buffer: The buffer to encode into. + * @states: The states to encode. 
+ */ +int vdo_encode_component_states(struct buffer *buffer, + const struct vdo_component_states *states) +{ + size_t expected_size; + int result = reset_buffer_end(buffer, 0); + + if (result != UDS_SUCCESS) { + return result; + } + + result = put_uint32_le_into_buffer(buffer, states->release_version); + if (result != UDS_SUCCESS) { + return result; + } + + result = vdo_encode_version_number(states->volume_version, buffer); + if (result != VDO_SUCCESS) { + return result; + } + + result = vdo_encode_component(states->vdo, buffer); + if (result != VDO_SUCCESS) { + return result; + } + + result = vdo_encode_fixed_layout(states->layout, buffer); + if (result != VDO_SUCCESS) { + return result; + } + + result = vdo_encode_recovery_journal_state_7_0(states->recovery_journal, + buffer); + if (result != VDO_SUCCESS) { + return result; + } + + result = vdo_encode_slab_depot_state_2_0(states->slab_depot, buffer); + if (result != VDO_SUCCESS) { + return result; + } + + result = vdo_encode_block_map_state_2_0(states->block_map, buffer); + if (result != VDO_SUCCESS) { + return result; + } + + expected_size = get_component_data_size(states->layout); + ASSERT_LOG_ONLY((content_length(buffer) == expected_size), + "All super block component data was encoded"); + return VDO_SUCCESS; +} diff --git a/vdo/vdo-component-states.h b/vdo/vdo-component-states.h new file mode 100644 index 00000000..9e49415f --- /dev/null +++ b/vdo/vdo-component-states.h @@ -0,0 +1,71 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright Red Hat + */ + +#ifndef VDO_COMPONENT_STATES_H +#define VDO_COMPONENT_STATES_H + +#include "block-map-format.h" +#include "recovery-journal-format.h" +#include "slab-depot-format.h" +#include "types.h" +#include "vdo-component.h" +#include "vdo-layout.h" + +/* + * The version of the on-disk format of a VDO volume. This should be + * incremented any time the on-disk representation of any VDO structure + * changes. Changes which require only online upgrade steps should increment + * the minor version. Changes which require an offline upgrade or which can not + * be upgraded to at all should increment the major version and set the minor + * version to 0. + */ +extern const struct version_number VDO_VOLUME_VERSION_67_0; + +/* + * The entirety of the component data encoded in the VDO super block. + */ +struct vdo_component_states { + /* The release version */ + release_version_number_t release_version; + + /* The VDO volume version */ + struct version_number volume_version; + + /* Components */ + struct vdo_component vdo; + struct block_map_state_2_0 block_map; + struct recovery_journal_state_7_0 recovery_journal; + struct slab_depot_state_2_0 slab_depot; + + /* Our partitioning of the underlying storage */ + struct fixed_layout *layout; +}; + +void vdo_destroy_component_states(struct vdo_component_states *states); + +int __must_check +vdo_decode_component_states(struct buffer *buffer, + release_version_number_t expected_release_version, + struct vdo_component_states *states); + +int __must_check +vdo_validate_component_states(struct vdo_component_states *states, + nonce_t geometry_nonce, + block_count_t physical_size, + block_count_t logical_size); + +/** + * vdo_encode() - Encode a VDO super block into a buffer for writing in the + * super block. + * @buffer: The buffer to encode into. + * @states: The states of the vdo to be encoded. 
+ */ +int __must_check +vdo_encode(struct buffer *buffer, struct vdo_component_states *states); + +int vdo_encode_component_states(struct buffer *buffer, + const struct vdo_component_states *states); + +#endif /* VDO_COMPONENT_STATES_H */ diff --git a/vdo/vdoComponent.c b/vdo/vdo-component.c similarity index 59% rename from vdo/vdoComponent.c rename to vdo/vdo-component.c index 7cf4c184..ac1ac1f7 100644 --- a/vdo/vdoComponent.c +++ b/vdo/vdo-component.c @@ -1,25 +1,9 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * Copyright Red Hat - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/sulfur/src/c++/vdo/base/vdoComponent.c#13 $ */ -#include "vdoComponent.h" +#include "vdo-component.h" #include "buffer.h" #include "logger.h" @@ -27,25 +11,25 @@ #include "constants.h" #include "header.h" -#include "numUtils.h" -#include "slabDepotFormat.h" -#include "statusCodes.h" +#include "num-utils.h" +#include "slab-depot-format.h" +#include "status-codes.h" #include "types.h" -/** +/* * The current version for the data encoded in the super block. This must * be changed any time there is a change to encoding of the component data * of any VDO component. - **/ + */ static const struct version_number VDO_COMPONENT_DATA_41_0 = { .major_version = 41, .minor_version = 0, }; -/** +/* * A packed, machine-independent, on-disk representation of the vdo_config * in the VDO component data in the super block. - **/ + */ struct packed_vdo_config { __le64 logical_blocks; __le64 physical_blocks; @@ -54,10 +38,10 @@ struct packed_vdo_config { __le64 slab_journal_blocks; } __packed; -/** +/* * A packed, machine-independent, on-disk representation of version 41.0 * of the VDO component data in the super block. - **/ + */ struct packed_vdo_component_41_0 { __le32 state; __le64 complete_recoveries; @@ -66,20 +50,25 @@ struct packed_vdo_component_41_0 { __le64 nonce; } __packed; -/**********************************************************************/ -size_t get_vdo_component_encoded_size(void) +/** + * vdo_get_component_encoded_size() - Get the size of the encoded state of the + * vdo itself. + * + * Return: The encoded size of the vdo's state. + */ +size_t vdo_get_component_encoded_size(void) { return (sizeof(struct packed_version_number) + sizeof(struct packed_vdo_component_41_0)); } /** - * Convert a vdo_config to its packed on-disk representation. - * - * @param config The vdo config to convert + * pack_vdo_config() - Convert a vdo_config to its packed on-disk + * representation. + * @config: The vdo config to convert. * - * @return the platform-independent representation of the config - **/ + * Return: The platform-independent representation of the config. 
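+ *
+ * Every field of the result is stored as a __le64 in struct
+ * packed_vdo_config (defined above), so the packed form does not depend on
+ * host byte order.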
+ */ static inline struct packed_vdo_config pack_vdo_config(struct vdo_config config) { @@ -95,12 +84,12 @@ pack_vdo_config(struct vdo_config config) } /** - * Convert a vdo_component to its packed on-disk representation. - * - * @param component The VDO component data to convert + * pack_vdo_component() - Convert a vdo_component to its packed on-disk + * representation. + * @component: The VDO component data to convert. * - * @return the platform-independent representation of the component - **/ + * Return: The platform-independent representation of the component. + */ static inline struct packed_vdo_component_41_0 pack_vdo_component(const struct vdo_component component) { @@ -115,13 +104,19 @@ pack_vdo_component(const struct vdo_component component) }; } -/**********************************************************************/ -int encode_vdo_component(struct vdo_component component, struct buffer *buffer) +/** + * vdo_encode_component() - Encode the component data for the vdo itself. + * @component: The component structure. + * @buffer: The buffer in which to encode the vdo. + * + * Return: VDO_SUCCESS or an error. + */ +int vdo_encode_component(struct vdo_component component, struct buffer *buffer) { int result; struct packed_vdo_component_41_0 packed; - result = encode_vdo_version_number(VDO_COMPONENT_DATA_41_0, buffer); + result = vdo_encode_version_number(VDO_COMPONENT_DATA_41_0, buffer); if (result != VDO_SUCCESS) { return result; } @@ -131,12 +126,12 @@ int encode_vdo_component(struct vdo_component component, struct buffer *buffer) } /** - * Convert a packed_vdo_config to its native in-memory representation. - * - * @param config The packed vdo config to convert + * unpack_vdo_config() - Convert a packed_vdo_config to its native in-memory + * representation. + * @config: The packed vdo config to convert. * - * @return the native in-memory representation of the vdo config - **/ + * Return: The native in-memory representation of the vdo config. + */ static inline struct vdo_config unpack_vdo_config(struct packed_vdo_config config) { @@ -152,12 +147,12 @@ unpack_vdo_config(struct packed_vdo_config config) } /** - * Convert a packed_vdo_component_41_0 to its native in-memory representation. - * - * @param component The packed vdo component data to convert + * unpack_vdo_component_41_0() - Convert a packed_vdo_component_41_0 to its + * native in-memory representation. + * @component: The packed vdo component data to convert. * - * @return the native in-memory representation of the component - **/ + * Return: The native in-memory representation of the component. + */ static inline struct vdo_component unpack_vdo_component_41_0(struct packed_vdo_component_41_0 component) { @@ -173,19 +168,20 @@ unpack_vdo_component_41_0(struct packed_vdo_component_41_0 component) } /** - * Decode the version 41.0 component data for the vdo itself from a buffer. - * - * @param buffer A buffer positioned at the start of the encoding - * @param component The component structure to receive the decoded values + * vdo_decode_component_41_0() - Decode the version 41.0 component data for + * the vdo itself from a buffer. + * @buffer: A buffer positioned at the start of the encoding. + * @component: The component structure to receive the decoded values. * - * @return VDO_SUCCESS or an error - **/ + * Return: VDO_SUCCESS or an error. 
+ */ static int __must_check -decode_vdo_component_41_0(struct buffer *buffer, +vdo_decode_component_41_0(struct buffer *buffer, struct vdo_component *component) { struct packed_vdo_component_41_0 packed; int result = get_bytes_from_buffer(buffer, sizeof(packed), &packed); + if (result != UDS_SUCCESS) { return result; } @@ -194,32 +190,50 @@ decode_vdo_component_41_0(struct buffer *buffer, return VDO_SUCCESS; } -/**********************************************************************/ -int decode_vdo_component(struct buffer *buffer, +/** + * vdo_decode_component() - Decode the component data for the vdo itself from + * the component data buffer in the super block. + * @buffer: The buffer being decoded. + * @component: The component structure in which to store the result of a + * successful decode. + * + * Return: VDO_SUCCESS or an error. + */ +int vdo_decode_component(struct buffer *buffer, struct vdo_component *component) { struct version_number version; - int result = decode_vdo_version_number(buffer, &version); + int result = vdo_decode_version_number(buffer, &version); + if (result != VDO_SUCCESS) { return result; } - result = validate_vdo_version(version, VDO_COMPONENT_DATA_41_0, + result = vdo_validate_version(version, VDO_COMPONENT_DATA_41_0, "VDO component data"); if (result != VDO_SUCCESS) { return result; } - return decode_vdo_component_41_0(buffer, component); + return vdo_decode_component_41_0(buffer, component); } -/**********************************************************************/ -int validate_vdo_config(const struct vdo_config *config, - block_count_t block_count, - bool require_logical) +/** + * vdo_validate_config() - Validate constraints on a VDO config. + * @config: The VDO config. + * @physical_block_count: The minimum block count of the underlying storage. + * @logical_block_count: The expected logical size of the VDO, or 0 if the + * logical size may be unspecified. + * + * Return: A success or error code. + */ +int vdo_validate_config(const struct vdo_config *config, + block_count_t physical_block_count, + block_count_t logical_block_count) { struct slab_config slab_config; int result = ASSERT(config->slab_size > 0, "slab size unspecified"); + if (result != UDS_SUCCESS) { return result; } @@ -249,7 +263,7 @@ int validate_vdo_config(const struct vdo_config *config, return result; } - result = configure_vdo_slab(config->slab_size, + result = vdo_configure_slab(config->slab_size, config->slab_journal_blocks, &slab_config); if (result != VDO_SUCCESS) { @@ -276,19 +290,26 @@ int validate_vdo_config(const struct vdo_config *config, return VDO_OUT_OF_RANGE; } - // This can't check equality because FileLayer et al can only known - // about the storage size, which may not match the super block size. 
- if (block_count < config->physical_blocks) { - uds_log_error("A physical size of %llu blocks was specified, but that is smaller than the %llu blocks configured in the vdo super block", - (unsigned long long) block_count, + if (physical_block_count != config->physical_blocks) { + uds_log_error("A physical size of %llu blocks was specified, not the %llu blocks configured in the vdo super block", + (unsigned long long) physical_block_count, (unsigned long long) config->physical_blocks); return VDO_PARAMETER_MISMATCH; } - result = ASSERT(!require_logical || (config->logical_blocks > 0), - "logical blocks unspecified"); - if (result != UDS_SUCCESS) { - return result; + if (logical_block_count > 0) { + result = ASSERT((config->logical_blocks > 0), + "logical blocks unspecified"); + if (result != UDS_SUCCESS) { + return result; + } + + if (logical_block_count != config->logical_blocks) { + uds_log_error("A logical size of %llu blocks was specified, but that differs from the %llu blocks configured in the vdo super block", + (unsigned long long) logical_block_count, + (unsigned long long) config->logical_blocks); + return VDO_PARAMETER_MISMATCH; + } } result = ASSERT(config->logical_blocks <= MAXIMUM_VDO_LOGICAL_BLOCKS, diff --git a/vdo/vdo-component.h b/vdo/vdo-component.h new file mode 100644 index 00000000..89f5c11f --- /dev/null +++ b/vdo/vdo-component.h @@ -0,0 +1,48 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright Red Hat + */ + +#ifndef VDO_COMPONENT_H +#define VDO_COMPONENT_H + +#include "buffer.h" + +#include "types.h" + +/* + * The configuration of the VDO service. + */ +struct vdo_config { + block_count_t logical_blocks; /* number of logical blocks */ + block_count_t physical_blocks; /* number of physical blocks */ + block_count_t slab_size; /* number of blocks in a slab */ + block_count_t recovery_journal_size; /* number of recovery journal blocks */ + block_count_t slab_journal_blocks; /* number of slab journal blocks */ +}; + +/* + * This is the structure that captures the vdo fields saved as a super block + * component. 
+ */ +struct vdo_component { + enum vdo_state state; + uint64_t complete_recoveries; + uint64_t read_only_recoveries; + struct vdo_config config; + nonce_t nonce; +}; + +size_t __must_check vdo_get_component_encoded_size(void); + +int __must_check +vdo_encode_component(struct vdo_component component, struct buffer *buffer); + +int __must_check +vdo_decode_component(struct buffer *buffer, struct vdo_component *component); + +int vdo_validate_config(const struct vdo_config *config, + block_count_t physical_block_count, + block_count_t logical_block_count); + +#endif /* VDO_COMPONENT_H */ diff --git a/vdo/vdo-layout.c b/vdo/vdo-layout.c new file mode 100644 index 00000000..30fc800c --- /dev/null +++ b/vdo/vdo-layout.c @@ -0,0 +1,1142 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright Red Hat + */ + +#include "vdo-layout.h" + +#include + +#include "memory-alloc.h" +#include "permassert.h" + +#include "constants.h" +#include "header.h" +#include "status-codes.h" +#include "types.h" + +#include "vdo.h" + +const block_count_t VDO_ALL_FREE_BLOCKS = (uint64_t) -1; + +struct fixed_layout { + physical_block_number_t first_free; + physical_block_number_t last_free; + size_t num_partitions; + struct partition *head; +}; + +struct partition { + enum partition_id id; /* The id of this partition */ + struct fixed_layout *layout; /* The layout to which this partition */ + /* belongs */ + physical_block_number_t offset; /* The offset into the layout of this */ + /* partition */ + physical_block_number_t base; /* The untranslated number of the first block */ + block_count_t count; /* The number of blocks in the partition */ + struct partition *next; /* A pointer to the next partition in the layout */ +}; + +struct layout_3_0 { + physical_block_number_t first_free; + physical_block_number_t last_free; + byte partition_count; +} __packed; + +struct partition_3_0 { + enum partition_id id; + physical_block_number_t offset; + physical_block_number_t base; + block_count_t count; +} __packed; + +static const struct header LAYOUT_HEADER_3_0 = { + .id = VDO_FIXED_LAYOUT, + .version = { + .major_version = 3, + .minor_version = 0, + }, + .size = sizeof(struct layout_3_0), /* Minimum size */ + /* (contains no partitions) */ +}; + +/** + * vdo_make_fixed_layout() - Make an unpartitioned fixed layout. + * @total_blocks: The total size of the layout, in blocks. + * @start_offset: The block offset in the underlying layer at which the fixed + * layout begins. + * @layout_ptr: The pointer to hold the resulting layout. + * + * Return: A success or error code. + */ +int vdo_make_fixed_layout(block_count_t total_blocks, + physical_block_number_t start_offset, + struct fixed_layout **layout_ptr) +{ + struct fixed_layout *layout; + int result = UDS_ALLOCATE(1, struct fixed_layout, "fixed layout", &layout); + + if (result != UDS_SUCCESS) { + return result; + } + + layout->first_free = start_offset; + layout->last_free = start_offset + total_blocks; + layout->num_partitions = 0; + layout->head = NULL; + + *layout_ptr = layout; + return VDO_SUCCESS; +} + +/** + * vdo_free_fixed_layout() - Free a fixed layout. + * @layout: The layout to free. + * + * All partitions created by this layout become invalid pointers. 
+ */ +void vdo_free_fixed_layout(struct fixed_layout *layout) +{ + if (layout == NULL) { + return; + } + + while (layout->head != NULL) { + struct partition *part = layout->head; + + layout->head = part->next; + UDS_FREE(part); + } + + UDS_FREE(layout); +} + +/** + * vdo_get_total_fixed_layout_size() - Get the total size of the layout in + * blocks. + * @layout: The layout. + * + * Return: The size of the layout. + */ +block_count_t vdo_get_total_fixed_layout_size(const struct fixed_layout *layout) +{ + block_count_t size = vdo_get_fixed_layout_blocks_available(layout); + struct partition *partition; + + for (partition = layout->head; partition != NULL; + partition = partition->next) { + size += partition->count; + } + + return size; +} + +/** + * vdo_get_fixed_layout_partition() - Get a partition by id. + * @layout: The layout from which to get a partition. + * @id: The id of the partition. + * @partition_ptr: A pointer to hold the partition. + * + * Return: VDO_SUCCESS or an error. + */ +int vdo_get_fixed_layout_partition(struct fixed_layout *layout, + enum partition_id id, + struct partition **partition_ptr) +{ + struct partition *partition; + + for (partition = layout->head; partition != NULL; + partition = partition->next) { + if (partition->id == id) { + if (partition_ptr != NULL) { + *partition_ptr = partition; + } + return VDO_SUCCESS; + } + } + + return VDO_UNKNOWN_PARTITION; +} + +/** + * vdo_translate_to_pbn() - Translate a block number from the partition's view + * to the layer's + * @partition: The partition to use for translation. + * @partition_block_number: The block number relative to the partition. + * @layer_block_number: The block number relative to the layer. + * + * Return: VDO_SUCCESS or an error code. + */ +int vdo_translate_to_pbn(const struct partition *partition, + physical_block_number_t partition_block_number, + physical_block_number_t *layer_block_number) +{ + physical_block_number_t offset_from_base; + + if (partition == NULL) { + *layer_block_number = partition_block_number; + return VDO_SUCCESS; + } + + if (partition_block_number < partition->base) { + return VDO_OUT_OF_RANGE; + } + + offset_from_base = partition_block_number - partition->base; + if (offset_from_base >= partition->count) { + return VDO_OUT_OF_RANGE; + } + + *layer_block_number = partition->offset + offset_from_base; + return VDO_SUCCESS; +} + +/** + * vdo_translate_from_pbn() - Translate a block number from the layer's view + * to the partition's. + * @partition: The partition to use for translation. + * @layer_block_number: The block number relative to the layer. + * @partition_block_number: The block number relative to the partition. + * + * This is the inverse of vdo_translate_to_pbn(). + * + * Return: VDO_SUCCESS or an error code. + */ +int vdo_translate_from_pbn(const struct partition *partition, + physical_block_number_t layer_block_number, + physical_block_number_t *partition_block_number_ptr) +{ + physical_block_number_t partition_block_number; + + if (partition == NULL) { + *partition_block_number_ptr = layer_block_number; + return VDO_SUCCESS; + } + + if (layer_block_number < partition->offset) { + return VDO_OUT_OF_RANGE; + } + + partition_block_number = layer_block_number - partition->offset; + if (partition_block_number >= partition->count) { + return VDO_OUT_OF_RANGE; + } + + *partition_block_number_ptr = partition_block_number + partition->base; + return VDO_SUCCESS; +} + +/** + * vdo_get_fixed_layout_blocks_available() - Return the number of unallocated + * blocks available. 
+ * @layout: The fixed layout. + * + * Return: The number of blocks yet unallocated to partitions. + */ +block_count_t +vdo_get_fixed_layout_blocks_available(const struct fixed_layout *layout) +{ + return layout->last_free - layout->first_free; +} + +/** + * allocate_partition() - Allocate a partition. + * @layout: The layout containing the partition. + * @id: The id of the partition. + * @offset: The offset into the layout at which the partition begins. + * @base: The number of the first block for users of the partition. + * @block_count: The number of blocks in the partition. + * + * The partition will be attached to the partition list in the layout. + * + * Return: VDO_SUCCESS or an error. + */ +static int allocate_partition(struct fixed_layout *layout, + byte id, + physical_block_number_t offset, + physical_block_number_t base, + block_count_t block_count) +{ + struct partition *partition; + int result = UDS_ALLOCATE(1, struct partition, + "fixed layout partition", &partition); + if (result != UDS_SUCCESS) { + return result; + } + + partition->id = id; + partition->layout = layout; + partition->offset = offset; + partition->base = base; + partition->count = block_count; + partition->next = layout->head; + layout->head = partition; + + return VDO_SUCCESS; +} + +/** + * vdo_make_fixed_layout_partition() - Create a new partition from the + * beginning or end of the unused space + * within a fixed layout. + * @layout: The fixed layout. + * @id: The id of the partition to make. + * @block_count: The number of blocks to carve out, if set to + * VDO_ALL_FREE_BLOCKS, all remaining blocks will be used. + * @direction: Whether to carve out from beginning or end. + * @base: The number of the first block in the partition from the point of + * view of its users. + * + * Return: A success or error code, particularly VDO_NO_SPACE if there are + * less than block_count blocks remaining. + */ +int vdo_make_fixed_layout_partition(struct fixed_layout *layout, + enum partition_id id, + block_count_t block_count, + enum partition_direction direction, + physical_block_number_t base) +{ + int result; + physical_block_number_t offset; + + block_count_t free_blocks = layout->last_free - layout->first_free; + + if (block_count == VDO_ALL_FREE_BLOCKS) { + if (free_blocks == 0) { + return VDO_NO_SPACE; + } else { + block_count = free_blocks; + } + } else if (block_count > free_blocks) { + return VDO_NO_SPACE; + } + + result = vdo_get_fixed_layout_partition(layout, id, NULL); + if (result != VDO_UNKNOWN_PARTITION) { + return VDO_PARTITION_EXISTS; + } + + offset = ((direction == VDO_PARTITION_FROM_END) ? + (layout->last_free - block_count) : layout->first_free); + result = allocate_partition(layout, id, offset, base, block_count); + if (result != VDO_SUCCESS) { + return result; + } + + layout->num_partitions++; + if (direction == VDO_PARTITION_FROM_END) { + layout->last_free = layout->last_free - block_count; + } else { + layout->first_free += block_count; + } + + return VDO_SUCCESS; +} + +/** + * vdo_get_fixed_layout_partition_size() - Return the size in blocks of a + * partition. + * @partition: A partition of the fixed_layout. + * + * Return: The size of the partition in blocks. + */ +block_count_t +vdo_get_fixed_layout_partition_size(const struct partition *partition) +{ + return partition->count; +} + +/** + * vdo_get_fixed_layout_partition_offset() - Get the first block of the + * partition in the layout. + * @partition: A partition of the fixed_layout. + * + * Return: The partition's offset in blocks. 
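+ *
+ * Note that the offset is expressed in the underlying layer's block number
+ * space; it is distinct from the base returned by
+ * vdo_get_fixed_layout_partition_base(), which is the first block number as
+ * seen by the partition's users. vdo_translate_to_pbn() maps a
+ * partition-relative block number pbn to (offset + (pbn - base)).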
+ */ +physical_block_number_t +vdo_get_fixed_layout_partition_offset(const struct partition *partition) +{ + return partition->offset; +} + +/** + * vdo_get_fixed_layout_partition_base() - Get the number of the first block + * in the partition from the partition + * user's point of view. + * @partition: A partition of the fixed_layout. + * + * Return: The number of the first block in the partition. + */ +physical_block_number_t +vdo_get_fixed_layout_partition_base(const struct partition *partition) +{ + return partition->base; +} + +/** + * get_encoded_size() - Get the size of an encoded layout + * @layout: The layout. + * + * Return: The encoded size of the layout. + */ +static inline size_t get_encoded_size(const struct fixed_layout *layout) +{ + return sizeof(struct layout_3_0) + + (sizeof(struct partition_3_0) * layout->num_partitions); +} + +size_t vdo_get_fixed_layout_encoded_size(const struct fixed_layout *layout) +{ + return VDO_ENCODED_HEADER_SIZE + get_encoded_size(layout); +} + +/** + * encode_partitions_3_0() - Encode a null-terminated list of fixed layout + * partitions into a buffer using partition format + * 3.0. + * @layout: The layout containing the list of partitions to encode. + * @buffer: A buffer positioned at the start of the encoding. + * + * Return: UDS_SUCCESS or an error code. + */ +static int encode_partitions_3_0(const struct fixed_layout *layout, + struct buffer *buffer) +{ + const struct partition *partition; + + for (partition = layout->head; + partition != NULL; + partition = partition->next) { + int result; + + STATIC_ASSERT_SIZEOF(enum partition_id, sizeof(byte)); + result = put_byte(buffer, partition->id); + if (result != UDS_SUCCESS) { + return result; + } + + result = put_uint64_le_into_buffer(buffer, partition->offset); + if (result != UDS_SUCCESS) { + return result; + } + + result = put_uint64_le_into_buffer(buffer, partition->base); + if (result != UDS_SUCCESS) { + return result; + } + + result = put_uint64_le_into_buffer(buffer, partition->count); + if (result != UDS_SUCCESS) { + return result; + } + } + + return UDS_SUCCESS; +} + +/** + * encode_layout_3_0() - Encode the header fields of a fixed layout into a + * buffer using layout format 3.0. + * @layout: The layout to encode. + * @buffer: A buffer positioned at the start of the encoding. + * + * Return: UDS_SUCCESS or an error code. + */ +static int encode_layout_3_0(const struct fixed_layout *layout, + struct buffer *buffer) +{ + int result = ASSERT(layout->num_partitions <= UINT8_MAX, + "fixed layout partition count must fit in a byte"); + if (result != UDS_SUCCESS) { + return result; + } + + result = put_uint64_le_into_buffer(buffer, layout->first_free); + if (result != UDS_SUCCESS) { + return result; + } + + result = put_uint64_le_into_buffer(buffer, layout->last_free); + if (result != UDS_SUCCESS) { + return result; + } + + return put_byte(buffer, layout->num_partitions); +} + +/** + * vdo_encode_fixed_layout() - Encode a layout into a buffer. + * @layout: The layout to encode. + * @buffer: The buffer to encode into. + * + * Return: UDS_SUCCESS or an error. 
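+ *
+ * The resulting format 3.0 encoding is, in order: a vdo header whose size
+ * field covers the payload, the layout_3_0 fields (first_free and last_free
+ * as little-endian 64-bit values, then the partition count as a single
+ * byte), and one partition_3_0 record per partition (id byte, then offset,
+ * base, and count as little-endian 64-bit values).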
+ */ +int vdo_encode_fixed_layout(const struct fixed_layout *layout, + struct buffer *buffer) +{ + size_t initial_length, encoded_size; + int result; + + struct header header = LAYOUT_HEADER_3_0; + + if (!ensure_available_space(buffer, + vdo_get_fixed_layout_encoded_size(layout))) { + return UDS_BUFFER_ERROR; + } + + header.size = get_encoded_size(layout); + result = vdo_encode_header(&header, buffer); + if (result != UDS_SUCCESS) { + return result; + } + + initial_length = content_length(buffer); + + result = encode_layout_3_0(layout, buffer); + if (result != UDS_SUCCESS) { + return result; + } + + encoded_size = content_length(buffer) - initial_length; + result = ASSERT(encoded_size == sizeof(struct layout_3_0), + "encoded size of fixed layout header must match structure"); + if (result != UDS_SUCCESS) { + return result; + } + + result = encode_partitions_3_0(layout, buffer); + if (result != UDS_SUCCESS) { + return result; + } + + encoded_size = content_length(buffer) - initial_length; + return ASSERT(encoded_size == header.size, + "encoded size of fixed layout must match header size"); +} + +/** + * decode_partitions_3_0() - Decode a sequence of fixed layout partitions from + * a buffer using partition format 3.0. + * @buffer: A buffer positioned at the start of the encoding. + * @layout: The layout in which to allocate the decoded partitions. + * + * Return: UDS_SUCCESS or an error code. + */ +static int decode_partitions_3_0(struct buffer *buffer, + struct fixed_layout *layout) +{ + size_t i; + + for (i = 0; i < layout->num_partitions; i++) { + byte id; + uint64_t offset, base, count; + int result = get_byte(buffer, &id); + + if (result != UDS_SUCCESS) { + return result; + } + + result = get_uint64_le_from_buffer(buffer, &offset); + if (result != UDS_SUCCESS) { + return result; + } + + result = get_uint64_le_from_buffer(buffer, &base); + if (result != UDS_SUCCESS) { + return result; + } + + result = get_uint64_le_from_buffer(buffer, &count); + if (result != UDS_SUCCESS) { + return result; + } + + result = allocate_partition(layout, id, offset, base, count); + if (result != VDO_SUCCESS) { + return result; + } + } + + return UDS_SUCCESS; +} + +/** + * decode_layout_3_0() - Decode the header fields of a fixed layout from a + * buffer using layout format 3.0. + * @buffer: A buffer positioned at the start of the encoding. + * @layout: The structure to receive the decoded fields. + * + * Return: UDS_SUCCESS or an error code. + */ +static int decode_layout_3_0(struct buffer *buffer, struct layout_3_0 *layout) +{ + size_t decoded_size, initial_length = content_length(buffer); + physical_block_number_t first_free, last_free; + byte partition_count; + + int result = get_uint64_le_from_buffer(buffer, &first_free); + + if (result != UDS_SUCCESS) { + return result; + } + + result = get_uint64_le_from_buffer(buffer, &last_free); + if (result != UDS_SUCCESS) { + return result; + } + + result = get_byte(buffer, &partition_count); + if (result != UDS_SUCCESS) { + return result; + } + + *layout = (struct layout_3_0) { + .first_free = first_free, + .last_free = last_free, + .partition_count = partition_count, + }; + + decoded_size = initial_length - content_length(buffer); + return ASSERT(decoded_size == sizeof(struct layout_3_0), + "decoded size of fixed layout header must match structure"); +} + +/** + * vdo_decode_fixed_layout() - Decode a fixed layout from a buffer. + * @buffer: The buffer from which to decode. + * @layout_ptr: A pointer to hold the layout. + * + * Return: VDO_SUCCESS or an error. 
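+ *
+ * For illustration, a minimal decode-and-release sequence a caller might
+ * use (error handling beyond the decode itself is elided):
+ *
+ *     struct fixed_layout *layout = NULL;
+ *     int result = vdo_decode_fixed_layout(buffer, &layout);
+ *
+ *     if (result != VDO_SUCCESS)
+ *             return result;
+ *     ... look up partitions with vdo_get_fixed_layout_partition() ...
+ *     vdo_free_fixed_layout(layout);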
+ */ +int vdo_decode_fixed_layout(struct buffer *buffer, + struct fixed_layout **layout_ptr) +{ + struct header header; + struct layout_3_0 layout_header; + struct fixed_layout *layout; + + int result = vdo_decode_header(buffer, &header); + + if (result != UDS_SUCCESS) { + return result; + } + + /* Layout is variable size, so only do a minimum size check here. */ + result = vdo_validate_header(&LAYOUT_HEADER_3_0, &header, false, __func__); + if (result != VDO_SUCCESS) { + return result; + } + + result = decode_layout_3_0(buffer, &layout_header); + if (result != UDS_SUCCESS) { + return result; + } + + if (content_length(buffer) < + (sizeof(struct partition_3_0) * layout_header.partition_count)) { + return VDO_UNSUPPORTED_VERSION; + } + + result = UDS_ALLOCATE(1, struct fixed_layout, "fixed layout", &layout); + if (result != UDS_SUCCESS) { + return result; + } + + layout->first_free = layout_header.first_free; + layout->last_free = layout_header.last_free; + layout->num_partitions = layout_header.partition_count; + + result = decode_partitions_3_0(buffer, layout); + if (result != VDO_SUCCESS) { + vdo_free_fixed_layout(layout); + return result; + } + + *layout_ptr = layout; + return VDO_SUCCESS; +} + +/** + * vdo_make_partitioned_fixed_layout() - Make a partitioned fixed layout for a + * VDO. + * @physical_blocks: The number of physical blocks in the VDO. + * @starting_offset: The starting offset of the layout. + * @block_map_blocks: The size of the block map partition. + * @journal_blocks: The size of the journal partition. + * @summary_blocks: The size of the slab summary partition. + * @layout_ptr: A pointer to hold the new fixed_layout. + * + * Return: VDO_SUCCESS or an error. + */ +int vdo_make_partitioned_fixed_layout(block_count_t physical_blocks, + physical_block_number_t starting_offset, + block_count_t block_map_blocks, + block_count_t journal_blocks, + block_count_t summary_blocks, + struct fixed_layout **layout_ptr) +{ + struct fixed_layout *layout; + int result; + + block_count_t necessary_size = (starting_offset + block_map_blocks + + journal_blocks + summary_blocks); + if (necessary_size > physical_blocks) { + return uds_log_error_strerror(VDO_NO_SPACE, + "Not enough space to make a VDO"); + } + + result = vdo_make_fixed_layout(physical_blocks - starting_offset, + starting_offset, + &layout); + if (result != VDO_SUCCESS) { + return result; + } + + result = vdo_make_fixed_layout_partition(layout, + VDO_BLOCK_MAP_PARTITION, + block_map_blocks, + VDO_PARTITION_FROM_BEGINNING, + 0); + if (result != VDO_SUCCESS) { + vdo_free_fixed_layout(layout); + return result; + } + + result = vdo_make_fixed_layout_partition(layout, + VDO_SLAB_SUMMARY_PARTITION, + summary_blocks, + VDO_PARTITION_FROM_END, 0); + if (result != VDO_SUCCESS) { + vdo_free_fixed_layout(layout); + return result; + } + + result = vdo_make_fixed_layout_partition(layout, + VDO_RECOVERY_JOURNAL_PARTITION, + journal_blocks, + VDO_PARTITION_FROM_END, 0); + if (result != VDO_SUCCESS) { + vdo_free_fixed_layout(layout); + return result; + } + + /* + * The block allocator no longer traffics in relative PBNs so the offset + * doesn't matter. We need to keep this partition around both for + * upgraded systems, and because we decided that all of the usable space + * in the volume, other than the super block, should be part of some + * partition. 
+ */ + result = vdo_make_fixed_layout_partition(layout, + VDO_BLOCK_ALLOCATOR_PARTITION, + VDO_ALL_FREE_BLOCKS, + VDO_PARTITION_FROM_BEGINNING, + block_map_blocks); + if (result != VDO_SUCCESS) { + vdo_free_fixed_layout(layout); + return result; + } + + *layout_ptr = layout; + return VDO_SUCCESS; +} + +/*-----------------------------------------------------------------*/ +static const enum partition_id REQUIRED_PARTITIONS[] = { + VDO_BLOCK_MAP_PARTITION, + VDO_BLOCK_ALLOCATOR_PARTITION, + VDO_RECOVERY_JOURNAL_PARTITION, + VDO_SLAB_SUMMARY_PARTITION, +}; + +static const uint8_t REQUIRED_PARTITION_COUNT = 4; + +/** + * get_partition_offset() - Get the offset of a given partition. + * @layout: The layout containing the partition. + * @id: The ID of the partition whose offset is desired. + * + * Return: The offset of the partition (in blocks). + */ +static block_count_t __must_check +get_partition_offset(struct vdo_layout *layout, enum partition_id id) +{ + return vdo_get_fixed_layout_partition_offset(vdo_get_partition(layout, + id)); +} + +/** + * vdo_decode_layout() - Make a vdo_layout from the fixed_layout decoded from + * the super block. + * @layout: The fixed_layout from the super block. + * @vdo_layout_ptr: A pointer to hold the vdo_layout. + * + * Return: VDO_SUCCESS or an error. + */ +int vdo_decode_layout(struct fixed_layout *layout, + struct vdo_layout **vdo_layout_ptr) +{ + /* Check that all the expected partitions exist */ + struct vdo_layout *vdo_layout; + struct partition *partition; + uint8_t i; + int result; + + for (i = 0; i < REQUIRED_PARTITION_COUNT; i++) { + result = vdo_get_fixed_layout_partition(layout, + REQUIRED_PARTITIONS[i], + &partition); + if (result != VDO_SUCCESS) { + return uds_log_error_strerror(result, + "VDO layout is missing required partition %u", + REQUIRED_PARTITIONS[i]); + } + } + + result = UDS_ALLOCATE(1, struct vdo_layout, __func__, &vdo_layout); + if (result != VDO_SUCCESS) { + return result; + } + + vdo_layout->layout = layout; + + /* XXX Assert this is the same as where we loaded the super block. */ + vdo_layout->starting_offset = + get_partition_offset(vdo_layout, VDO_BLOCK_MAP_PARTITION); + + *vdo_layout_ptr = vdo_layout; + return VDO_SUCCESS; +} + +/** + * vdo_free_layout() - Free a vdo_layout. + * @vdo_layout: The vdo_layout to free. + */ +void vdo_free_layout(struct vdo_layout *vdo_layout) +{ + if (vdo_layout == NULL) { + return; + } + + if (vdo_layout->copier) { + dm_kcopyd_client_destroy(UDS_FORGET(vdo_layout->copier)); + } + vdo_free_fixed_layout(UDS_FORGET(vdo_layout->next_layout)); + vdo_free_fixed_layout(UDS_FORGET(vdo_layout->layout)); + vdo_free_fixed_layout(UDS_FORGET(vdo_layout->previous_layout)); + UDS_FREE(vdo_layout); +} + +/** + * retrieve_partition() - Get a partition from a fixed_layout in conditions + * where we expect that it can not fail. + * @layout: The fixed_layout from which to get the partition. + * @id: The ID of the partition to retrieve. + * + * Return: The desired partition. + */ +static struct partition * __must_check +retrieve_partition(struct fixed_layout *layout, enum partition_id id) +{ + struct partition *partition; + int result = vdo_get_fixed_layout_partition(layout, id, &partition); + + ASSERT_LOG_ONLY(result == VDO_SUCCESS, + "vdo_layout has expected partition"); + return partition; +} + +/** + * vdo_get_partition() - Get a partition from a vdo_layout. + * @vdo_layout: The vdo_layout from which to get the partition. + * @id: The ID of the desired partition. 
+ * + * Because the layout's fixed_layout has already been validated, this can not + * fail. + * + * Return: The requested partition. + */ +struct partition *vdo_get_partition(struct vdo_layout *vdo_layout, + enum partition_id id) +{ + return retrieve_partition(vdo_layout->layout, id); +} + +/** + * get_partition_from_next_layout() - Get a partition from a vdo_layout's next + * fixed_layout. + * @vdo_layout: The vdo_layout from which to get the partition. + * @id: The ID of the desired partition. + * + * This method should only be called when the vdo_layout is prepared to grow. + * + * Return: The requested partition. + */ +static struct partition * __must_check +get_partition_from_next_layout(struct vdo_layout *vdo_layout, + enum partition_id id) +{ + ASSERT_LOG_ONLY(vdo_layout->next_layout != NULL, + "vdo_layout is prepared to grow"); + return retrieve_partition(vdo_layout->next_layout, id); +} + +/** + * get_partition_size() - Get the size of a given partition. + * @layout: The layout containing the partition. + * @id: The partition ID whose size to find. + * + * Return: The size of the partition (in blocks). + */ +static block_count_t __must_check +get_partition_size(struct vdo_layout *layout, enum partition_id id) +{ + struct partition *partition = vdo_get_partition(layout, id); + + return vdo_get_fixed_layout_partition_size(partition); +} + +/** + * prepare_to_vdo_grow_layout() - Prepare the layout to be grown. + * @vdo_layout: The layout to grow. + * @old_physical_blocks: The current size of the VDO. + * @new_physical_blocks: The size to which the VDO will be grown. + * + * Return: VDO_SUCCESS or an error code. + */ +int prepare_to_vdo_grow_layout(struct vdo_layout *vdo_layout, + block_count_t old_physical_blocks, + block_count_t new_physical_blocks) +{ + int result; + struct partition *slab_summary_partition, *recovery_journal_partition; + block_count_t min_new_size; + + if (vdo_get_next_layout_size(vdo_layout) == new_physical_blocks) { + /* + * We are already prepared to grow to the new size, so we're + * done. + */ + return VDO_SUCCESS; + } + + /* Make a copy completion if there isn't one */ + if (vdo_layout->copier == NULL) { + vdo_layout->copier = dm_kcopyd_client_create(NULL); + if (vdo_layout->copier == NULL) { + return -ENOMEM; + } + } + + /* Free any unused preparation. */ + vdo_free_fixed_layout(UDS_FORGET(vdo_layout->next_layout)); + + /* + * Make a new layout with the existing partition sizes for everything + * but the block allocator partition. + */ + result = vdo_make_partitioned_fixed_layout(new_physical_blocks, + vdo_layout->starting_offset, + get_partition_size(vdo_layout, + VDO_BLOCK_MAP_PARTITION), + get_partition_size(vdo_layout, + VDO_RECOVERY_JOURNAL_PARTITION), + get_partition_size(vdo_layout, + VDO_SLAB_SUMMARY_PARTITION), + &vdo_layout->next_layout); + if (result != VDO_SUCCESS) { + dm_kcopyd_client_destroy(UDS_FORGET(vdo_layout->copier)); + return result; + } + + /* + * Ensure the new journal and summary are entirely within the added + * blocks. + */ + slab_summary_partition = + get_partition_from_next_layout(vdo_layout, + VDO_SLAB_SUMMARY_PARTITION); + recovery_journal_partition = + get_partition_from_next_layout(vdo_layout, + VDO_RECOVERY_JOURNAL_PARTITION); + min_new_size = + (old_physical_blocks + + vdo_get_fixed_layout_partition_size(slab_summary_partition) + + vdo_get_fixed_layout_partition_size(recovery_journal_partition)); + if (min_new_size > new_physical_blocks) { + /* + * Copying the journal and summary would destroy some old + * metadata. 
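+ * In other words, the grow must add at least enough new blocks beyond the
+ * old physical size to hold fresh copies of the slab summary and recovery
+ * journal partitions; otherwise the copies would land on blocks still in
+ * use by the current layout.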
+ */ + vdo_free_fixed_layout(UDS_FORGET(vdo_layout->next_layout)); + dm_kcopyd_client_destroy(UDS_FORGET(vdo_layout->copier)); + return VDO_INCREMENT_TOO_SMALL; + } + + return VDO_SUCCESS; +} + +/** + * get_vdo_size() - Get the size of a VDO from the specified fixed_layout and + * the starting offset thereof. + * @layout: The fixed layout whose size to use. + * @starting_offset: The starting offset of the layout. + * + * Return: The total size of a VDO (in blocks) with the given layout. + */ +static block_count_t __must_check +get_vdo_size(struct fixed_layout *layout, block_count_t starting_offset) +{ + /* + * The fixed_layout does not include the super block or any earlier + * metadata; all that is captured in the vdo_layout's starting offset + */ + return vdo_get_total_fixed_layout_size(layout) + starting_offset; +} + +/** + * vdo_get_next_layout_size() - Get the size of the next layout. + * @vdo_layout: The layout to check. + * + * Return: The size which was specified when the layout was prepared for + * growth or 0 if the layout is not prepared to grow. + */ +block_count_t vdo_get_next_layout_size(struct vdo_layout *vdo_layout) +{ + return ((vdo_layout->next_layout == NULL) ? + 0 : + get_vdo_size(vdo_layout->next_layout, + vdo_layout->starting_offset)); +} + +/** + * vdo_get_next_block_allocator_partition_size() - Get the size of the next + * block allocator partition. + * @vdo_layout: The vdo_layout which has been prepared to grow. + * + * Return: The size of the block allocator partition in the next layout or 0 + * if the layout is not prepared to grow. + */ +block_count_t +vdo_get_next_block_allocator_partition_size(struct vdo_layout *vdo_layout) +{ + struct partition *partition; + + if (vdo_layout->next_layout == NULL) { + return 0; + } + + partition = get_partition_from_next_layout(vdo_layout, + VDO_BLOCK_ALLOCATOR_PARTITION); + return vdo_get_fixed_layout_partition_size(partition); +} + +/** + * vdo_grow_layout() - Grow the layout by swapping in the prepared layout. + * @vdo_layout: The layout to grow. + * + * Return: The new size of the VDO. + */ +block_count_t vdo_grow_layout(struct vdo_layout *vdo_layout) +{ + ASSERT_LOG_ONLY(vdo_layout->next_layout != NULL, + "VDO prepared to grow physical"); + vdo_layout->previous_layout = vdo_layout->layout; + vdo_layout->layout = vdo_layout->next_layout; + vdo_layout->next_layout = NULL; + + return get_vdo_size(vdo_layout->layout, vdo_layout->starting_offset); +} + +/** + * vdo_finish_layout_growth() - Clean up any unused resources once an attempt + * to grow has completed. + * @vdo_layout: The layout. + */ +void vdo_finish_layout_growth(struct vdo_layout *vdo_layout) +{ + if (vdo_layout->layout != vdo_layout->previous_layout) { + vdo_free_fixed_layout(UDS_FORGET(vdo_layout->previous_layout)); + } + + if (vdo_layout->layout != vdo_layout->next_layout) { + vdo_free_fixed_layout(UDS_FORGET(vdo_layout->next_layout)); + } +} + +static void copy_callback(int read_err, unsigned long write_err, void *context) +{ + struct vdo_completion *completion = context; + int result = (((read_err == 0) && (write_err == 0)) + ? 
VDO_SUCCESS : -EIO ); + vdo_finish_completion(completion, result); +} + +static int partition_to_region(struct partition *partition, + struct vdo *vdo, + struct dm_io_region *region) +{ + block_count_t blocks + = vdo_get_fixed_layout_partition_size(partition); + physical_block_number_t pbn; + + int result = vdo_translate_to_pbn(partition, 0, &pbn); + + if (result != VDO_SUCCESS) { + return result; + } + + pbn -= vdo->geometry.bio_offset; + + *region = (struct dm_io_region) { + .bdev = vdo_get_backing_device(vdo), + .sector = pbn * VDO_SECTORS_PER_BLOCK, + .count = blocks * VDO_SECTORS_PER_BLOCK, + }; + return VDO_SUCCESS; +} + +/** + * vdo_copy_layout_partition() - Copy a partition from the location specified + * in the current layout to that in the next + * layout. + * @layout: The vdo_layout which is prepared to grow. + * @id: The ID of the partition to copy. + * @parent: The completion to notify when the copy is complete. + */ +void vdo_copy_layout_partition(struct vdo_layout *layout, + enum partition_id id, + struct vdo_completion *parent) +{ + struct vdo *vdo = parent->vdo; + struct dm_io_region read_region, write_regions[1]; + int result = VDO_SUCCESS; + + struct partition *from = vdo_get_partition(layout, id); + struct partition *to = get_partition_from_next_layout(layout, id); + + result = partition_to_region(from, vdo, &read_region); + if (result != VDO_SUCCESS) { + vdo_finish_completion(parent, result); + return; + } + + result = partition_to_region(to, vdo, &write_regions[0]); + if (result != VDO_SUCCESS) { + vdo_finish_completion(parent, result); + return; + } + + dm_kcopyd_copy(layout->copier, &read_region, 1, write_regions, 0, + copy_callback, parent); } + +/** + * vdo_get_fixed_layout() - Get the current fixed layout of the vdo. + * @vdo_layout: The layout. + * + * Return: The layout's current fixed layout. + */ +struct fixed_layout *vdo_get_fixed_layout(const struct vdo_layout *vdo_layout) +{ + return vdo_layout->layout; +} diff --git a/vdo/vdo-layout.h b/vdo/vdo-layout.h new file mode 100644 index 00000000..0e59f141 --- /dev/null +++ b/vdo/vdo-layout.h @@ -0,0 +1,149 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright Red Hat + */ + +/** + * DOC: VDO Layout. + * + * vdo_layout is an object which manages the layout of a VDO. It wraps + * fixed_layout, but includes the knowledge of exactly which partitions a VDO + * is expected to have. Because of this knowledge, the vdo_layout validates + * the fixed_layout encoded in the super block at load time, obviating the + * need for subsequent error checking when other modules need to get + * partitions from the layout. + * + * The vdo_layout also manages the preparation and growth of the layout for + * grow physical operations. + */ + +#ifndef VDO_LAYOUT_H +#define VDO_LAYOUT_H + +#include "buffer.h" + +#include "kernel-types.h" + +#include "types.h" + +enum partition_direction { + VDO_PARTITION_FROM_BEGINNING, + VDO_PARTITION_FROM_END, +}; + +extern const block_count_t VDO_ALL_FREE_BLOCKS; + +/* + * A fixed layout is like a traditional disk partitioning scheme. In the + * beginning there is one large unused area, of which parts are carved off. + * Each carved off section has its own internal offset and size. 
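+ *
+ * For illustration only, a sketch of carving a partition from a freshly
+ * made layout (the sizes here are hypothetical; the real VDO layout is
+ * built by vdo_make_partitioned_fixed_layout()):
+ *
+ *     struct fixed_layout *layout;
+ *     int result = vdo_make_fixed_layout(1024, 0, &layout);
+ *
+ *     if (result != VDO_SUCCESS)
+ *             return result;
+ *
+ *     result = vdo_make_fixed_layout_partition(layout,
+ *                                              VDO_BLOCK_MAP_PARTITION,
+ *                                              128,
+ *                                              VDO_PARTITION_FROM_BEGINNING,
+ *                                              0);
+ *
+ * Passing VDO_ALL_FREE_BLOCKS as the block count claims whatever space
+ * remains unallocated.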
+ */ +struct fixed_layout; +struct partition; + +int __must_check vdo_make_fixed_layout(block_count_t total_blocks, + physical_block_number_t start_offset, + struct fixed_layout **layout_ptr); + +void vdo_free_fixed_layout(struct fixed_layout *layout); + +block_count_t __must_check +vdo_get_total_fixed_layout_size(const struct fixed_layout *layout); + +int __must_check +vdo_get_fixed_layout_partition(struct fixed_layout *layout, + enum partition_id id, + struct partition **partition_ptr); + +int __must_check +vdo_translate_to_pbn(const struct partition *partition, + physical_block_number_t partition_block_number, + physical_block_number_t *layer_block_number); + +int __must_check +vdo_translate_from_pbn(const struct partition *partition, + physical_block_number_t layer_block_number, + physical_block_number_t *partition_block_number); + +block_count_t __must_check +vdo_get_fixed_layout_blocks_available(const struct fixed_layout *layout); + +int __must_check +vdo_make_fixed_layout_partition(struct fixed_layout *layout, + enum partition_id id, + block_count_t block_count, + enum partition_direction direction, + physical_block_number_t base); + +block_count_t __must_check +vdo_get_fixed_layout_partition_size(const struct partition *partition); + +physical_block_number_t __must_check +vdo_get_fixed_layout_partition_offset(const struct partition *partition); + +physical_block_number_t __must_check +vdo_get_fixed_layout_partition_base(const struct partition *partition); + +size_t __must_check +vdo_get_fixed_layout_encoded_size(const struct fixed_layout *layout); + +int __must_check +vdo_encode_fixed_layout(const struct fixed_layout *layout, struct buffer *buffer); + +int __must_check +vdo_decode_fixed_layout(struct buffer *buffer, struct fixed_layout **layout_ptr); + +int __must_check +vdo_make_partitioned_fixed_layout(block_count_t physical_blocks, + physical_block_number_t starting_offset, + block_count_t block_map_blocks, + block_count_t journal_blocks, + block_count_t summary_blocks, + struct fixed_layout **layout_ptr); + +/*-----------------------------------------------------------------*/ + +struct vdo_layout { + /* The current layout of the VDO */ + struct fixed_layout *layout; + /* The next layout of the VDO */ + struct fixed_layout *next_layout; + /* The previous layout of the VDO */ + struct fixed_layout *previous_layout; + /* The first block in the layouts */ + physical_block_number_t starting_offset; + /* A pointer to the copy completion (if there is one) */ + struct dm_kcopyd_client *copier; +}; + +int __must_check vdo_decode_layout(struct fixed_layout *layout, + struct vdo_layout **vdo_layout_ptr); + +void vdo_free_layout(struct vdo_layout *vdo_layout); + +struct partition * __must_check +vdo_get_partition(struct vdo_layout *vdo_layout, enum partition_id id); + +int __must_check +prepare_to_vdo_grow_layout(struct vdo_layout *vdo_layout, + block_count_t old_physical_blocks, + block_count_t new_physical_blocks); + +block_count_t __must_check +vdo_get_next_layout_size(struct vdo_layout *vdo_layout); + +block_count_t __must_check +vdo_get_next_block_allocator_partition_size(struct vdo_layout *vdo_layout); + +block_count_t __must_check vdo_grow_layout(struct vdo_layout *vdo_layout); + +void vdo_finish_layout_growth(struct vdo_layout *vdo_layout); + +void vdo_copy_layout_partition(struct vdo_layout *layout, + enum partition_id id, + struct vdo_completion *parent); + +struct fixed_layout * __must_check +vdo_get_fixed_layout(const struct vdo_layout *vdo_layout); + +#endif /* VDO_LAYOUT_H 
*/ diff --git a/vdo/vdo-load.c b/vdo/vdo-load.c new file mode 100644 index 00000000..c8d9d51e --- /dev/null +++ b/vdo/vdo-load.c @@ -0,0 +1,618 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright Red Hat + */ + +#include "vdo-load.h" + +#include "logger.h" +#include "memory-alloc.h" + +#include "admin-completion.h" +#include "block-map.h" +#include "completion.h" +#include "constants.h" +#include "dedupe-index.h" +#include "device-config.h" +#include "hash-zone.h" +#include "header.h" +#include "kernel-types.h" +#include "logical-zone.h" +#include "physical-zone.h" +#include "pool-sysfs.h" +#include "read-only-rebuild.h" +#include "recovery-journal.h" +#include "release-versions.h" +#include "slab-depot.h" +#include "slab-summary.h" +#include "super-block-codec.h" +#include "thread-config.h" +#include "types.h" +#include "vdo.h" +#include "vdo-recovery.h" +#include "vdo-suspend.h" + +enum { + LOAD_PHASE_START, + LOAD_PHASE_STATS, + LOAD_PHASE_LOAD_DEPOT, + LOAD_PHASE_MAKE_DIRTY, + LOAD_PHASE_PREPARE_TO_ALLOCATE, + LOAD_PHASE_SCRUB_SLABS, + LOAD_PHASE_DATA_REDUCTION, + LOAD_PHASE_FINISHED, + LOAD_PHASE_DRAIN_JOURNAL, + LOAD_PHASE_WAIT_FOR_READ_ONLY, +}; + +static const char *LOAD_PHASE_NAMES[] = { + "LOAD_PHASE_START", + "LOAD_PHASE_STATS", + "LOAD_PHASE_LOAD_DEPOT", + "LOAD_PHASE_MAKE_DIRTY", + "LOAD_PHASE_PREPARE_TO_ALLOCATE", + "LOAD_PHASE_SCRUB_SLABS", + "LOAD_PHASE_DATA_REDUCTION", + "LOAD_PHASE_FINISHED", + "LOAD_PHASE_DRAIN_JOURNAL", + "LOAD_PHASE_WAIT_FOR_READ_ONLY", +}; + + +/** + * get_thread_id_for_phase() - Implements vdo_thread_id_getter_for_phase. + */ +static thread_id_t __must_check +get_thread_id_for_phase(struct admin_completion *admin_completion) +{ + const struct thread_config *thread_config = + admin_completion->vdo->thread_config; + switch (admin_completion->phase) { + case LOAD_PHASE_DRAIN_JOURNAL: + return thread_config->journal_thread; + + default: + return thread_config->admin_thread; + } +} + +/** + * vdo_from_load_sub_task() - Extract the vdo from an admin_completion, + * checking that the current operation is a load. + * @completion: The admin_completion's sub-task completion. + * + * Return: The vdo. + */ +static inline struct vdo * +vdo_from_load_sub_task(struct vdo_completion *completion) +{ + return vdo_from_admin_sub_task(completion, VDO_ADMIN_OPERATION_LOAD); +} + +/** + * was_new() - Check whether the vdo was new when it was loaded. + * @vdo: The vdo to query. + * + * Return: true if the vdo was new. + */ +static bool was_new(const struct vdo *vdo) +{ + return (vdo->load_state == VDO_NEW); +} + +/** + * requires_read_only_rebuild() - Check whether the vdo requires a read-only + * mode rebuild. + * @vdo: The vdo to query. + * + * Return: true if the vdo requires a read-only rebuild. + */ +static bool __must_check requires_read_only_rebuild(const struct vdo *vdo) +{ + return ((vdo->load_state == VDO_FORCE_REBUILD) || + (vdo->load_state == VDO_REBUILD_FOR_UPGRADE)); +} + +/** + * requires_recovery() - Check whether a vdo should enter recovery mode. + * @vdo: The vdo to query. + * + * Return: true if the vdo requires recovery. + */ +static bool __must_check requires_recovery(const struct vdo *vdo) +{ + return ((vdo->load_state == VDO_DIRTY) || + (vdo->load_state == VDO_REPLAYING) || + (vdo->load_state == VDO_RECOVERING)); +} + +/** + * requires_rebuild() - Check whether a vdo requires rebuilding. + * @vdo: The vdo to query. + * + * Return: true if the vdo must be rebuilt. 
+ */ +static bool __must_check requires_rebuild(const struct vdo *vdo) +{ + switch (vdo_get_state(vdo)) { + case VDO_DIRTY: + case VDO_FORCE_REBUILD: + case VDO_REPLAYING: + case VDO_REBUILD_FOR_UPGRADE: + return true; + + default: + return false; + } +} + +/** + * get_load_type() - Determine how the slab depot was loaded. + * @vdo: The vdo. + * + * Return: How the depot was loaded. + */ +static enum slab_depot_load_type get_load_type(struct vdo *vdo) +{ + if (requires_read_only_rebuild(vdo)) { + return VDO_SLAB_DEPOT_REBUILD_LOAD; + } + + if (requires_recovery(vdo)) { + return VDO_SLAB_DEPOT_RECOVERY_LOAD; + } + + return VDO_SLAB_DEPOT_NORMAL_LOAD; +} + +/** + * vdo_initialize_kobjects() - Initialize the vdo sysfs directory. + * @vdo: The vdo being initialized. + * + * Return: VDO_SUCCESS or an error code. + */ +static int vdo_initialize_kobjects(struct vdo *vdo) +{ + int result; + struct dm_target *target = vdo->device_config->owning_target; + struct mapped_device *md = dm_table_get_md(target->table); + + kobject_init(&vdo->vdo_directory, &vdo_directory_type); + vdo->sysfs_added = true; + result = kobject_add(&vdo->vdo_directory, + &disk_to_dev(dm_disk(md))->kobj, + "vdo"); + if (result != 0) { + return VDO_CANT_ADD_SYSFS_NODE; + } + + result = vdo_add_dedupe_index_sysfs(vdo->dedupe_index, + &vdo->vdo_directory); + if (result != 0) { + return VDO_CANT_ADD_SYSFS_NODE; + } + + return vdo_add_sysfs_stats_dir(vdo); +} + +/** + * load_callback() - Callback to do the destructive parts of loading a VDO. + * @completion: The sub-task completion. + */ +static void load_callback(struct vdo_completion *completion) +{ + struct admin_completion *admin_completion = + vdo_admin_completion_from_sub_task(completion); + struct vdo *vdo = vdo_from_load_sub_task(completion); + + vdo_assert_admin_phase_thread(admin_completion, + __func__, + LOAD_PHASE_NAMES); + + switch (admin_completion->phase++) { + case LOAD_PHASE_START: + if (!vdo_start_operation_with_waiter(&vdo->admin_state, + VDO_ADMIN_STATE_LOADING, + &admin_completion->completion, + NULL)) { + return; + } + + /* Prepare the recovery journal for new entries. */ + vdo_open_recovery_journal(vdo->recovery_journal, + vdo->depot, + vdo->block_map); + vdo_allow_read_only_mode_entry(vdo->read_only_notifier, + vdo_reset_admin_sub_task(completion)); + return; + + case LOAD_PHASE_STATS: + vdo_finish_completion(vdo_reset_admin_sub_task(completion), + vdo_initialize_kobjects(vdo)); + return; + + case LOAD_PHASE_LOAD_DEPOT: + if (vdo_is_read_only(vdo->read_only_notifier)) { + /* + * In read-only mode we don't use the allocator and it + * may not even be readable, so don't bother trying to + * load it. + */ + vdo_set_operation_result(&vdo->admin_state, + VDO_READ_ONLY); + break; + } + + vdo_reset_admin_sub_task(completion); + if (requires_read_only_rebuild(vdo)) { + vdo_launch_rebuild(vdo, completion); + return; + } + + if (requires_rebuild(vdo)) { + vdo_launch_recovery(vdo, completion); + return; + } + + vdo_load_slab_depot(vdo->depot, + (was_new(vdo) + ? 
VDO_ADMIN_STATE_FORMATTING + : VDO_ADMIN_STATE_LOADING), + completion, + NULL); + return; + + case LOAD_PHASE_MAKE_DIRTY: + vdo_set_state(vdo, VDO_DIRTY); + vdo_save_components(vdo, vdo_reset_admin_sub_task(completion)); + return; + + case LOAD_PHASE_PREPARE_TO_ALLOCATE: + vdo_initialize_block_map_from_journal(vdo->block_map, + vdo->recovery_journal); + vdo_prepare_slab_depot_to_allocate(vdo->depot, + get_load_type(vdo), + vdo_reset_admin_sub_task(completion)); + return; + + case LOAD_PHASE_SCRUB_SLABS: + if (requires_recovery(vdo)) { + vdo_enter_recovery_mode(vdo); + } + + vdo_scrub_all_unrecovered_slabs(vdo->depot, + vdo_reset_admin_sub_task(completion)); + return; + + case LOAD_PHASE_DATA_REDUCTION: + WRITE_ONCE(vdo->compressing, vdo->device_config->compression); + if (vdo->device_config->deduplication) { + /* + * Don't try to load or rebuild the index first (and + * log scary error messages) if this is known to be a + * newly-formatted volume. + */ + vdo_start_dedupe_index(vdo->dedupe_index, + was_new(vdo)); + } + + vdo->allocations_allowed = false; + fallthrough; + + case LOAD_PHASE_FINISHED: + break; + + case LOAD_PHASE_DRAIN_JOURNAL: + vdo_drain_recovery_journal(vdo->recovery_journal, + VDO_ADMIN_STATE_SAVING, + vdo_reset_admin_sub_task(completion)); + return; + + case LOAD_PHASE_WAIT_FOR_READ_ONLY: + admin_completion->phase = LOAD_PHASE_FINISHED; + vdo_reset_admin_sub_task(completion); + vdo_wait_until_not_entering_read_only_mode(vdo->read_only_notifier, + completion); + return; + + default: + vdo_set_completion_result(vdo_reset_admin_sub_task(completion), + UDS_BAD_STATE); + } + + vdo_finish_operation(&vdo->admin_state, completion->result); +} + +/** + * handle_load_error() - Handle an error during the load operation. + * @completion: The sub-task completion. + * + * If at all possible, brings the vdo online in read-only mode. This handler + * is registered in vdo_load(). + */ +static void handle_load_error(struct vdo_completion *completion) +{ + struct admin_completion *admin_completion = + vdo_admin_completion_from_sub_task(completion); + struct vdo *vdo = vdo_from_load_sub_task(completion); + + vdo_assert_admin_operation_type(admin_completion, + VDO_ADMIN_OPERATION_LOAD); + + if (requires_read_only_rebuild(vdo) + && (admin_completion->phase == LOAD_PHASE_MAKE_DIRTY)) { + uds_log_error_strerror(completion->result, "aborting load"); + + /* Preserve the error. */ + vdo_set_operation_result(&vdo->admin_state, + completion->result); + admin_completion->phase = LOAD_PHASE_DRAIN_JOURNAL; + load_callback(UDS_FORGET(completion)); + return; + } + + uds_log_error_strerror(completion->result, + "Entering read-only mode due to load error"); + admin_completion->phase = LOAD_PHASE_WAIT_FOR_READ_ONLY; + vdo_enter_read_only_mode(vdo->read_only_notifier, completion->result); + vdo_set_operation_result(&vdo->admin_state, VDO_READ_ONLY); + load_callback(completion); +} + +/** + * vdo_load() - Load a vdo for normal operation. + * @vdo: The vdo to load. + * + * Context: This method must not be called from a base thread. + * + * Return: VDO_SUCCESS or an error. 
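+ *
+ * A minimal caller sketch (illustrative only, not part of this patch):
+ *
+ *     result = vdo_load(vdo);
+ *     if (result != VDO_SUCCESS)
+ *             return result;
+ *
+ * On failure vdo_load() has already suspended the device, so a caller only
+ * needs to propagate the error. A volume which comes up in read-only mode
+ * is reported as VDO_SUCCESS and can still service reads.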
+ */ +int vdo_load(struct vdo *vdo) +{ + const char *device_name; + int result; + + device_name = vdo_get_device_name(vdo->device_config->owning_target); + uds_log_info("starting device '%s'", device_name); + result = vdo_perform_admin_operation(vdo, + VDO_ADMIN_OPERATION_LOAD, + get_thread_id_for_phase, + load_callback, + handle_load_error); + + if ((result == VDO_SUCCESS) || (result == VDO_READ_ONLY)) { + /* + * Even if the VDO is read-only, it is now able to handle + * (read) requests. + */ + uds_log_info("device '%s' started", device_name); + return VDO_SUCCESS; + } + + /* + * Something has gone very wrong. Make sure everything has drained and + * leave the device in an unresumable state. + */ + uds_log_error_strerror(result, + "Start failed, could not load VDO metadata"); + vdo->suspend_type = VDO_ADMIN_STATE_STOPPING; + vdo_suspend(vdo); + return result; +} + +/** + * vdo_from_pre_load_sub_task() - Extract the vdo from an admin_completion, + * @completion: The admin_completion's sub-task completion. + * + * Checks that the current operation is a pre-load. + * + * Return: The vdo. + */ +static inline struct vdo * +vdo_from_pre_load_sub_task(struct vdo_completion *completion) +{ + return vdo_from_admin_sub_task(completion, + VDO_ADMIN_OPERATION_PRE_LOAD); +} + +/** + * decode_from_super_block() - Decode the VDO state from the super block and + * validate that it is correct. + * @vdo: The vdo being loaded. + * + * On error from this method, the component states must be destroyed + * explicitly. If this method returns successfully, the component states must + * not be destroyed. + * + * Return: VDO_SUCCESS or an error. + */ +static int __must_check decode_from_super_block(struct vdo *vdo) +{ + const struct device_config *config = vdo->device_config; + struct super_block_codec *codec + = vdo_get_super_block_codec(vdo->super_block); + int result = vdo_decode_component_states(codec->component_buffer, + vdo->geometry.release_version, + &vdo->states); + if (result != VDO_SUCCESS) { + return result; + } + + vdo_set_state(vdo, vdo->states.vdo.state); + vdo->load_state = vdo->states.vdo.state; + result = vdo_validate_component_states(&vdo->states, + vdo->geometry.nonce, + config->physical_blocks, + config->logical_blocks); + if (result != VDO_SUCCESS) { + return result; + } + + return vdo_decode_layout(vdo->states.layout, &vdo->layout); +} + +/** + * decode_vdo() - Decode the component data portion of a super block and fill + * in the corresponding portions of the vdo being loaded. + * @vdo: The vdo being loaded. + * + * This will also allocate the recovery journal and slab depot. If this method + * is called with an asynchronous layer (i.e. a thread config which specifies + * at least one base thread), the block map and packer will be constructed as + * well. + * + * Return: VDO_SUCCESS or an error. 
+ */ +static int __must_check decode_vdo(struct vdo *vdo) +{ + block_count_t maximum_age, journal_length; + const struct thread_config *thread_config = vdo->thread_config; + int result = decode_from_super_block(vdo); + + if (result != VDO_SUCCESS) { + vdo_destroy_component_states(&vdo->states); + return result; + } + + maximum_age = vdo->device_config->block_map_maximum_age; + journal_length = + vdo_get_recovery_journal_length(vdo->states.vdo.config.recovery_journal_size); + if ((maximum_age > (journal_length / 2)) || (maximum_age < 1)) { + return VDO_BAD_CONFIGURATION; + } + + result = vdo_make_read_only_notifier(vdo_in_read_only_mode(vdo), + thread_config, + vdo, + &vdo->read_only_notifier); + if (result != VDO_SUCCESS) { + return result; + } + + result = vdo_enable_read_only_entry(vdo); + if (result != VDO_SUCCESS) { + return result; + } + + result = vdo_decode_recovery_journal(vdo->states.recovery_journal, + vdo->states.vdo.nonce, + vdo, + vdo_get_partition(vdo->layout, + VDO_RECOVERY_JOURNAL_PARTITION), + vdo->states.vdo.complete_recoveries, + vdo->states.vdo.config.recovery_journal_size, + VDO_RECOVERY_JOURNAL_TAIL_BUFFER_SIZE, + vdo->read_only_notifier, + thread_config, + &vdo->recovery_journal); + if (result != VDO_SUCCESS) { + return result; + } + + result = vdo_decode_slab_depot(vdo->states.slab_depot, + vdo, + vdo_get_partition(vdo->layout, + VDO_SLAB_SUMMARY_PARTITION), + &vdo->depot); + if (result != VDO_SUCCESS) { + return result; + } + + result = vdo_decode_block_map(vdo->states.block_map, + vdo->states.vdo.config.logical_blocks, + thread_config, + vdo, + vdo->read_only_notifier, + vdo->recovery_journal, + vdo->states.vdo.nonce, + vdo->device_config->cache_size, + maximum_age, + &vdo->block_map); + if (result != VDO_SUCCESS) { + return result; + } + + result = vdo_make_logical_zones(vdo, &vdo->logical_zones); + if (result != VDO_SUCCESS) { + return result; + } + + result = vdo_make_physical_zones(vdo, &vdo->physical_zones); + if (result != VDO_SUCCESS) { + return result; + } + + return vdo_make_hash_zones(vdo, &vdo->hash_zones); +} + +/** + * finish_operation_callback() - Callback to finish the load operation. + * @completion: The admin_completion's sub-task completion. + */ +static void finish_operation_callback(struct vdo_completion *completion) +{ + struct vdo *vdo = vdo_from_pre_load_sub_task(completion); + + vdo_finish_operation(&vdo->admin_state, completion->result); +} + +/** + * vdo_load_components() - Load the components of a VDO. + * @completion: The sub-task completion. + * + * This is the super block load callback set by load_callback(). + */ +static void vdo_load_components(struct vdo_completion *completion) +{ + struct vdo *vdo = vdo_from_pre_load_sub_task(completion); + + vdo_prepare_admin_sub_task(vdo, + finish_operation_callback, + finish_operation_callback); + vdo_finish_completion(completion, decode_vdo(vdo)); +} + +/** + * pre_load_callback() - Callback to initiate a pre-load, registered in + * vdo_prepare_to_load(). + * @completion: The sub-task completion. 
+ */ +static void pre_load_callback(struct vdo_completion *completion) +{ + struct admin_completion *admin_completion = + vdo_admin_completion_from_sub_task(completion); + struct vdo *vdo = vdo_from_pre_load_sub_task(completion); + + vdo_assert_on_admin_thread(vdo, __func__); + if (!vdo_start_operation_with_waiter(&vdo->admin_state, + VDO_ADMIN_STATE_PRE_LOADING, + &admin_completion->completion, + NULL)) { + return; + } + + vdo_prepare_admin_sub_task(vdo, + vdo_load_components, + finish_operation_callback); + vdo_load_super_block(vdo, + completion, + vdo_get_data_region_start(vdo->geometry), + &vdo->super_block); +} + +/** + * vdo_prepare_to_load() - Prepare a vdo for loading by reading structures off + * disk. + * + * This method does not alter the on-disk state. It should be called from the + * vdo constructor, whereas vdo_load() will be called during + * pre-resume if the vdo has not been resumed before. + */ +int vdo_prepare_to_load(struct vdo *vdo) +{ + return vdo_perform_admin_operation(vdo, + VDO_ADMIN_OPERATION_PRE_LOAD, + NULL, + pre_load_callback, + pre_load_callback); +} diff --git a/vdo/vdo-load.h b/vdo/vdo-load.h new file mode 100644 index 00000000..7df0b4fd --- /dev/null +++ b/vdo/vdo-load.h @@ -0,0 +1,16 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright Red Hat + */ + +#ifndef VDO_LOAD_H +#define VDO_LOAD_H + +#include "kernel-types.h" + +int __must_check vdo_load(struct vdo *vdo); + +int __must_check +vdo_prepare_to_load(struct vdo *vdo); + +#endif /* VDO_LOAD_H */ diff --git a/vdo/vdoPageCache.c b/vdo/vdo-page-cache.c similarity index 57% rename from vdo/vdoPageCache.c rename to vdo/vdo-page-cache.c index 564d64da..dd7a9be3 100644 --- a/vdo/vdoPageCache.c +++ b/vdo/vdo-page-cache.c @@ -1,38 +1,25 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * Copyright Red Hat - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA.
- * - * $Id: //eng/vdo-releases/sulfur/src/c++/vdo/base/vdoPageCache.c#28 $ */ -#include "vdoPageCacheInternals.h" +#include "vdo-page-cache.h" +#include #include #include "errors.h" #include "logger.h" -#include "memoryAlloc.h" +#include "memory-alloc.h" #include "permassert.h" -#include "adminState.h" +#include "admin-state.h" +#include "block-map.h" #include "constants.h" -#include "numUtils.h" -#include "readOnlyNotifier.h" -#include "statusCodes.h" +#include "io-submitter.h" +#include "num-utils.h" +#include "read-only-notifier.h" +#include "status-codes.h" #include "types.h" #include "vdo.h" #include "vio.h" @@ -49,44 +36,43 @@ enum { */ #define ADD_ONCE(value, delta) WRITE_ONCE(value, (value) + (delta)) -/**********************************************************************/ +static inline bool is_dirty(const struct page_info *info) +{ + return info->state == PS_DIRTY; +} + static inline bool is_present(const struct page_info *info) { return (info->state == PS_RESIDENT) || (info->state == PS_DIRTY); } -/**********************************************************************/ static inline bool is_in_flight(const struct page_info *info) { return (info->state == PS_INCOMING) || (info->state == PS_OUTGOING); } -/**********************************************************************/ static inline bool is_incoming(const struct page_info *info) { return info->state == PS_INCOMING; } -/**********************************************************************/ static inline bool is_outgoing(const struct page_info *info) { return info->state == PS_OUTGOING; } -/**********************************************************************/ static inline bool is_valid(const struct page_info *info) { return is_present(info) || is_outgoing(info); } -/**********************************************************************/ static char *get_page_buffer(struct page_info *info) { struct vdo_page_cache *cache = info->cache; + return &cache->pages[(info - cache->infos) * VDO_BLOCK_SIZE]; } -/**********************************************************************/ static inline struct page_info * page_info_from_state_entry(struct list_head *entry) { @@ -96,7 +82,6 @@ page_info_from_state_entry(struct list_head *entry) return list_entry(entry, struct page_info, state_entry); } -/**********************************************************************/ static inline struct page_info * page_info_from_lru_entry(struct list_head *entry) { @@ -106,29 +91,37 @@ page_info_from_lru_entry(struct list_head *entry) return list_entry(entry, struct page_info, lru_entry); } -/**********************************************************************/ +static inline struct vdo_page_completion * +as_vdo_page_completion(struct vdo_completion *completion) +{ + vdo_assert_completion_type(completion->type, VDO_PAGE_COMPLETION); + return container_of(completion, struct vdo_page_completion, completion); +} + static inline struct vdo_page_completion * page_completion_from_waiter(struct waiter *waiter) { struct vdo_page_completion *completion; + if (waiter == NULL) { return NULL; } completion = container_of(waiter, struct vdo_page_completion, waiter); - assert_vdo_completion_type(completion->completion.type, + vdo_assert_completion_type(completion->completion.type, VDO_PAGE_COMPLETION); return completion; } /** - * Allocate components of the cache which require their own allocation. The - * caller is responsible for all clean up on errors. 
+ * allocate_cache_components() - Allocate components of the cache which + * require their own allocation. + * @cache: The cache being constructed. * - * @param cache The cache being constructed + * The caller is responsible for all clean up on errors. * - * @return VDO_SUCCESS or an error code - **/ + * Return: VDO_SUCCESS or an error code. + */ static int __must_check allocate_cache_components(struct vdo_page_cache *cache) { uint64_t size = cache->page_count * (uint64_t) VDO_BLOCK_SIZE; @@ -151,19 +144,21 @@ static int __must_check allocate_cache_components(struct vdo_page_cache *cache) } /** - * Initialize all page info structures and put them on the free list. - * - * @param cache The cache to initialize + * initialize_info() - Initialize all page info structures and put them on the + * free list. + * @cache: The cache to initialize. * - * @return VDO_SUCCESS or an error - **/ + * Return: VDO_SUCCESS or an error. + */ static int initialize_info(struct vdo_page_cache *cache) { struct page_info *info; + INIT_LIST_HEAD(&cache->free_list); for (info = cache->infos; info < cache->infos + cache->page_count; ++info) { int result; + info->cache = cache; info->state = PS_FREE; info->pbn = NO_PAGE; @@ -177,7 +172,7 @@ static int initialize_info(struct vdo_page_cache *cache) return result; } - // The thread ID should never change. + /* The thread ID should never change. */ info->vio->completion.callback_thread_id = cache->zone->thread_id; @@ -189,11 +184,25 @@ static int initialize_info(struct vdo_page_cache *cache) return VDO_SUCCESS; } -/**********************************************************************/ static void write_dirty_pages_callback(struct list_head *entry, void *context); -/**********************************************************************/ -int make_vdo_page_cache(struct vdo *vdo, +/** + * vdo_make_page_cache() - Construct a page cache. + * @vdo: The vdo. + * @page_count: The number of cache pages to hold. + * @read_hook: The function to be called when a page is read into the cache. + * @write_hook: The function to be called after a page is written from the + * cache. + * @page_context_size: The size of the per-page context that will be passed to + * the read and write hooks. + * @maximum_age: The number of journal blocks before a dirtied page is + * considered old and must be written out. + * @zone: The block map zone which owns this cache. + * @cache_ptr: A pointer to hold the cache. + * + * Return: A success or error code. 
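+ *
+ * Illustrative construction sketch (the names and sizes here are
+ * placeholders, not taken from an actual call site in this patch):
+ *
+ *     result = vdo_make_page_cache(vdo, pages_per_zone,
+ *                                  my_read_hook, my_write_hook,
+ *                                  sizeof(struct my_page_context),
+ *                                  maximum_age, zone, &zone->page_cache);
+ *     if (result != VDO_SUCCESS)
+ *             return result;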
+ */ +int vdo_make_page_cache(struct vdo *vdo, page_count_t page_count, vdo_page_read_function *read_hook, vdo_page_write_function *write_hook, @@ -225,24 +234,24 @@ int make_vdo_page_cache(struct vdo *vdo, result = allocate_cache_components(cache); if (result != VDO_SUCCESS) { - free_vdo_page_cache(cache); + vdo_free_page_cache(cache); return result; } result = initialize_info(cache); if (result != VDO_SUCCESS) { - free_vdo_page_cache(cache); + vdo_free_page_cache(cache); return result; } - result = make_vdo_dirty_lists(maximum_age, write_dirty_pages_callback, + result = vdo_make_dirty_lists(maximum_age, write_dirty_pages_callback, cache, &cache->dirty_lists); if (result != VDO_SUCCESS) { - free_vdo_page_cache(cache); + vdo_free_page_cache(cache); return result; } - // initialize empty circular queues + /* initialize empty circular queues */ INIT_LIST_HEAD(&cache->lru_list); INIT_LIST_HEAD(&cache->outgoing_list); @@ -250,8 +259,11 @@ int make_vdo_page_cache(struct vdo *vdo, return VDO_SUCCESS; } -/**********************************************************************/ -void free_vdo_page_cache(struct vdo_page_cache *cache) +/** + * vdo_free_page_cache() - Free the page cache structure. + * @cache: The cache to free. + */ +void vdo_free_page_cache(struct vdo_page_cache *cache) { if (cache == NULL) { return; @@ -259,6 +271,7 @@ void free_vdo_page_cache(struct vdo_page_cache *cache) if (cache->infos != NULL) { struct page_info *info; + for (info = cache->infos; info < cache->infos + cache->page_count; ++info) { @@ -273,30 +286,42 @@ void free_vdo_page_cache(struct vdo_page_cache *cache) UDS_FREE(cache); } -/**********************************************************************/ -void set_vdo_page_cache_initial_period(struct vdo_page_cache *cache, +/** + * vdo_set_page_cache_initial_period() - Set the initial dirty period for a + * page cache. + * @cache: The cache. + * @period: The initial dirty period to set. + */ +void vdo_set_page_cache_initial_period(struct vdo_page_cache *cache, sequence_number_t period) { - set_vdo_dirty_lists_current_period(cache->dirty_lists, period); + vdo_set_dirty_lists_current_period(cache->dirty_lists, period); } -/**********************************************************************/ -void set_vdo_page_cache_rebuild_mode(struct vdo_page_cache *cache, +/** + * vdo_set_page_cache_rebuild_mode() - Switch the page cache into or out of + * read-only rebuild mode. + * @cache: The cache. + * @rebuilding: true if the cache should be put into read-only rebuild mode, + * false otherwise. + */ +void vdo_set_page_cache_rebuild_mode(struct vdo_page_cache *cache, bool rebuilding) { cache->rebuilding = rebuilding; } /** - * Assert that a function has been called on the VDO page cache's thread. - * - * @param cache the page cache - * @param function_name the name of the function - **/ + * assert_on_cache_thread() - Assert that a function has been called on the + * VDO page cache's thread. + * @cache: The page cache. + * @function_name: The name of the function. + */ static inline void assert_on_cache_thread(struct vdo_page_cache *cache, const char *function_name) { thread_id_t thread_id = vdo_get_callback_thread_id(); + ASSERT_LOG_ONLY((thread_id == cache->zone->thread_id), "%s() must only be called on cache thread %d, not thread %d", function_name, @@ -305,21 +330,19 @@ static inline void assert_on_cache_thread(struct vdo_page_cache *cache, } /** - * Assert that a page cache may issue I/O. 
- * - * @param cache the page cache - **/ + * assert_io_allowed() - Assert that a page cache may issue I/O. + * @cache: The page cache. + */ static inline void assert_io_allowed(struct vdo_page_cache *cache) { - ASSERT_LOG_ONLY(!is_vdo_state_quiescent(&cache->zone->state), + ASSERT_LOG_ONLY(!vdo_is_state_quiescent(&cache->zone->state), "VDO page cache may issue I/O"); } /** - * Log and, if enabled, report cache pressure. - * - * @param cache the page cache - **/ + * report_cache_pressure() - Log and, if enabled, report cache pressure. + * @cache: The page cache. + */ static void report_cache_pressure(struct vdo_page_cache *cache) { ADD_ONCE(cache->stats.cache_pressure, 1); @@ -336,15 +359,14 @@ static void report_cache_pressure(struct vdo_page_cache *cache) } /** - * Return the name of a page state. - * - * @param state a page state + * get_page_state_name() - Return the name of a page state. + * @state: A page state. * - * @return a pointer to a static page state name + * If the page state is invalid a static string is returned and the invalid + * state is logged. * - * @note If the page state is invalid a static string is returned and the - * invalid state is logged. - **/ + * Return: A pointer to a static page state name. + */ static const char * __must_check get_page_state_name(enum vdo_page_buffer_state state) { @@ -352,9 +374,9 @@ get_page_state_name(enum vdo_page_buffer_state state) static const char *state_names[] = { "UDS_FREE", "INCOMING", "FAILED", "RESIDENT", "DIRTY", "OUTGOING" }; - STATIC_ASSERT(COUNT_OF(state_names) == PAGE_STATE_COUNT); + STATIC_ASSERT(ARRAY_SIZE(state_names) == PAGE_STATE_COUNT); - result = ASSERT(state < COUNT_OF(state_names), + result = ASSERT(state < ARRAY_SIZE(state_names), "Unknown page_state value %d", state); if (result != UDS_SUCCESS) { @@ -365,14 +387,14 @@ get_page_state_name(enum vdo_page_buffer_state state) } /** - * Update the counter associated with a given state. - * - * @param info the page info to count - * @param delta the delta to apply to the counter - **/ + * update_counter() - Update the counter associated with a given state. + * @info: The page info to count. + * @delta: The delta to apply to the counter. + */ static void update_counter(struct page_info *info, int32_t delta) { struct block_map_statistics *stats = &info->cache->stats; + switch (info->state) { case PS_FREE: ADD_ONCE(stats->free_pages, delta); @@ -404,8 +426,8 @@ static void update_counter(struct page_info *info, int32_t delta) } /** - * Update the lru information for an active page. - **/ + * update_lru() - Update the lru information for an active page. + */ static void update_lru(struct page_info *info) { struct vdo_page_cache *cache = info->cache; @@ -416,12 +438,11 @@ static void update_lru(struct page_info *info) } /** - * Set the state of a page_info and put it on the right list, adjusting - * counters. - * - * @param info the page_info to modify - * @param new_state the new state for the page_info - **/ + * set_info_state() - Set the state of a page_info and put it on the right + * list, adjusting counters. + * @info: The page_info to modify. + * @new_state: The new state for the page_info. + */ static void set_info_state(struct page_info *info, enum vdo_page_buffer_state new_state) { @@ -452,17 +473,16 @@ static void set_info_state(struct page_info *info, } /** - * Set the pbn for an info, updating the map as needed. 
- * - * @param info The page info - * @param pbn The physical block number to set - **/ + * set_info_pbn() - Set the pbn for an info, updating the map as needed. + * @info: The page info. + * @pbn: The physical block number to set. + */ static int __must_check set_info_pbn(struct page_info *info, physical_block_number_t pbn) { struct vdo_page_cache *cache = info->cache; - // Either the new or the old page number must be NO_PAGE. + /* Either the new or the old page number must be NO_PAGE. */ int result = ASSERT((pbn == NO_PAGE) || (info->pbn == NO_PAGE), "Must free a page before reusing it."); if (result != VDO_SUCCESS) { @@ -485,11 +505,12 @@ set_info_pbn(struct page_info *info, physical_block_number_t pbn) } /** - * Reset page info to represent an unallocated page. - **/ + * reset_page_info() - Reset page info to represent an unallocated page. + */ static int reset_page_info(struct page_info *info) { int result = ASSERT(info->busy == 0, "VDO Page must not be busy"); + if (result != UDS_SUCCESS) { return result; } @@ -507,16 +528,16 @@ static int reset_page_info(struct page_info *info) } /** - * Find a free page. + * find_free_page() - Find a free page. + * @cache: The page cache. * - * @param cache the page cache - * - * @return a pointer to the page info structure (if found), NULL otherwise - **/ + * Return: A pointer to the page info structure (if found), NULL otherwise. + */ static struct page_info * __must_check find_free_page(struct vdo_page_cache *cache) { struct page_info *info; + if (cache->free_list.next == &cache->free_list) { return NULL; } @@ -525,9 +546,15 @@ find_free_page(struct vdo_page_cache *cache) return info; } -/**********************************************************************/ -struct page_info *vdo_page_cache_find_page(struct vdo_page_cache *cache, - physical_block_number_t pbn) +/** + * find_page() - Find the page info (if any) associated with a given pbn. + * @cache: The page cache. + * @pbn: The absolute physical block number of the page. + * + * Return: The page info for the page if available, or NULL if not. + */ +static struct page_info * __must_check +find_page(struct vdo_page_cache *cache, physical_block_number_t pbn) { if ((cache->last_found != NULL) && (cache->last_found->pbn == pbn)) { return cache->last_found; @@ -537,26 +564,25 @@ struct page_info *vdo_page_cache_find_page(struct vdo_page_cache *cache, } /** - * Determine which page is least recently used. - * - * @param cache the page cache structure + * select_lru_page() - Determine which page is least recently used. + * @cache: The page cache structure. * - * @return a pointer to the info structure for a relevant page, - * or NULL if no such page can be found. The page can be - * dirty or resident. + * Picks the least recently used from among the non-busy entries at the front + * of each of the lru ring. Since whenever we mark a page busy we also put it + * to the end of the ring it is unlikely that the entries at the front are + * busy unless the queue is very short, but not impossible. * - * @note Picks the least recently used from among the non-busy entries - * at the front of each of the lru ring. - * Since whenever we mark a page busy we also put it to the end - * of the ring it is unlikely that the entries at the front - * are busy unless the queue is very short, but not impossible. - **/ + * Return: A pointer to the info structure for a relevant page, or NULL if no + * such page can be found. The page can be dirty or resident. 
+ */ static struct page_info * __must_check select_lru_page(struct vdo_page_cache *cache) { struct list_head *lru; + list_for_each(lru, &cache->lru_list) { struct page_info *info = page_info_from_lru_entry(lru); + if ((info->busy == 0) && !is_in_flight(info)) { return info; } @@ -565,11 +591,17 @@ select_lru_page(struct vdo_page_cache *cache) return NULL; } -/**********************************************************************/ +/** + * vdo_get_page_cache_statistics() - Get current cache statistics. + * @cache: The page cache. + * + * Return: The statistics. + */ struct block_map_statistics -get_vdo_page_cache_statistics(const struct vdo_page_cache *cache) +vdo_get_page_cache_statistics(const struct vdo_page_cache *cache) { const struct block_map_statistics *stats = &cache->stats; + return (struct block_map_statistics) { .dirty_pages = READ_ONCE(stats->dirty_pages), .clean_pages = READ_ONCE(stats->clean_pages), @@ -595,14 +627,14 @@ get_vdo_page_cache_statistics(const struct vdo_page_cache *cache) }; } -// ASYNCHRONOUS INTERFACE BEYOND THIS POINT +/* ASYNCHRONOUS INTERFACE BEYOND THIS POINT */ /** - * Helper to complete the VDO Page Completion request successfully. - * - * @param info the page info representing the result page - * @param vdo_page_comp the VDO page completion to complete - **/ + * complete_with_page() - Helper to complete the VDO Page Completion request + * successfully. + * @info: The page info representing the result page. + * @vdo_page_comp: The VDO page completion to complete. + */ static void complete_with_page(struct page_info *info, struct vdo_page_completion *vdo_page_comp) { @@ -615,48 +647,51 @@ static void complete_with_page(struct page_info *info, get_page_state_name(info->state), vdo_page_comp->writable ? "present" : "valid"); - finish_vdo_completion(&vdo_page_comp->completion, VDO_BAD_PAGE); + vdo_finish_completion(&vdo_page_comp->completion, VDO_BAD_PAGE); return; } vdo_page_comp->info = info; vdo_page_comp->ready = true; - finish_vdo_completion(&vdo_page_comp->completion, VDO_SUCCESS); + vdo_finish_completion(&vdo_page_comp->completion, VDO_SUCCESS); } /** - * Complete a page completion with an error code. Implements waiter_callback. + * complete_waiter_with_error() - Complete a page completion with an error + * code. + * @waiter: The page completion, as a waiter. + * @result_ptr: A pointer to the error code. * - * @param waiter The page completion, as a waiter - * @param result_ptr A pointer to the error code. - **/ + * Implements waiter_callback. + */ static void complete_waiter_with_error(struct waiter *waiter, void *result_ptr) { int *result = result_ptr; struct vdo_page_completion *completion = page_completion_from_waiter(waiter); - finish_vdo_completion(&completion->completion, *result); + vdo_finish_completion(&completion->completion, *result); } /** - * Complete a queue of VDO page completions with an error code. + * distribute_error_over_queue() - Complete a queue of VDO page completions + * with an error code. + * @result: The error result. + * @queue: A pointer to the queue (in, out). * - * @param [in] result the error result - * @param [in, out] queue a pointer to the queue - * - * @note upon completion the queue will be empty - **/ + * Upon completion the queue will be empty. + */ static void distribute_error_over_queue(int result, struct wait_queue *queue) { notify_all_waiters(queue, complete_waiter_with_error, &result); } /** - * Complete a page completion with a page. Implements waiter_callback. 
+ * complete_waiter_with_page() - Complete a page completion with a page. + * @waiter: The page completion, as a waiter. + * @page_info: The page info to complete with. * - * @param waiter The page completion, as a waiter - * @param page_info The page info to complete with - **/ + * Implements waiter_callback. + */ static void complete_waiter_with_page(struct waiter *waiter, void *page_info) { struct page_info *info = page_info; @@ -666,16 +701,15 @@ static void complete_waiter_with_page(struct waiter *waiter, void *page_info) } /** - * Complete a queue of VDO page completions with a page result. - * - * @param [in] info the page info describing the page - * @param [in, out] queue a pointer to a queue of waiters - * - * @return the number of pages distributed + * distribute_page_over_queue() - Complete a queue of VDO page completions + * with a page result. + * @info: The page info describing the page. + * @queue: A pointer to a queue of waiters (in, out). * - * @note upon completion the queue will be empty + * Upon completion the queue will be empty. * - **/ + * Return: The number of pages distributed. + */ static unsigned int distribute_page_over_queue(struct page_info *info, struct wait_queue *queue) { @@ -696,22 +730,23 @@ static unsigned int distribute_page_over_queue(struct page_info *info, } /** - * Set a persistent error which all requests will receive in the future. - * - * @param cache the page cache - * @param context a string describing what triggered the error - * @param result the error result + * set_persistent_error() - Set a persistent error which all requests will + * receive in the future. + * @cache: The page cache. + * @context: A string describing what triggered the error. + * @result: The error result. * - * Once triggered, all enqueued completions will get this error. - * Any future requests will result in this error as well. - **/ + * Once triggered, all enqueued completions will get this error. Any future + * requests will result in this error as well. + */ static void set_persistent_error(struct vdo_page_cache *cache, const char *context, int result) { struct page_info *info; - // If we're already read-only, there's no need to log. + /* If we're already read-only, there's no need to log. */ struct read_only_notifier *notifier = cache->zone->read_only_notifier; + if ((result != VDO_READ_ONLY) && !vdo_is_read_only(notifier)) { uds_log_error_strerror(result, "VDO Page Cache persistent error: %s", @@ -730,8 +765,22 @@ static void set_persistent_error(struct vdo_page_cache *cache, } } -/**********************************************************************/ -void init_vdo_page_completion(struct vdo_page_completion *page_completion, +/** + * vdo_init_page_completion() - Initialize a VDO Page Completion, requesting a + * particular page from the cache. + * @page_completion: The vdo_page_completion to initialize. + * @cache: The VDO page cache. + * @pbn: The absolute physical block of the desired page. + * @writable: Whether the page can be modified. + * @parent: The parent object. + * @callback: The completion callback. + * @error_handler: The handler for page errors. + * + * Once a completion has occurred for the vdo_get_page() operation, the + * underlying page shall be busy (stuck in memory) until the vdo_completion + * returned by this operation has been released. 
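+ *
+ * A minimal usage sketch (illustrative only; the callback names are
+ * placeholders) for reading one page on the cache's zone thread:
+ *
+ *     struct vdo_page_completion vpc;
+ *
+ *     vdo_init_page_completion(&vpc, cache, pbn, false, parent,
+ *                              page_ready, handle_page_error);
+ *     vdo_get_page(&vpc.completion);
+ *
+ * In page_ready(), the page data can be accessed via
+ * vdo_dereference_readable_page(), after which
+ * vdo_release_page_completion() must be called to unpin the page.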
+ */ +void vdo_init_page_completion(struct vdo_page_completion *page_completion, struct vdo_page_cache *cache, physical_block_number_t pbn, bool writable, @@ -740,6 +789,7 @@ void init_vdo_page_completion(struct vdo_page_completion *page_completion, vdo_action *error_handler) { struct vdo_completion *completion = &page_completion->completion; + ASSERT_LOG_ONLY((page_completion->waiter.next_waiter == NULL), "New page completion was not already on a wait queue"); @@ -749,8 +799,8 @@ void init_vdo_page_completion(struct vdo_page_completion *page_completion, .cache = cache, }; - initialize_vdo_completion(completion, cache->vdo, VDO_PAGE_COMPLETION); - prepare_vdo_completion(completion, + vdo_initialize_completion(completion, cache->vdo, VDO_PAGE_COMPLETION); + vdo_prepare_completion(completion, callback, error_handler, cache->zone->thread_id, @@ -758,20 +808,21 @@ void init_vdo_page_completion(struct vdo_page_completion *page_completion, } /** - * Helper function to check that a completion represents a successfully - * completed VDO Page Completion referring to a valid page. - * - * @param completion a VDO completion - * @param writable whether a writable page is required + * validate_completed_page() - Helper function to check that a completion + * represents a successfully completed VDO Page + * Completion referring to a valid page. + * @completion: A VDO completion. + * @writable: Whether a writable page is required. * - * @return the embedding completion if valid, NULL if not - **/ + * Return: The embedding completion if valid, NULL if not. + */ static struct vdo_page_completion * __must_check validate_completed_page(struct vdo_completion *completion, bool writable) { struct vdo_page_completion *vpc = as_vdo_page_completion(completion); int result = ASSERT(vpc->ready, "VDO Page completion not ready"); + if (result != UDS_SUCCESS) { return NULL; } @@ -805,23 +856,29 @@ validate_completed_page(struct vdo_completion *completion, bool writable) return vpc; } -/**********************************************************************/ -bool is_vdo_page_cache_active(struct vdo_page_cache *cache) +/** + * vdo_is_page_cache_active() - Check whether a page cache is active (i.e. has + * any active lookups, outstanding I/O, or + * pending I/O). + * @cache: The cache to check. + * + * Return: true if the cache is active. + */ +bool vdo_is_page_cache_active(struct vdo_page_cache *cache) { return ((cache->outstanding_reads != 0) || (cache->outstanding_writes != 0)); } /** - * vio callback used when a page has been loaded. - * - * @param completion A completion for the vio, the parent of which is a - * page_info. - **/ + * page_is_loaded() - Vio callback used when a page has been loaded. + * @completion: A completion for the vio, the parent of which is a page_info. + */ static void page_is_loaded(struct vdo_completion *completion) { struct page_info *info = completion->parent; struct vdo_page_cache *cache = info->cache; + assert_on_cache_thread(cache, __func__); set_info_state(info, PS_RESIDENT); @@ -829,25 +886,25 @@ static void page_is_loaded(struct vdo_completion *completion) /* * Don't decrement until right before calling - * vdo_check_for_drain_complete() to ensure that the above work can't - * cause the page cache to be freed out from under us. + * vdo_block_map_check_for_drain_complete() to ensure that the above + * work can't cause the page cache to be freed out from under us. 
*/ cache->outstanding_reads--; - vdo_check_for_drain_complete(cache->zone); + vdo_block_map_check_for_drain_complete(cache->zone); } /** - * Handle page load errors. - * - * @param completion The page read vio - **/ + * handle_load_error() - Handle page load errors. + * @completion: The page read vio. + */ static void handle_load_error(struct vdo_completion *completion) { int result = completion->result; struct page_info *info = completion->parent; struct vdo_page_cache *cache = info->cache; - assert_on_cache_thread(cache, __func__); + assert_on_cache_thread(cache, __func__); + record_metadata_io_error(as_vio(completion)); vdo_enter_read_only_mode(cache->zone->read_only_notifier, result); ADD_ONCE(cache->stats.failed_reads, 1); set_info_state(info, PS_FAILED); @@ -856,49 +913,55 @@ static void handle_load_error(struct vdo_completion *completion) /* * Don't decrement until right before - * calling vdo_check_for_drain_complete() + * calling vdo_block_map_check_for_drain_complete() * to ensure that the above work can't cause the page cache to be freed * out from under us. */ cache->outstanding_reads--; - vdo_check_for_drain_complete(cache->zone); + vdo_block_map_check_for_drain_complete(cache->zone); } /** - * Run the read hook after a page is loaded. This callback is registered in - * launch_page_load() when there is a read hook. + * run_read_hook() - Run the read hook after a page is loaded. + * @completion: The page load completion. * - * @param completion The page load completion - **/ + * This callback is registered in launch_page_load() when there is a read + * hook. + */ static void run_read_hook(struct vdo_completion *completion) { int result; struct page_info *info = completion->parent; + completion->callback = page_is_loaded; - reset_vdo_completion(completion); + vdo_reset_completion(completion); result = info->cache->read_hook(get_page_buffer(info), info->pbn, info->cache->zone, info->context); - continue_vdo_completion(completion, result); + vdo_continue_completion(completion, result); } /** - * Handle a read error during a read-only rebuild. - * - * @param completion The page load completion - **/ + * handle_rebuild_read_error() - Handle a read error during a read-only + * rebuild. + * @completion: The page load completion. + */ static void handle_rebuild_read_error(struct vdo_completion *completion) { struct page_info *info = completion->parent; struct vdo_page_cache *cache = info->cache; + assert_on_cache_thread(cache, __func__); - // We are doing a read-only rebuild, so treat this as a successful read - // of an uninitialized page. + /* + * We are doing a read-only rebuild, so treat this as a successful read + * of an uninitialized page. + */ + record_metadata_io_error(as_vio(completion)); ADD_ONCE(cache->stats.failed_reads, 1); memset(get_page_buffer(info), 0, VDO_BLOCK_SIZE); - reset_vdo_completion(completion); + vdo_reset_completion(completion); if (cache->read_hook != NULL) { run_read_hook(completion); } else { @@ -906,19 +969,30 @@ static void handle_rebuild_read_error(struct vdo_completion *completion) } } +static void load_page_endio(struct bio *bio) +{ + struct vio *vio = bio->bi_private; + struct page_info *info = vio->completion.parent; + struct vdo_page_cache *cache = info->cache; + vdo_action *callback = + (cache->read_hook != NULL) ? run_read_hook : page_is_loaded; + + continue_vio_after_io(vio, callback, cache->zone->thread_id); +} + /** - * Begin the process of loading a page. 
- * - * @param info the page info representing where to load the page - * @param pbn the absolute pbn of the desired page + * launch_page_load() - Begin the process of loading a page. + * @info: The page info representing where to load the page. + * @pbn: The absolute pbn of the desired page. * - * @return VDO_SUCCESS or an error code - **/ + * Return: VDO_SUCCESS or an error code. + */ static int __must_check launch_page_load(struct page_info *info, physical_block_number_t pbn) { int result; struct vdo_page_cache *cache = info->cache; + assert_io_allowed(cache); result = set_info_pbn(info, pbn); @@ -934,37 +1008,48 @@ launch_page_load(struct page_info *info, physical_block_number_t pbn) set_info_state(info, PS_INCOMING); cache->outstanding_reads++; ADD_ONCE(cache->stats.pages_loaded, 1); - launch_read_metadata_vio(info->vio, - pbn, - (cache->read_hook != NULL) ? - run_read_hook : page_is_loaded, - (cache->rebuilding ? - handle_rebuild_read_error - : handle_load_error)); + submit_metadata_vio(info->vio, + pbn, + load_page_endio, + (cache->rebuilding ? + handle_rebuild_read_error : + handle_load_error), + REQ_OP_READ | REQ_PRIO); return VDO_SUCCESS; } -/**********************************************************************/ static void write_pages(struct vdo_completion *completion); /** - * Handle errors flushing the layer. - * - * @param completion The flush vio - **/ + * handle_flush_error() - Handle errors flushing the layer. + * @completion: The flush vio. + */ static void handle_flush_error(struct vdo_completion *completion) { struct vdo_page_cache *cache = ((struct page_info *) completion->parent)->cache; + + record_metadata_io_error(as_vio(completion)); set_persistent_error(cache, "flush failed", completion->result); write_pages(completion); } +static void flush_endio(struct bio *bio) +{ + struct vio *vio = bio->bi_private; + struct vdo_page_cache *cache = + ((struct page_info *) vio->completion.parent)->cache; + + continue_vio_after_io(vio, + write_pages, + cache->zone->thread_id); +} + /** - * Attempt to save the outgoing pages by first flushing the layer. - * - * @param cache The cache - **/ + * save_pages() - Attempt to save the outgoing pages by first flushing the + * layer. + * @cache: The cache. + */ static void save_pages(struct vdo_page_cache *cache) { struct page_info *info; @@ -988,15 +1073,16 @@ static void save_pages(struct vdo_page_cache *cache) * these pages were successfully persisted, and thus must issue a flush * before each batch of pages is written to ensure this. */ - launch_flush_vio(vio, write_pages, handle_flush_error); + submit_flush_vio(vio, flush_endio, handle_flush_error); } /** - * Add a page to the outgoing list of pages waiting to be saved. Once in the - * list, a page may not be used until it has been written out. + * schedule_page_save() - Add a page to the outgoing list of pages waiting to + * be saved. + * @info: The page to save. * - * @param info The page to save - **/ + * Once in the list, a page may not be used until it has been written out. 
+ */ static void schedule_page_save(struct page_info *info) { if (info->busy > 0) { @@ -1009,12 +1095,12 @@ static void schedule_page_save(struct page_info *info) set_info_state(info, PS_OUTGOING); } -/**********************************************************************/ static void write_dirty_pages_callback(struct list_head *expired, void *context) { while (!list_empty(expired)) { struct list_head *entry = expired->next; + list_del_init(entry); schedule_page_save(page_info_from_state_entry(entry)); } @@ -1023,11 +1109,11 @@ static void write_dirty_pages_callback(struct list_head *expired, } /** - * Add a page to outgoing pages waiting to be saved, and then start saving - * pages if another save is not in progress. - * - * @param info The page to save - **/ + * launch_page_save() - Add a page to outgoing pages waiting to be saved, and + * then start saving pages if another save is not in + * progress. + * @info: The page to save. + */ static void launch_page_save(struct page_info *info) { schedule_page_save(info); @@ -1035,30 +1121,34 @@ static void launch_page_save(struct page_info *info) } /** - * Determine whether a given vdo_page_completion (as a waiter) is requesting a - * given page number. Implements waiter_match. + * completion_needs_page() - Determine whether a given vdo_page_completion (as + * a waiter) is requesting a given page number. + * @waiter: The page completion in question. + * @context: A pointer to the pbn of the desired page. * - * @param waiter The page completion in question - * @param context A pointer to the pbn of the desired page + * Implements waiter_match. * - * @return true if the page completion is for the desired page number - **/ + * Return: true if the page completion is for the desired page number. + */ static bool completion_needs_page(struct waiter *waiter, void *context) { physical_block_number_t *pbn = context; + return (page_completion_from_waiter(waiter)->pbn == *pbn); } /** - * Allocate a free page to the first completion in the waiting queue, - * and any other completions that match it in page number. - **/ + * allocate_free_page() - Allocate a free page to the first completion in the + * waiting queue, and any other completions that match + * it in page number. + */ static void allocate_free_page(struct page_info *info) { int result; struct waiter *oldest_waiter; physical_block_number_t pbn; struct vdo_page_cache *cache = info->cache; + assert_on_cache_thread(cache, __func__); if (!has_waiters(&cache->free_waiters)) { @@ -1078,8 +1168,10 @@ static void allocate_free_page(struct page_info *info) oldest_waiter = get_first_waiter(&cache->free_waiters); pbn = page_completion_from_waiter(oldest_waiter)->pbn; - // Remove all entries which match the page number in question - // and push them onto the page info's wait queue. + /* + * Remove all entries which match the page number in question + * and push them onto the page info's wait queue. + */ dequeue_matching_waiters(&cache->free_waiters, completion_needs_page, &pbn, &info->waiting); cache->waiter_count -= count_waiters(&info->waiting); @@ -1091,27 +1183,26 @@ static void allocate_free_page(struct page_info *info) } /** - * Begin the process of discarding a page. - * - * @param cache the page cache + * discard_a_page() - Begin the process of discarding a page. + * @cache: The page cache. * - * @note If no page is discardable, increments a count of deferred frees so - * that the next release of a page which is no longer busy will kick - * off another discard cycle. 
This is an indication that the cache is - * not big enough. + * If no page is discardable, increments a count of deferred frees so that the + * next release of a page which is no longer busy will kick off another + * discard cycle. This is an indication that the cache is not big enough. * - * @note If the selected page is not dirty, immediately allocates the page - * to the oldest completion waiting for a free page. - **/ + * If the selected page is not dirty, immediately allocates the page to the + * oldest completion waiting for a free page. + */ static void discard_a_page(struct vdo_page_cache *cache) { struct page_info *info = select_lru_page(cache); + if (info == NULL) { report_cache_pressure(cache); return; } - if (!is_vdo_page_dirty(info)) { + if (!is_dirty(info)) { allocate_free_page(info); return; } @@ -1125,11 +1216,10 @@ static void discard_a_page(struct vdo_page_cache *cache) } /** - * Helper used to trigger a discard so that the completion can get a different - * page. - * - * @param vdo_page_comp the VDO Page completion - **/ + * discard_page_for_completion() - Helper used to trigger a discard so that + * the completion can get a different page. + * @vdo_page_comp: The VDO Page completion. + */ static void discard_page_for_completion(struct vdo_page_completion *vdo_page_comp) { @@ -1147,10 +1237,10 @@ discard_page_for_completion(struct vdo_page_completion *vdo_page_comp) } /** - * Helper used to trigger a discard if the cache needs another free page. - * - * @param cache the page cache - **/ + * discard_page_if_needed() - Helper used to trigger a discard if the cache + * needs another free page. + * @cache: The page cache. + */ static void discard_page_if_needed(struct vdo_page_cache *cache) { if (cache->waiter_count > cache->discard_count) { @@ -1158,24 +1248,30 @@ static void discard_page_if_needed(struct vdo_page_cache *cache) } } -/**********************************************************************/ -void advance_vdo_page_cache_period(struct vdo_page_cache *cache, +/** + * vdo_advance_page_cache_period() - Advance the dirty period for a page + * cache. + * @cache: The cache to advance. + * @period: The new dirty period. + */ +void vdo_advance_page_cache_period(struct vdo_page_cache *cache, sequence_number_t period) { assert_on_cache_thread(cache, __func__); - advance_vdo_dirty_lists_period(cache->dirty_lists, period); + vdo_advance_dirty_lists_period(cache->dirty_lists, period); } /** - * Inform the cache that a write has finished (possibly with an error). - * - * @param info The info structure for the page whose write just completed + * write_has_finished() - Inform the cache that a write has finished (possibly + * with an error). + * @info: The info structure for the page whose write just completed. * - * @return true if the page write was a discard - **/ + * Return: true if the page write was a discard. + */ static bool write_has_finished(struct page_info *info) { bool was_discard = (info->write_status == WRITE_STATUS_DISCARD); + assert_on_cache_thread(info->cache, __func__); info->cache->outstanding_writes--; @@ -1184,17 +1280,18 @@ static bool write_has_finished(struct page_info *info) } /** - * Handler for page write errors. - * - * @param completion The page write vio - **/ + * handle_page_write_error() - Handler for page write errors. + * @completion: The page write vio. 
+ */ static void handle_page_write_error(struct vdo_completion *completion) { int result = completion->result; struct page_info *info = completion->parent; struct vdo_page_cache *cache = info->cache; - // If we're already read-only, write failures are to be expected. + record_metadata_io_error(as_vio(completion)); + + /* If we're already read-only, write failures are to be expected. */ if (result != VDO_READ_ONLY) { static DEFINE_RATELIMIT_STATE(error_limiter, DEFAULT_RATELIMIT_INTERVAL, @@ -1214,15 +1311,25 @@ static void handle_page_write_error(struct vdo_completion *completion) discard_page_if_needed(cache); } - vdo_check_for_drain_complete(cache->zone); + vdo_block_map_check_for_drain_complete(cache->zone); +} + +static void page_is_written_out(struct vdo_completion *completion); + +static void write_page_endio(struct bio *bio) +{ + struct vio *vio = bio->bi_private; + struct page_info *info = vio->completion.parent; + struct vdo_page_cache *cache = info->cache; + + continue_vio_after_io(vio, page_is_written_out, cache->zone->thread_id); } /** - * vio callback used when a page has been written out. - * - * @param completion A completion for the vio, the parent of which - * is embedded in page_info. - **/ + * page_is_written_out() - Vio callback used when a page has been written out. + * @completion: A completion for the vio, the parent of which is embedded in + * page_info. + */ static void page_is_written_out(struct vdo_completion *completion) { bool was_discard, reclaimed; @@ -1233,14 +1340,16 @@ static void page_is_written_out(struct vdo_completion *completion) if (cache->write_hook != NULL) { bool rewrite = cache->write_hook(get_page_buffer(info), - cache->zone, info->context); + cache->zone, + info->context); if (rewrite) { - launch_write_metadata_vio_with_flush(info->vio, - info->pbn, - page_is_written_out, - handle_page_write_error, - true, - false); + submit_metadata_vio(info->vio, + info->pbn, + write_page_endio, + handle_page_write_error, + (REQ_OP_WRITE | + REQ_PRIO | + REQ_PREFLUSH)); return; } } @@ -1264,15 +1373,16 @@ static void page_is_written_out(struct vdo_completion *completion) allocate_free_page(info); } - vdo_check_for_drain_complete(cache->zone); + vdo_block_map_check_for_drain_complete(cache->zone); } /** - * Write the batch of pages which were covered by the layer flush which just - * completed. This callback is registered in save_pages(). + * write_pages() - Write the batch of pages which were covered by the layer + * flush which just completed. + * @flush_completion: The flush vio. * - * @param flush_completion The flush vio - **/ + * This callback is registered in save_pages(). 
+ */ static void write_pages(struct vdo_completion *flush_completion) { struct vdo_page_cache *cache = @@ -1286,40 +1396,53 @@ static void write_pages(struct vdo_completion *flush_completion) */ bool has_unflushed_pages = (cache->pages_to_flush > 0); page_count_t pages_in_flush = cache->pages_in_flush; + cache->pages_in_flush = 0; while (pages_in_flush-- > 0) { struct list_head *entry = cache->outgoing_list.next; struct page_info *info = page_info_from_state_entry(entry); + list_del_init(entry); if (vdo_is_read_only(info->cache->zone->read_only_notifier)) { struct vdo_completion *completion = &info->vio->completion; - reset_vdo_completion(completion); + vdo_reset_completion(completion); completion->callback = page_is_written_out; completion->error_handler = handle_page_write_error; - finish_vdo_completion(completion, VDO_READ_ONLY); + vdo_finish_completion(completion, VDO_READ_ONLY); continue; } ADD_ONCE(info->cache->stats.pages_saved, 1); - launch_write_metadata_vio(info->vio, - info->pbn, - page_is_written_out, - handle_page_write_error); + submit_metadata_vio(info->vio, + info->pbn, + write_page_endio, + handle_page_write_error, + REQ_OP_WRITE | REQ_PRIO); } if (has_unflushed_pages) { - // If there are unflushed pages, the cache can't have been - // freed, so this call is safe. + /* + * If there are unflushed pages, the cache can't have been + * freed, so this call is safe. + */ save_pages(cache); } } -/**********************************************************************/ -void release_vdo_page_completion(struct vdo_completion *completion) +/** + * vdo_release_page_completion() - Release a VDO Page Completion. + * @completion: The completion to release. + * + * The page referenced by this completion (if any) will no longer be held busy + * by this completion. If a page becomes discardable and there are completions + * awaiting free pages then a new round of page discarding is started. + */ +void vdo_release_page_completion(struct vdo_completion *completion) { struct page_info *discard_info = NULL; struct vdo_page_completion *page_completion; struct vdo_page_cache *cache; + if (completion == NULL) { return; } @@ -1330,7 +1453,7 @@ void release_vdo_page_completion(struct vdo_completion *completion) discard_info = page_completion->info; } } else { - // Do not check for errors if the completion was not successful. + /* Do not check for errors if the completion was not successful. */ page_completion = as_vdo_page_completion(completion); } ASSERT_LOG_ONLY((page_completion->waiter.next_waiter == NULL), @@ -1345,25 +1468,28 @@ void release_vdo_page_completion(struct vdo_completion *completion) discard_info->write_status = WRITE_STATUS_NORMAL; launch_page_save(discard_info); } - // if there are excess requests for pages (that have not already - // started discards) we need to discard some page (which may be - // this one) + /* + * if there are excess requests for pages (that have not already + * started discards) we need to discard some page (which may be + * this one) + */ discard_page_if_needed(cache); } } /** - * Helper function to load a page as described by a VDO Page Completion. - * - * @param info the page info representing where to load the page - * @param vdo_page_comp the VDO Page Completion describing the page - **/ + * load_page_for_completion() - Helper function to load a page as described by + * a VDO Page Completion. + * @info: The page info representing where to load the page. + * @vdo_page_comp: The VDO Page Completion describing the page. 
+ */ static void load_page_for_completion(struct page_info *info, struct vdo_page_completion *vdo_page_comp) { int result = enqueue_waiter(&info->waiting, &vdo_page_comp->waiter); + if (result != VDO_SUCCESS) { - finish_vdo_completion(&vdo_page_comp->completion, result); + vdo_finish_completion(&vdo_page_comp->completion, result); return; } @@ -1373,19 +1499,30 @@ static void load_page_for_completion(struct page_info *info, } } -/**********************************************************************/ -void get_vdo_page(struct vdo_completion *completion) +/** + * vdo_get_page() - Asynchronous operation to get a VDO page. + * @completion: The completion initialized by vdo_init_page_completion(). + * + * May cause another page to be discarded (potentially writing a dirty page) + * and the one nominated by the completion to be loaded from disk. + * + * When the page becomes available the callback registered in the completion + * provided is triggered. Once triggered the page is marked busy until the + * completion is destroyed. + */ +void vdo_get_page(struct vdo_completion *completion) { struct page_info *info; struct vdo_page_completion *vdo_page_comp = as_vdo_page_completion(completion); struct vdo_page_cache *cache = vdo_page_comp->cache; + assert_on_cache_thread(cache, __func__); if (vdo_page_comp->writable && vdo_is_read_only(cache->zone->read_only_notifier)) { - finish_vdo_completion(completion, VDO_READ_ONLY); + vdo_finish_completion(completion, VDO_READ_ONLY); return; } @@ -1395,19 +1532,19 @@ void get_vdo_page(struct vdo_completion *completion) ADD_ONCE(cache->stats.read_count, 1); } - info = vdo_page_cache_find_page(cache, vdo_page_comp->pbn); + info = find_page(cache, vdo_page_comp->pbn); if (info != NULL) { - // The page is in the cache already. + /* The page is in the cache already. */ if ((info->write_status == WRITE_STATUS_DEFERRED) || is_incoming(info) || (is_outgoing(info) && vdo_page_comp->writable)) { int result; - // The page is unusable until it has finished I/O. + /* The page is unusable until it has finished I/O. */ ADD_ONCE(cache->stats.wait_for_page, 1); result = enqueue_waiter(&info->waiting, &vdo_page_comp->waiter); if (result != VDO_SUCCESS) { - finish_vdo_completion(&vdo_page_comp->completion, + vdo_finish_completion(&vdo_page_comp->completion, result); } @@ -1415,7 +1552,7 @@ void get_vdo_page(struct vdo_completion *completion) } if (is_valid(info)) { - // The page is usable. + /* The page is usable. */ ADD_ONCE(cache->stats.found_in_cache, 1); if (!is_present(info)) { ADD_ONCE(cache->stats.read_outgoing, 1); @@ -1425,11 +1562,11 @@ void get_vdo_page(struct vdo_completion *completion) complete_with_page(info, vdo_page_comp); return; } - // Something horrible has gone wrong. + /* Something horrible has gone wrong. */ ASSERT_LOG_ONLY(false, "Info found in a usable state."); } - // The page must be fetched. + /* The page must be fetched. */ info = find_free_page(cache); if (info != NULL) { ADD_ONCE(cache->stats.fetch_required, 1); @@ -1437,13 +1574,20 @@ void get_vdo_page(struct vdo_completion *completion) return; } - // The page must wait for a page to be discarded. + /* The page must wait for a page to be discarded. 
*/ ADD_ONCE(cache->stats.discard_required, 1); discard_page_for_completion(vdo_page_comp); } -/**********************************************************************/ -void mark_completed_vdo_page_dirty(struct vdo_completion *completion, +/** + * vdo_mark_completed_page_dirty() - Mark a VDO page referenced by a completed + * vdo_page_completion as dirty. + * @completion: A VDO Page Completion whose callback has been called. + * @old_dirty_period: The period in which the page was already dirty (0 if it + * wasn't). + * @new_dirty_period: The period in which the page is now dirty. + */ +void vdo_mark_completed_page_dirty(struct vdo_completion *completion, sequence_number_t old_dirty_period, sequence_number_t new_dirty_period) { @@ -1457,14 +1601,18 @@ void mark_completed_vdo_page_dirty(struct vdo_completion *completion, info = vdo_page_comp->info; set_info_state(info, PS_DIRTY); - add_to_vdo_dirty_lists(info->cache->dirty_lists, + vdo_add_to_dirty_lists(info->cache->dirty_lists, &info->state_entry, old_dirty_period, new_dirty_period); } -/**********************************************************************/ -void request_vdo_page_write(struct vdo_completion *completion) +/** + * vdo_request_page_write() - Request that a VDO page be written out as soon + * as it is not busy. + * @completion: The vdo_page_completion containing the page. + */ +void vdo_request_page_write(struct vdo_completion *completion) { struct page_info *info; @@ -1479,28 +1627,52 @@ void request_vdo_page_write(struct vdo_completion *completion) launch_page_save(info); } -/**********************************************************************/ static void *dereference_page_completion(struct vdo_page_completion *completion) { return ((completion != NULL) ? get_page_buffer(completion->info) : NULL); } -/**********************************************************************/ -const void *dereference_readable_vdo_page(struct vdo_completion *completion) +/** + * vdo_dereference_readable_page() - Access the raw memory for a read-only + * page of a completed vdo_page_completion. + * @completion: A vdo page completion whose callback has been called. + * + * Return: A pointer to the raw memory at the beginning of the page, or + * NULL if the page is not available. + */ +const void *vdo_dereference_readable_page(struct vdo_completion *completion) { return dereference_page_completion( validate_completed_page(completion, false)); } -/**********************************************************************/ -void *dereference_writable_vdo_page(struct vdo_completion *completion) +/** + * vdo_dereference_writable_page() - Access the raw memory for a writable page + * of a completed vdo_page_completion. + * @completion: A vdo page completion whose callback has been called. + * + * Return: A pointer to the raw memory at the beginning of the page, or + * NULL if the page is not available, or if the page is read-only. + */ +void *vdo_dereference_writable_page(struct vdo_completion *completion) { return dereference_page_completion(validate_completed_page(completion, true)); } -/**********************************************************************/ -void *get_vdo_page_completion_context(struct vdo_completion *completion) +/** + * vdo_get_page_completion_context() - Get the per-page client context for the + * page in a page completion whose + * callback has been invoked. + * @completion: A vdo page completion whose callback has been invoked. + * + * Should only be called after dereferencing the page completion to validate + * the page. 
+ * + * Return: A pointer to the per-page client context, or NULL if + * the page is not available. + */ +void *vdo_get_page_completion_context(struct vdo_completion *completion) { struct vdo_page_completion *page_completion = as_vdo_page_completion(completion); @@ -1509,36 +1681,47 @@ void *get_vdo_page_completion_context(struct vdo_completion *completion) return (((info != NULL) && is_valid(info)) ? info->context : NULL); } -/**********************************************************************/ -void drain_vdo_page_cache(struct vdo_page_cache *cache) +/** + * vdo_drain_page_cache() - Drain I/O for a page cache. + * @cache: The cache to drain. + */ +void vdo_drain_page_cache(struct vdo_page_cache *cache) { assert_on_cache_thread(cache, __func__); - ASSERT_LOG_ONLY(is_vdo_state_draining(&cache->zone->state), - "drain_vdo_page_cache() called during block map drain"); + ASSERT_LOG_ONLY(vdo_is_state_draining(&cache->zone->state), + "vdo_drain_page_cache() called during block map drain"); - if (!is_vdo_state_suspending(&cache->zone->state)) { - flush_vdo_dirty_lists(cache->dirty_lists); + if (!vdo_is_state_suspending(&cache->zone->state)) { + vdo_flush_dirty_lists(cache->dirty_lists); save_pages(cache); } } -/**********************************************************************/ -int invalidate_vdo_page_cache(struct vdo_page_cache *cache) +/** + * vdo_invalidate_page_cache() - Invalidate all entries in the VDO page cache. + * @cache: The cache to invalidate. + * + * There must not be any dirty pages in the cache. + * + * Return: A success or error code. + */ +int vdo_invalidate_page_cache(struct vdo_page_cache *cache) { struct page_info *info; + assert_on_cache_thread(cache, __func__); - // Make sure we don't throw away any dirty pages. + /* Make sure we don't throw away any dirty pages. */ for (info = cache->infos; info < cache->infos + cache->page_count; info++) { - int result = ASSERT(!is_vdo_page_dirty(info), + int result = ASSERT(!is_dirty(info), "cache must have no dirty pages"); if (result != VDO_SUCCESS) { return result; } } - // Reset the page map by re-allocating it. + /* Reset the page map by re-allocating it. */ free_int_map(UDS_FORGET(cache->page_map)); return make_int_map(cache->page_count, 0, &cache->page_map); } diff --git a/vdo/vdo-page-cache.h b/vdo/vdo-page-cache.h new file mode 100644 index 00000000..3a314cb8 --- /dev/null +++ b/vdo/vdo-page-cache.h @@ -0,0 +1,262 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright Red Hat + */ + +#ifndef VDO_PAGE_CACHE_H +#define VDO_PAGE_CACHE_H + +#include + + +#include "admin-state.h" +#include "completion.h" +#include "dirty-lists.h" +#include "int-map.h" +#include "statistics.h" +#include "types.h" +#include "wait-queue.h" + +/* + * Generation counter for page references. + */ +typedef uint32_t vdo_page_generation; + +/** + * typedef vdo_page_read_function - Signature for a function to call when a + * page is read into the cache. + * @raw_page: The raw memory of the freshly-fetched page. + * @pbn: The absolute physical block number of the page. + * @zone: The block map zone to which the cache belongs. + * @page_context: A pointer to client-specific data for the new page. + * + * If specified, this function is called when a page is fetched from disk. + * + * Return: VDO_SUCCESS on success or VDO_BAD_PAGE if the page is incorrectly + * formatted. 
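+ *
+ * A minimal sketch of such a hook (illustrative only, not part of this
+ * patch; example_read_hook() and validate_my_page_format() are
+ * hypothetical names):
+ *
+ *	static int example_read_hook(void *raw_page,
+ *				     physical_block_number_t pbn,
+ *				     struct block_map_zone *zone,
+ *				     void *page_context)
+ *	{
+ *		if (!validate_my_page_format(raw_page, pbn))
+ *			return VDO_BAD_PAGE;
+ *
+ *		return VDO_SUCCESS;
+ *	}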
+ */ +typedef int vdo_page_read_function(void *raw_page, + physical_block_number_t pbn, + struct block_map_zone *zone, + void *page_context); + +/** + * typedef vdo_page_write_function - Signature for a function to call when a + * page is written from the cache. + * @raw_page: The raw memory of the freshly-written page. + * @zone: The block map zone to which the cache belongs. + * @page_context: A pointer to client-specific data for the new page. + * + * If specified, this function is called when a page is written to disk. + * + * Return: Whether the page needs to be rewritten. + */ +typedef bool vdo_page_write_function(void *raw_page, + struct block_map_zone *zone, + void *page_context); + +enum { + MAX_PAGE_CONTEXT_SIZE = 8, +}; + +static const physical_block_number_t NO_PAGE = 0xFFFFFFFFFFFFFFFF; + +/* + * The VDO Page Cache abstraction. + */ +struct vdo_page_cache { + /* the VDO which owns this cache */ + struct vdo *vdo; + /* number of pages in cache */ + page_count_t page_count; + /* function to call on page read */ + vdo_page_read_function *read_hook; + /* function to call on page write */ + vdo_page_write_function *write_hook; + /* number of pages to write in the current batch */ + page_count_t pages_in_batch; + /* Whether the VDO is doing a read-only rebuild */ + bool rebuilding; + + /* array of page information entries */ + struct page_info *infos; + /* raw memory for pages */ + char *pages; + /* cache last found page info */ + struct page_info *last_found; + /* map of page number to info */ + struct int_map *page_map; + /* main LRU list (all infos) */ + struct list_head lru_list; + /* dirty pages by period */ + struct dirty_lists *dirty_lists; + /* free page list (oldest first) */ + struct list_head free_list; + /* outgoing page list */ + struct list_head outgoing_list; + /* number of read I/O operations pending */ + page_count_t outstanding_reads; + /* number of write I/O operations pending */ + page_count_t outstanding_writes; + /* number of pages covered by the current flush */ + page_count_t pages_in_flush; + /* number of pages waiting to be included in the next flush */ + page_count_t pages_to_flush; + /* number of discards in progress */ + unsigned int discard_count; + /* how many VPCs waiting for free page */ + unsigned int waiter_count; + /* queue of waiters who want a free page */ + struct wait_queue free_waiters; + /* + * Statistics are only updated on the logical zone thread, but are + * accessed from other threads. + */ + struct block_map_statistics stats; + /* counter for pressure reports */ + uint32_t pressure_report; + /* the block map zone to which this cache belongs */ + struct block_map_zone *zone; +}; + +/* + * The state of a page buffer. If the page buffer is free no particular page is + * bound to it, otherwise the page buffer is bound to particular page whose + * absolute pbn is in the pbn field. If the page is resident or dirty the page + * data is stable and may be accessed. Otherwise the page is in flight + * (incoming or outgoing) and its data should not be accessed. + * + * @note Update the static data in get_page_state_name() if you change this + * enumeration. 
+ */
+enum vdo_page_buffer_state {
+	/* this page buffer is not being used */
+	PS_FREE,
+	/* this page is being read from store */
+	PS_INCOMING,
+	/* attempt to load this page failed */
+	PS_FAILED,
+	/* this page is valid and un-modified */
+	PS_RESIDENT,
+	/* this page is valid and modified */
+	PS_DIRTY,
+	/* this page is being written and should not be used */
+	PS_OUTGOING,
+	/* not a state */
+	PAGE_STATE_COUNT,
+} __packed;
+
+/*
+ * The write status of a page.
+ */
+enum vdo_page_write_status {
+	WRITE_STATUS_NORMAL,
+	WRITE_STATUS_DISCARD,
+	WRITE_STATUS_DEFERRED,
+} __packed;
+
+/*
+ * Per-page-slot information.
+ */
+struct page_info {
+	/* Preallocated page struct vio */
+	struct vio *vio;
+	/* back-link for references */
+	struct vdo_page_cache *cache;
+	/* the pbn of the page */
+	physical_block_number_t pbn;
+	/* page is busy (temporarily locked) */
+	uint16_t busy;
+	/* the write status of the page */
+	enum vdo_page_write_status write_status;
+	/* page state */
+	enum vdo_page_buffer_state state;
+	/* queue of completions awaiting this item */
+	struct wait_queue waiting;
+	/* state linked list entry */
+	struct list_head state_entry;
+	/* LRU entry */
+	struct list_head lru_entry;
+	/* Space for per-page client data */
+	byte context[MAX_PAGE_CONTEXT_SIZE];
+};
+
+int __must_check vdo_make_page_cache(struct vdo *vdo,
+				     page_count_t page_count,
+				     vdo_page_read_function *read_hook,
+				     vdo_page_write_function *write_hook,
+				     size_t page_context_size,
+				     block_count_t maximum_age,
+				     struct block_map_zone *zone,
+				     struct vdo_page_cache **cache_ptr);
+
+void vdo_free_page_cache(struct vdo_page_cache *cache);
+
+void vdo_set_page_cache_initial_period(struct vdo_page_cache *cache,
+				       sequence_number_t period);
+
+void vdo_set_page_cache_rebuild_mode(struct vdo_page_cache *cache,
+				     bool rebuilding);
+
+bool __must_check vdo_is_page_cache_active(struct vdo_page_cache *cache);
+
+void vdo_advance_page_cache_period(struct vdo_page_cache *cache,
+				   sequence_number_t period);
+
+/* ASYNC */
+
+/*
+ * A completion awaiting a specific page. Also a live reference into the
+ * page once completed, until freed.
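+ *
+ * A minimal usage sketch (illustrative only, not part of this patch;
+ * my_callback, my_error_handler, and the surrounding variable names are
+ * hypothetical). The caller initializes the completion, launches the
+ * fetch, and, in the callback, dereferences and finally releases the page:
+ *
+ *	vdo_init_page_completion(&vpc, cache, pbn, false, parent,
+ *				 my_callback, my_error_handler);
+ *	vdo_get_page(&vpc.completion);
+ *
+ *	// later, in my_callback(completion):
+ *	const void *page = vdo_dereference_readable_page(completion);
+ *	// ... read from page ...
+ *	vdo_release_page_completion(completion);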
+ */ +struct vdo_page_completion { + /* The generic completion */ + struct vdo_completion completion; + /* The cache involved */ + struct vdo_page_cache *cache; + /* The waiter for the pending list */ + struct waiter waiter; + /* The absolute physical block number of the page on disk */ + physical_block_number_t pbn; + /* Whether the page may be modified */ + bool writable; + /* Whether the page is available */ + bool ready; + /* The info structure for the page, only valid when ready */ + struct page_info *info; +}; + +void vdo_init_page_completion(struct vdo_page_completion *page_completion, + struct vdo_page_cache *cache, + physical_block_number_t pbn, + bool writable, + void *parent, + vdo_action *callback, + vdo_action *error_handler); + +void vdo_release_page_completion(struct vdo_completion *completion); + +void vdo_get_page(struct vdo_completion *completion); + +void vdo_mark_completed_page_dirty(struct vdo_completion *completion, + sequence_number_t old_dirty_period, + sequence_number_t new_dirty_period); + +void vdo_request_page_write(struct vdo_completion *completion); + +const void *vdo_dereference_readable_page(struct vdo_completion *completion); + +void *vdo_dereference_writable_page(struct vdo_completion *completion); + +void *vdo_get_page_completion_context(struct vdo_completion *completion); + +void vdo_drain_page_cache(struct vdo_page_cache *cache); + +int __must_check vdo_invalidate_page_cache(struct vdo_page_cache *cache); + +/* STATISTICS & TESTING */ + +struct block_map_statistics __must_check +vdo_get_page_cache_statistics(const struct vdo_page_cache *cache); + +#endif /* VDO_PAGE_CACHE_H */ diff --git a/vdo/vdoRecovery.c b/vdo/vdo-recovery.c similarity index 55% rename from vdo/vdoRecovery.c rename to vdo/vdo-recovery.c index 547e280a..5ec60bc8 100644 --- a/vdo/vdoRecovery.c +++ b/vdo/vdo-recovery.c @@ -1,80 +1,140 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * Copyright Red Hat - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. 
- * - * $Id: //eng/vdo-releases/sulfur/src/c++/vdo/base/vdoRecovery.c#47 $ */ -#include "vdoRecoveryInternals.h" +#include "vdo-recovery.h" #include "logger.h" -#include "memoryAlloc.h" +#include "memory-alloc.h" #include "permassert.h" -#include "blockAllocator.h" -#include "blockAllocatorInternals.h" -#include "blockMapInternals.h" -#include "blockMapPage.h" -#include "blockMapRecovery.h" +#include "block-allocator.h" +#include "block-map.h" +#include "block-map-page.h" +#include "block-map-recovery.h" #include "completion.h" -#include "numUtils.h" -#include "packedRecoveryJournalBlock.h" -#include "recoveryJournal.h" -#include "recoveryUtils.h" +#include "int-map.h" +#include "journal-point.h" +#include "num-utils.h" +#include "packed-recovery-journal-block.h" +#include "recovery-journal.h" +#include "recovery-utils.h" #include "slab.h" -#include "slabDepot.h" -#include "slabJournal.h" -#include "slabJournalInternals.h" -#include "vdoInternal.h" -#include "waitQueue.h" +#include "slab-depot.h" +#include "slab-journal.h" +#include "thread-config.h" +#include "types.h" +#include "vdo.h" +#include "wait-queue.h" + +/* + * The absolute position of an entry in the recovery journal, including + * the sector number and the entry number within the sector. + */ +struct recovery_point { + sequence_number_t sequence_number; /* Block sequence number */ + uint8_t sector_count; /* Sector number */ + journal_entry_count_t entry_count; /* Entry number */ +}; + +struct recovery_completion { + /* The completion header */ + struct vdo_completion completion; + /* The sub-task completion */ + struct vdo_completion sub_task_completion; + /* The struct vdo in question */ + struct vdo *vdo; + /* The struct block_allocator whose journals are being recovered */ + struct block_allocator *allocator; + /* A buffer to hold the data read off disk */ + char *journal_data; + /* The number of increfs */ + size_t incref_count; + + /* The entry data for the block map recovery */ + struct numbered_block_mapping *entries; + /* The number of entries in the entry array */ + size_t entry_count; + /* + * The sequence number of the first valid block for block map recovery + */ + sequence_number_t block_map_head; + /* + * The sequence number of the first valid block for slab journal replay + */ + sequence_number_t slab_journal_head; + /* + * The sequence number of the last valid block of the journal (if + * known) + */ + sequence_number_t tail; + /* + * The highest sequence number of the journal, not the same as the tail, + * since the tail ignores blocks after the first hole. 
+ */ + sequence_number_t highest_tail; + + /* A location just beyond the last valid entry of the journal */ + struct recovery_point tail_recovery_point; + /* The location of the next recovery journal entry to apply */ + struct recovery_point next_recovery_point; + /* The number of logical blocks currently known to be in use */ + block_count_t logical_blocks_used; + /* The number of block map data blocks known to be allocated */ + block_count_t block_map_data_blocks; + /* The journal point to give to the next synthesized decref */ + struct journal_point next_journal_point; + /* The number of entries played into slab journals */ + size_t entries_added_to_slab_journals; + + /* Decref synthesis fields */ + + /* An int_map for use in finding which slots are missing decrefs */ + struct int_map *slot_entry_map; + /* The number of synthesized decrefs */ + size_t missing_decref_count; + /* The number of incomplete decrefs */ + size_t incomplete_decref_count; + /* The fake journal point of the next missing decref */ + struct journal_point next_synthesized_journal_point; + /* The queue of missing decrefs */ + struct wait_queue missing_decrefs[]; +}; enum { - // The int map needs capacity of twice the number of VIOs in the system. + /* The int map needs capacity of twice the number of VIOs in the system. */ INT_MAP_CAPACITY = MAXIMUM_VDO_USER_VIOS * 2, - // There can be as many missing decrefs as there are VIOs in the system. + /* There can be as many missing decrefs as there are VIOs in the system. */ MAXIMUM_SYNTHESIZED_DECREFS = MAXIMUM_VDO_USER_VIOS, }; struct missing_decref { - /** A waiter for queueing this object */ + /* A waiter for queueing this object */ struct waiter waiter; - /** The parent of this object */ + /* The parent of this object */ struct recovery_completion *recovery; - /** Whether this decref is complete */ + /* Whether this decref is complete */ bool complete; - /** The slot for which the last decref was lost */ + /* The slot for which the last decref was lost */ struct block_map_slot slot; - /** The penultimate block map entry for this LBN */ + /* The penultimate block map entry for this LBN */ struct data_location penultimate_mapping; - /** The page completion used to fetch the block map page for this LBN */ + /* The page completion used to fetch the block map page for this LBN */ struct vdo_page_completion page_completion; - /** The journal point which will be used for this entry */ + /* The journal point which will be used for this entry */ struct journal_point journal_point; - /** The slab journal to which this entry will be applied */ + /* The slab journal to which this entry will be applied */ struct slab_journal *slab_journal; }; /** - * Convert a waiter to the missing decref of which it is a part. + * as_missing_decref() - Convert a waiter to the missing decref of which it is + * a part. + * @waiter: The waiter to convert. * - * @param waiter The waiter to convert - * - * @return The missing_decref wrapping the waiter - **/ + * Return: The missing_decref wrapping the waiter. + */ static inline struct missing_decref * __must_check as_missing_decref(struct waiter *waiter) { @@ -82,21 +142,23 @@ as_missing_decref(struct waiter *waiter) } /** - * Enqueue a missing_decref. If the enqueue fails, enter read-only mode. + * enqueue_missing_decref() - Enqueue a missing_decref. + * @queue: The queue on which to enqueue the decref. + * @decref: The missing_decref to enqueue. 
* - * @param queue The queue on which to enqueue the decref - * @param decref The missing_decref to enqueue + * If the enqueue fails, enter read-only mode. * - * @return VDO_SUCCESS or an error - **/ + * Return: VDO_SUCCESS or an error. + */ static int enqueue_missing_decref(struct wait_queue *queue, struct missing_decref *decref) { int result = enqueue_waiter(queue, &decref->waiter); + if (result != VDO_SUCCESS) { vdo_enter_read_only_mode(decref->recovery->vdo->read_only_notifier, result); - set_vdo_completion_result(&decref->recovery->completion, result); + vdo_set_completion_result(&decref->recovery->completion, result); UDS_FREE(decref); } @@ -104,28 +166,52 @@ static int enqueue_missing_decref(struct wait_queue *queue, } /** - * Convert a block_map_slot into a unique uint64_t. + * as_vdo_recovery_completion() - Convert a generic completion to a + * recovery_completion. + * @completion: The completion to convert. * - * @param slot The block map slot to convert. + * Return: The recovery_completion. + */ +static inline struct recovery_completion * __must_check +as_vdo_recovery_completion(struct vdo_completion *completion) +{ + vdo_assert_completion_type(completion->type, VDO_RECOVERY_COMPLETION); + return container_of(completion, struct recovery_completion, completion); +} + +/** + * slot_as_number() - Convert a block_map_slot into a unique uint64_t. + * @slot: The block map slot to convert.. * - * @return a one-to-one mappable uint64_t. - **/ + * Return: A one-to-one mappable uint64_t. + */ static uint64_t slot_as_number(struct block_map_slot slot) { return (((uint64_t) slot.pbn << 10) + slot.slot); } /** - * Create a missing_decref and enqueue it to wait for a determination of its - * penultimate mapping. + * is_replaying() - Check whether a vdo was replaying the recovery journal + * into the block map when it crashed. + * @vdo: The vdo to query. * - * @param [in] recovery The parent recovery completion - * @param [in] entry The recovery journal entry for the increment which - * is missing a decref - * @param [out] decref_ptr A pointer to hold the new missing_decref + * Return: true if the vdo crashed while reconstructing the block map. + */ +static bool __must_check is_replaying(const struct vdo *vdo) +{ + return (vdo_get_state(vdo) == VDO_REPLAYING); +} + +/** + * make_missing_decref() - Create a missing_decref and enqueue it to wait for + * a determination of its penultimate mapping. + * @recovery: The parent recovery completion. + * @entry: The recovery journal entry for the increment which is missing a + * decref. + * @decref_ptr: A pointer to hold the new missing_decref. * - * @return VDO_SUCCESS or an error code - **/ + * Return: VDO_SUCCESS or an error code. + */ static int __must_check make_missing_decref(struct recovery_completion *recovery, struct recovery_journal_entry entry, @@ -133,6 +219,7 @@ make_missing_decref(struct recovery_completion *recovery, { struct missing_decref *decref; int result = UDS_ALLOCATE(1, struct missing_decref, __func__, &decref); + if (result != VDO_SUCCESS) { return result; } @@ -173,10 +260,10 @@ make_missing_decref(struct recovery_completion *recovery, } /** - * Move the given recovery point forward by one entry. - * - * @param point The recovery point to alter - **/ + * increment_recovery_point() - Move the given recovery point forward by one + * entry. + * @point: The recovery point to alter. 
+ */ static void increment_recovery_point(struct recovery_point *point) { point->entry_count++; @@ -195,10 +282,10 @@ static void increment_recovery_point(struct recovery_point *point) } /** - * Move the given recovery point backwards by one entry. - * - * @param point The recovery point to alter - **/ + * decrement_recovery_point() - Move the given recovery point backwards by one + * entry. + * @point: The recovery point to alter. + */ static void decrement_recovery_point(struct recovery_point *point) { STATIC_ASSERT(RECOVERY_JOURNAL_ENTRIES_PER_LAST_SECTOR > 0); @@ -221,13 +308,13 @@ static void decrement_recovery_point(struct recovery_point *point) } /** - * Check whether the first point precedes the second point. + * before_recovery_point() - Check whether the first point precedes the second + * point. + * @first: The first recovery point. + * @second: The second recovery point. * - * @param first The first recovery point - * @param second The second recovery point - * - * @return true if the first point precedes the second point - **/ + * Return: true if the first point precedes the second point. + */ static bool __must_check before_recovery_point(const struct recovery_point *first, const struct recovery_point *second) @@ -249,55 +336,101 @@ before_recovery_point(const struct recovery_point *first, } /** - * Prepare the sub-task completion. - * - * @param recovery The recovery_completion whose sub-task completion is - * to be prepared - * @param callback The callback to register for the next sub-task - * @param error_handler The error handler for the next sub-task - * @param zone_type The type of zone on which the callback or - * error_handler should run - **/ + * prepare_sub_task() - Prepare the sub-task completion. + * @recovery: The recovery_completion whose sub-task completion is to be + * prepared. + * @callback: The callback to register for the next sub-task. + * @error_handler: The error handler for the next sub-task. + * @zone_type: The type of zone on which the callback or error_handler should + * run. + */ static void prepare_sub_task(struct recovery_completion *recovery, vdo_action callback, vdo_action error_handler, enum vdo_zone_type zone_type) { const struct thread_config *thread_config = - get_vdo_thread_config(recovery->vdo); + recovery->vdo->thread_config; thread_id_t thread_id; + switch (zone_type) { - case ZONE_TYPE_LOGICAL: - // All blockmap access is done on single thread, so use logical - // zone 0. + case VDO_ZONE_TYPE_LOGICAL: + /* + * All blockmap access is done on single thread, so use logical + * zone 0. + */ thread_id = vdo_get_logical_zone_thread(thread_config, 0); break; - case ZONE_TYPE_PHYSICAL: + case VDO_ZONE_TYPE_PHYSICAL: thread_id = recovery->allocator->thread_id; break; - case ZONE_TYPE_ADMIN: + case VDO_ZONE_TYPE_ADMIN: default: thread_id = thread_config->admin_thread; } - prepare_vdo_completion(&recovery->sub_task_completion, + vdo_prepare_completion(&recovery->sub_task_completion, callback, error_handler, thread_id, &recovery->completion); } -/**********************************************************************/ -int make_vdo_recovery_completion(struct vdo *vdo, - struct recovery_completion **recovery_ptr) +/** + * free_missing_decref() - A waiter callback to free missing_decrefs. + * + * Implements waiter_callback. 
+ */ +static void free_missing_decref(struct waiter *waiter, + void *context __always_unused) +{ + UDS_FREE(as_missing_decref(waiter)); +} + +/** + * free_vdo_recovery_completion() - Free a recovery_completion and all + * underlying structures. + * @recovery: The recovery completion to free. + */ +static void free_vdo_recovery_completion(struct recovery_completion *recovery) +{ + zone_count_t zone, zone_count; + + if (recovery == NULL) { + return; + } + + free_int_map(UDS_FORGET(recovery->slot_entry_map)); + zone_count = recovery->vdo->thread_config->physical_zone_count; + for (zone = 0; zone < zone_count; zone++) { + notify_all_waiters(&recovery->missing_decrefs[zone], + free_missing_decref, NULL); + } + + UDS_FREE(UDS_FORGET(recovery->journal_data)); + UDS_FREE(UDS_FORGET(recovery->entries)); + UDS_FREE(recovery); +} + +/** + * vdo_make_recovery_completion() - Allocate and initialize a + * recovery_completion. + * @vdo: The vdo in question. + * @recovery_ptr: A pointer to hold the new recovery_completion. + * + * Return: VDO_SUCCESS or a status code. + */ +static int __must_check +vdo_make_recovery_completion(struct vdo *vdo, + struct recovery_completion **recovery_ptr) { - const struct thread_config *thread_config = get_vdo_thread_config(vdo); struct recovery_completion *recovery; - zone_count_t z; + zone_count_t zone; + zone_count_t zone_count = vdo->thread_config->physical_zone_count; int result = UDS_ALLOCATE_EXTENDED(struct recovery_completion, - thread_config->physical_zone_count, + zone_count, struct list_head, __func__, &recovery); @@ -306,13 +439,13 @@ int make_vdo_recovery_completion(struct vdo *vdo, } recovery->vdo = vdo; - for (z = 0; z < thread_config->physical_zone_count; z++) { - initialize_wait_queue(&recovery->missing_decrefs[z]); + for (zone = 0; zone < zone_count; zone++) { + initialize_wait_queue(&recovery->missing_decrefs[zone]); } - initialize_vdo_completion(&recovery->completion, vdo, + vdo_initialize_completion(&recovery->completion, vdo, VDO_RECOVERY_COMPLETION); - initialize_vdo_completion(&recovery->sub_task_completion, vdo, + vdo_initialize_completion(&recovery->sub_task_completion, vdo, VDO_SUB_TASK_COMPLETION); result = make_int_map(INT_MAP_CAPACITY, 0, &recovery->slot_entry_map); @@ -326,43 +459,10 @@ int make_vdo_recovery_completion(struct vdo *vdo, } /** - * A waiter callback to free missing_decrefs. - * - * Implements waiter_callback. - **/ -static void free_missing_decref(struct waiter *waiter, - void *context __always_unused) -{ - UDS_FREE(as_missing_decref(waiter)); -} - -/**********************************************************************/ -void free_vdo_recovery_completion(struct recovery_completion *recovery) -{ - const struct thread_config *thread_config; - zone_count_t z; - - if (recovery == NULL) { - return; - } - - free_int_map(UDS_FORGET(recovery->slot_entry_map)); - thread_config = get_vdo_thread_config(recovery->vdo); - for (z = 0; z < thread_config->physical_zone_count; z++) { - notify_all_waiters(&recovery->missing_decrefs[z], - free_missing_decref, NULL); - } - - UDS_FREE(UDS_FORGET(recovery->journal_data)); - UDS_FREE(UDS_FORGET(recovery->entries)); - UDS_FREE(recovery); -} - -/** - * Finish recovering, free the recovery completion and notify the parent. - * - * @param completion The recovery completion - **/ + * finish_recovery() - Finish recovering, free the recovery completion and + * notify the parent. + * @completion: The recovery completion. 
+ */ static void finish_recovery(struct vdo_completion *completion) { int result; @@ -371,23 +471,25 @@ static void finish_recovery(struct vdo_completion *completion) as_vdo_recovery_completion(completion); struct vdo *vdo = recovery->vdo; uint64_t recovery_count = ++vdo->states.vdo.complete_recoveries; - initialize_vdo_recovery_journal_post_recovery(vdo->recovery_journal, + + vdo_initialize_recovery_journal_post_recovery(vdo->recovery_journal, recovery_count, recovery->highest_tail); free_vdo_recovery_completion(UDS_FORGET(recovery)); uds_log_info("Rebuild complete"); - // Now that we've freed the recovery completion and its vast array of - // journal entries, we can allocate refcounts. + /* + * Now that we've freed the recovery completion and its vast array of + * journal entries, we can allocate refcounts. + */ result = vdo_allocate_slab_ref_counts(vdo->depot); - finish_vdo_completion(parent, result); + vdo_finish_completion(parent, result); } /** - * Handle a recovery error. - * - * @param completion The recovery completion - **/ + * abort_recovery() - Handle a recovery error. + * @completion: The recovery completion. + */ static void abort_recovery(struct vdo_completion *completion) { struct vdo_completion *parent = completion->parent; @@ -396,17 +498,16 @@ static void abort_recovery(struct vdo_completion *completion) as_vdo_recovery_completion(completion); free_vdo_recovery_completion(UDS_FORGET(recovery)); uds_log_warning("Recovery aborted"); - finish_vdo_completion(parent, result); + vdo_finish_completion(parent, result); } /** - * Abort a recovery if there is an error. - * - * @param result The result to check - * @param recovery The recovery completion + * abort_recovery_on_error() - Abort a recovery if there is an error. + * @result: The result to check. + * @recovery: The recovery completion. * - * @return true if the result was an error - **/ + * Return: true if the result was an error. + */ static bool __must_check abort_recovery_on_error(int result, struct recovery_completion *recovery) { @@ -414,41 +515,41 @@ abort_recovery_on_error(int result, struct recovery_completion *recovery) return false; } - finish_vdo_completion(&recovery->completion, result); + vdo_finish_completion(&recovery->completion, result); return true; } /** - * Unpack the recovery journal entry associated with the given recovery point. - * - * @param recovery The recovery completion - * @param point The recovery point + * get_entry() - Unpack the recovery journal entry associated with the given + * recovery point. + * @recovery: The recovery completion. + * @point: The recovery point. * - * @return The unpacked contents of the matching recovery journal entry - **/ + * Return: The unpacked contents of the matching recovery journal entry. 
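+ *
+ * As a sketch of the arithmetic (assuming the usual 4096-byte journal
+ * blocks and 512-byte sectors): if the point's sequence number maps to
+ * journal block 7 and the point names sector 3, that sector is read from
+ * byte offset 7 * 4096 + 3 * 512 = 30208 of the saved journal data, and
+ * entry_count then indexes into that sector's entries[] array.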
+ */ static struct recovery_journal_entry get_entry(const struct recovery_completion *recovery, const struct recovery_point *point) { struct recovery_journal *journal = recovery->vdo->recovery_journal; physical_block_number_t block_number = - get_vdo_recovery_journal_block_number(journal, + vdo_get_recovery_journal_block_number(journal, point->sequence_number); off_t sector_offset = (block_number * VDO_BLOCK_SIZE) + (point->sector_count * VDO_SECTOR_SIZE); struct packed_journal_sector *sector = (struct packed_journal_sector *) &recovery->journal_data[sector_offset]; - return unpack_vdo_recovery_journal_entry(§or->entries[point->entry_count]); + return vdo_unpack_recovery_journal_entry(§or->entries[point->entry_count]); } /** - * Create an array of all valid journal entries, in order, and store it in the - * recovery completion. + * extract_journal_entries() - Create an array of all valid journal entries, + * in order, and store it in the recovery + * completion. + * @recovery: The recovery completion. * - * @param recovery The recovery completion - * - * @return VDO_SUCCESS or an error code - **/ + * Return: VDO_SUCCESS or an error code. + */ static int extract_journal_entries(struct recovery_completion *recovery) { struct recovery_point recovery_point = { @@ -473,7 +574,7 @@ static int extract_journal_entries(struct recovery_completion *recovery) &recovery->tail_recovery_point)) { struct recovery_journal_entry entry = get_entry(recovery, &recovery_point); - result = validate_vdo_recovery_journal_entry(recovery->vdo, + result = vdo_validate_recovery_journal_entry(recovery->vdo, &entry); if (result != VDO_SUCCESS) { vdo_enter_read_only_mode(recovery->vdo->read_only_notifier, @@ -481,12 +582,12 @@ static int extract_journal_entries(struct recovery_completion *recovery) return result; } - if (is_vdo_journal_increment_operation(entry.operation)) { + if (vdo_is_journal_increment_operation(entry.operation)) { recovery->entries[recovery->entry_count] = (struct numbered_block_mapping) { .block_map_slot = entry.slot, .block_map_entry = - pack_vdo_pbn(entry.mapping.pbn, + vdo_pack_pbn(entry.mapping.pbn, entry.mapping.state), .number = recovery->entry_count, }; @@ -507,69 +608,76 @@ static int extract_journal_entries(struct recovery_completion *recovery) } /** - * Extract journal entries and recover the block map. This callback is - * registered in start_super_block_save(). + * launch_block_map_recovery() - Extract journal entries and recover the block + * map. + * @completion: The sub-task completion. * - * @param completion The sub-task completion - **/ + * This callback is registered in start_super_block_save(). + */ static void launch_block_map_recovery(struct vdo_completion *completion) { int result; struct recovery_completion *recovery = as_vdo_recovery_completion(completion->parent); struct vdo *vdo = recovery->vdo; - assert_on_logical_zone_thread(vdo, 0, __func__); - // Extract the journal entries for the block map recovery. + vdo_assert_on_logical_zone_thread(vdo, 0, __func__); + + /* Extract the journal entries for the block map recovery. 
*/ result = extract_journal_entries(recovery); if (abort_recovery_on_error(result, recovery)) { return; } - prepare_vdo_completion_to_finish_parent(completion, &recovery->completion); - recover_vdo_block_map(vdo, recovery->entry_count, recovery->entries, + vdo_prepare_completion_to_finish_parent(completion, &recovery->completion); + vdo_recover_block_map(vdo, recovery->entry_count, recovery->entries, completion); } /** - * Finish flushing all slab journals and start a write of the super block. - * This callback is registered in add_synthesized_entries(). + * start_super_block_save() - Finish flushing all slab journals and start a + * write of the super block. + * @completion: The sub-task completion. * - * @param completion The sub-task completion - **/ + * This callback is registered in add_synthesized_entries(). + */ static void start_super_block_save(struct vdo_completion *completion) { struct recovery_completion *recovery = as_vdo_recovery_completion(completion->parent); struct vdo *vdo = recovery->vdo; - assert_on_admin_thread(vdo, __func__); + + vdo_assert_on_admin_thread(vdo, __func__); uds_log_info("Saving recovery progress"); - set_vdo_state(vdo, VDO_REPLAYING); + vdo_set_state(vdo, VDO_REPLAYING); - // The block map access which follows the super block save must be done - // on a logical thread. + /* + * The block map access which follows the super block save must be done + * on a logical thread. + */ prepare_sub_task(recovery, launch_block_map_recovery, - finish_vdo_completion_parent_callback, - ZONE_TYPE_LOGICAL); - save_vdo_components(vdo, completion); + vdo_finish_completion_parent_callback, + VDO_ZONE_TYPE_LOGICAL); + vdo_save_components(vdo, completion); } /** - * The callback from loading the slab depot. It will update the logical blocks - * and block map data blocks counts in the recovery journal and then drain the - * slab depot in order to commit the recovered slab journals. It is registered - * in apply_to_depot(). + * finish_recovering_depot() - The callback from loading the slab depot. + * @completion: The sub-task completion. * - * @param completion The sub-task completion - **/ + * Updates the logical blocks and block map data blocks counts in the recovery + * journal and then drains the slab depot in order to commit the recovered + * slab journals. It is registered in apply_to_depot(). + */ static void finish_recovering_depot(struct vdo_completion *completion) { struct recovery_completion *recovery = as_vdo_recovery_completion(completion->parent); struct vdo *vdo = recovery->vdo; - assert_on_admin_thread(vdo, __func__); + + vdo_assert_on_admin_thread(vdo, __func__); uds_log_info("Replayed %zu journal entries into slab journals", recovery->entries_added_to_slab_journals); @@ -582,32 +690,34 @@ static void finish_recovering_depot(struct vdo_completion *completion) prepare_sub_task(recovery, start_super_block_save, - finish_vdo_completion_parent_callback, - ZONE_TYPE_ADMIN); - drain_vdo_slab_depot(vdo->depot, VDO_ADMIN_STATE_RECOVERING, completion); + vdo_finish_completion_parent_callback, + VDO_ZONE_TYPE_ADMIN); + vdo_drain_slab_depot(vdo->depot, VDO_ADMIN_STATE_RECOVERING, completion); } /** - * The error handler for recovering slab journals. It will skip any remaining - * recovery on the current zone and propagate the error. It is registered in - * add_slab_journal_entries() and add_synthesized_entries(). + * handle_add_slab_journal_entry_error() - The error handler for recovering + * slab journals. 
+ * @completion: The completion of the block allocator being recovered. * - * @param completion The completion of the block allocator being recovered - **/ + * Skips any remaining recovery on the current zone and propagates the error. + * It is registered in add_slab_journal_entries() and + * add_synthesized_entries(). + */ static void handle_add_slab_journal_entry_error(struct vdo_completion *completion) { struct recovery_completion *recovery = as_vdo_recovery_completion(completion->parent); - notify_vdo_slab_journals_are_recovered(recovery->allocator, + vdo_notify_slab_journals_are_recovered(recovery->allocator, completion->result); } /** - * Add synthesized entries into slab journals, waiting when necessary. - * - * @param completion The allocator completion - **/ + * add_synthesized_entries() - Add synthesized entries into slab journals, + * waiting when necessary. + * @completion: The allocator completion. + */ static void add_synthesized_entries(struct vdo_completion *completion) { struct recovery_completion *recovery = @@ -615,8 +725,8 @@ static void add_synthesized_entries(struct vdo_completion *completion) struct wait_queue *missing_decrefs = &recovery->missing_decrefs[recovery->allocator->zone_number]; - // Get ready in case we need to enqueue again - prepare_vdo_completion(completion, + /* Get ready in case we need to enqueue again */ + vdo_prepare_completion(completion, add_synthesized_entries, handle_add_slab_journal_entry_error, completion->callback_thread_id, @@ -624,9 +734,9 @@ static void add_synthesized_entries(struct vdo_completion *completion) while (has_waiters(missing_decrefs)) { struct missing_decref *decref = as_missing_decref(get_first_waiter(missing_decrefs)); - if (!attempt_replay_into_vdo_slab_journal(decref->slab_journal, + if (!vdo_attempt_replay_into_slab_journal(decref->slab_journal, decref->penultimate_mapping.pbn, - DATA_DECREMENT, + VDO_JOURNAL_DATA_DECREMENT, &decref->journal_point, completion)) { return; @@ -636,24 +746,27 @@ static void add_synthesized_entries(struct vdo_completion *completion) UDS_FREE(decref); } - notify_vdo_slab_journals_are_recovered(recovery->allocator, + vdo_notify_slab_journals_are_recovered(recovery->allocator, VDO_SUCCESS); } /** - * Determine the LBNs used count as of the end of the journal (but - * not including any changes to that count from entries that will be - * synthesized later). + * compute_usages() - Determine the LBNs used count as of the end of the + * journal. + * @recovery: The recovery completion. * - * @param recovery The recovery completion + * Does not include any changes to that count from entries that will be + * synthesized later). * - * @return VDO_SUCCESS or an error - **/ + * Return: VDO_SUCCESS or an error. + */ __attribute__((__noinline__)) static int compute_usages(struct recovery_completion *recovery) { - // XXX VDO-5182: function is declared noinline to avoid what is likely - // a spurious valgrind error about this structure being uninitialized. + /* + * XXX VDO-5182: function is declared noinline to avoid what is likely + * a spurious valgrind error about this structure being uninitialized. 
+ */ struct recovery_point recovery_point = { .sequence_number = recovery->tail, .sector_count = 1, @@ -661,12 +774,13 @@ static int compute_usages(struct recovery_completion *recovery) }; struct recovery_journal *journal = recovery->vdo->recovery_journal; struct packed_journal_header *tail_header = - get_vdo_recovery_journal_block_header(journal, + vdo_get_recovery_journal_block_header(journal, recovery->journal_data, recovery->tail); struct recovery_block_header unpacked; - unpack_vdo_recovery_block_header(tail_header, &unpacked); + + vdo_unpack_recovery_block_header(tail_header, &unpacked); recovery->logical_blocks_used = unpacked.logical_blocks_used; recovery->block_map_data_blocks = unpacked.block_map_data_blocks; @@ -676,15 +790,15 @@ static int compute_usages(struct recovery_completion *recovery) get_entry(recovery, &recovery_point); if (vdo_is_mapped_location(&entry.mapping)) { switch (entry.operation) { - case DATA_INCREMENT: + case VDO_JOURNAL_DATA_INCREMENT: recovery->logical_blocks_used++; break; - case DATA_DECREMENT: + case VDO_JOURNAL_DATA_DECREMENT: recovery->logical_blocks_used--; break; - case BLOCK_MAP_INCREMENT: + case VDO_JOURNAL_BLOCK_MAP_INCREMENT: recovery->block_map_data_blocks++; break; @@ -705,27 +819,27 @@ static int compute_usages(struct recovery_completion *recovery) } /** - * Advance the current recovery and journal points. - * - * @param recovery The recovery_completion whose points are to be - * advanced - * @param entries_per_block The number of entries in a recovery journal block - **/ + * advance_points() - Advance the current recovery and journal points. + * @recovery: The recovery_completion whose points are to be advanced. + * @entries_per_block: The number of entries in a recovery journal block. + */ static void advance_points(struct recovery_completion *recovery, journal_entry_count_t entries_per_block) { increment_recovery_point(&recovery->next_recovery_point); - advance_vdo_journal_point(&recovery->next_journal_point, + vdo_advance_journal_point(&recovery->next_journal_point, entries_per_block); } /** - * Replay recovery journal entries into the slab journals of the allocator - * currently being recovered, waiting for slab journal tailblock space when - * necessary. This method is its own callback. + * add_slab_journal_entries() - Replay recovery journal entries into the slab + * journals of the allocator currently being + * recovered. + * @completion: The allocator completion. * - * @param completion The allocator completion - **/ + * Waits for slab journal tailblock space when necessary. This method is its + * own callback. + */ static void add_slab_journal_entries(struct vdo_completion *completion) { struct recovery_point *recovery_point; @@ -734,8 +848,8 @@ static void add_slab_journal_entries(struct vdo_completion *completion) struct vdo *vdo = recovery->vdo; struct recovery_journal *journal = vdo->recovery_journal; - // Get ready in case we need to enqueue again. - prepare_vdo_completion(completion, + /* Get ready in case we need to enqueue again. 
*/ + vdo_prepare_completion(completion, add_slab_journal_entries, handle_add_slab_journal_entry_error, completion->callback_thread_id, @@ -747,11 +861,12 @@ static void add_slab_journal_entries(struct vdo_completion *completion) struct vdo_slab *slab; struct recovery_journal_entry entry = get_entry(recovery, recovery_point); - int result = validate_vdo_recovery_journal_entry(vdo, &entry); + int result = vdo_validate_recovery_journal_entry(vdo, &entry); + if (result != VDO_SUCCESS) { vdo_enter_read_only_mode(journal->read_only_notifier, result); - finish_vdo_completion(completion, result); + vdo_finish_completion(completion, result); return; } @@ -759,12 +874,12 @@ static void add_slab_journal_entries(struct vdo_completion *completion) continue; } - slab = get_vdo_slab(vdo->depot, entry.mapping.pbn); + slab = vdo_get_slab(vdo->depot, entry.mapping.pbn); if (slab->allocator != recovery->allocator) { continue; } - if (!attempt_replay_into_vdo_slab_journal(slab->journal, + if (!vdo_attempt_replay_into_slab_journal(slab->journal, entry.mapping.pbn, entry.operation, &recovery->next_journal_point, @@ -780,17 +895,27 @@ static void add_slab_journal_entries(struct vdo_completion *completion) add_synthesized_entries(completion); } -/**********************************************************************/ +/** + * vdo_replay_into_slab_journals() - Replay recovery journal entries in the + * slab journals of slabs owned by a given + * block_allocator. + * @allocator: The allocator whose slab journals are to be recovered. + * @completion: The completion to use for waiting on slab journal space. + * @context: The slab depot load context supplied by a recovery when it loads + * the depot. + */ void vdo_replay_into_slab_journals(struct block_allocator *allocator, struct vdo_completion *completion, void *context) { struct recovery_completion *recovery = context; - assert_on_physical_zone_thread(recovery->vdo, allocator->zone_number, - __func__); + + vdo_assert_on_physical_zone_thread(recovery->vdo, + allocator->zone_number, + __func__); if ((recovery->journal_data == NULL) || is_replaying(recovery->vdo)) { - // there's nothing to replay - notify_vdo_slab_journals_are_recovered(allocator, VDO_SUCCESS); + /* there's nothing to replay */ + vdo_notify_slab_journals_are_recovered(allocator, VDO_SUCCESS); return; } @@ -813,50 +938,53 @@ void vdo_replay_into_slab_journals(struct block_allocator *allocator, } /** - * A waiter callback to enqueue a missing_decref on the queue for the physical - * zone in which it will be applied. + * queue_on_physical_zone() - A waiter callback to enqueue a missing_decref on + * the queue for the physical zone in which it will + * be applied. * * Implements waiter_callback. - **/ + */ static void queue_on_physical_zone(struct waiter *waiter, void *context) { zone_count_t zone_number; struct missing_decref *decref = as_missing_decref(waiter); struct data_location mapping = decref->penultimate_mapping; + if (vdo_is_mapped_location(&mapping)) { decref->recovery->logical_blocks_used--; } if (mapping.pbn == VDO_ZERO_BLOCK) { - // Decrefs of zero are not applied to slab journals. + /* Decrefs of zero are not applied to slab journals. 
*/ UDS_FREE(decref); return; } decref->slab_journal = - get_vdo_slab_journal((struct slab_depot *) context, mapping.pbn); + vdo_get_slab_journal((struct slab_depot *) context, mapping.pbn); zone_number = decref->slab_journal->slab->allocator->zone_number; enqueue_missing_decref(&decref->recovery->missing_decrefs[zone_number], decref); } /** - * Queue each missing decref on the slab journal to which it is to be applied - * then load the slab depot. This callback is registered in - * find_slab_journal_entries(). + * apply_to_depot() - Queue each missing decref on the slab journal to which + * it is to be applied then load the slab depot. + * @completion: The sub-task completion. * - * @param completion The sub-task completion - **/ + * This callback is registered in find_slab_journal_entries(). + */ static void apply_to_depot(struct vdo_completion *completion) { struct recovery_completion *recovery = as_vdo_recovery_completion(completion->parent); - struct slab_depot *depot = get_slab_depot(recovery->vdo); - assert_on_admin_thread(recovery->vdo, __func__); + struct slab_depot *depot = recovery->vdo->depot; + + vdo_assert_on_admin_thread(recovery->vdo, __func__); prepare_sub_task(recovery, finish_recovering_depot, - finish_vdo_completion_parent_callback, - ZONE_TYPE_ADMIN); + vdo_finish_completion_parent_callback, + VDO_ZONE_TYPE_ADMIN); notify_all_waiters(&recovery->missing_decrefs[0], queue_on_physical_zone, depot); @@ -864,24 +992,26 @@ static void apply_to_depot(struct vdo_completion *completion) return; } - load_vdo_slab_depot(depot, VDO_ADMIN_STATE_LOADING_FOR_RECOVERY, + vdo_load_slab_depot(depot, VDO_ADMIN_STATE_LOADING_FOR_RECOVERY, completion, recovery); } /** - * Validate the location of the penultimate mapping for a missing_decref. If it - * is valid, enqueue it for the appropriate physical zone or account for it. - * Otherwise, dispose of it and signal an error. + * record_missing_decref() - Validate the location of the penultimate mapping + * for a missing_decref. + * @decref: The decref whose penultimate mapping has just been found. + * @location: The penultimate mapping. + * @error_code: The error code to use if the location is invalid. * - * @param decref The decref whose penultimate mapping has just been found - * @param location The penultimate mapping - * @param error_code The error code to use if the location is invalid - **/ + * If it is valid, enqueue it for the appropriate physical zone or account for + * it. Otherwise, dispose of it and signal an error. + */ static int record_missing_decref(struct missing_decref *decref, struct data_location location, int error_code) { struct recovery_completion *recovery = decref->recovery; + recovery->incomplete_decref_count--; if (vdo_is_valid_location(&location) && vdo_is_physical_data_block(recovery->vdo->depot, location.pbn)) { @@ -890,9 +1020,9 @@ static int record_missing_decref(struct missing_decref *decref, return VDO_SUCCESS; } - // The location was invalid + /* The location was invalid */ vdo_enter_read_only_mode(recovery->vdo->read_only_notifier, error_code); - set_vdo_completion_result(&recovery->completion, error_code); + vdo_set_completion_result(&recovery->completion, error_code); uds_log_error_strerror(error_code, "Invalid mapping for pbn %llu with state %u", (unsigned long long) location.pbn, @@ -901,11 +1031,12 @@ static int record_missing_decref(struct missing_decref *decref, } /** - * Find the block map slots with missing decrefs. + * find_missing_decrefs() - Find the block map slots with missing decrefs. 
+ * @recovery: The recovery completion. * - * To find the slots missing decrefs, we iterate through the journal in reverse - * so we see decrefs before increfs; if we see an incref before its paired - * decref, we instantly know this incref is missing its decref. + * To find the slots missing decrefs, we iterate through the journal in + * reverse so we see decrefs before increfs; if we see an incref before its + * paired decref, we instantly know this incref is missing its decref. * * Simultaneously, we attempt to determine the missing decref. If there is a * missing decref, and at least two increfs for that slot, we know we should @@ -913,16 +1044,16 @@ static int record_missing_decref(struct missing_decref *decref, * incref for that slot: we must synthesize the decref out of the block map * instead of the recovery journal. * - * @param recovery The recovery completion - * - * @return VDO_SUCCESS or an error code - **/ + * Return: VDO_SUCCESS or an error code. + */ static int __must_check find_missing_decrefs(struct recovery_completion *recovery) { - // This placeholder decref is used to mark lbns for which we have - // observed a decref but not the paired incref (going backwards through - // the journal). + /* + * This placeholder decref is used to mark lbns for which we have + * observed a decref but not the paired incref (going backwards through + * the journal). + */ struct missing_decref found_decref; int result; @@ -931,8 +1062,10 @@ find_missing_decrefs(struct recovery_completion *recovery) struct recovery_point recovery_point; struct int_map *slot_entry_map = recovery->slot_entry_map; - // A buffer is allocated based on the number of incref entries found, so - // use the earliest head. + /* + * A buffer is allocated based on the number of incref entries found, so + * use the earliest head. + */ sequence_number_t head = min(recovery->block_map_head, recovery->slab_journal_head); struct recovery_point head_point = { @@ -941,8 +1074,10 @@ find_missing_decrefs(struct recovery_completion *recovery) .entry_count = 0, }; - // Set up for the first fake journal point that will be used for a - // synthesized entry. + /* + * Set up for the first fake journal point that will be used for a + * synthesized entry. + */ recovery->next_synthesized_journal_point = (struct journal_point) { .sequence_number = recovery->tail, .entry_count = @@ -954,10 +1089,12 @@ find_missing_decrefs(struct recovery_completion *recovery) decrement_recovery_point(&recovery_point); entry = get_entry(recovery, &recovery_point); - if (!is_vdo_journal_increment_operation(entry.operation)) { - // Observe that we've seen a decref before its incref, - // but only if the int_map does not contain an unpaired - // incref for this lbn. + if (!vdo_is_journal_increment_operation(entry.operation)) { + /* + * Observe that we've seen a decref before its incref, + * but only if the int_map does not contain an unpaired + * incref for this lbn. 
+ */ result = int_map_put(slot_entry_map, slot_as_number(entry.slot), &found_decref, @@ -973,8 +1110,8 @@ find_missing_decrefs(struct recovery_completion *recovery) recovery->incref_count++; decref = int_map_remove(slot_entry_map, - slot_as_number(entry.slot)); - if (entry.operation == BLOCK_MAP_INCREMENT) { + slot_as_number(entry.slot)); + if (entry.operation == VDO_JOURNAL_BLOCK_MAP_INCREMENT) { if (decref != NULL) { return uds_log_error_strerror(VDO_CORRUPT_JOURNAL, "decref found for block map block %llu with state %u", @@ -982,20 +1119,26 @@ find_missing_decrefs(struct recovery_completion *recovery) entry.mapping.state); } - // There are no decrefs for block map pages, so they - // can't be missing. + /* + * There are no decrefs for block map pages, so they + * can't be missing. + */ continue; } if (decref == &found_decref) { - // This incref already had a decref in the intmap, so - // we know it is not missing its decref. + /* + * This incref already had a decref in the intmap, so + * we know it is not missing its decref. + */ continue; } if (decref == NULL) { - // This incref is missing a decref. Add a missing - // decref object. + /* + * This incref is missing a decref. Add a missing + * decref object. + */ result = make_missing_decref(recovery, entry, &decref); if (result != VDO_SUCCESS) { return result; @@ -1030,58 +1173,65 @@ find_missing_decrefs(struct recovery_completion *recovery) } /** - * Process a fetched block map page for a missing decref. This callback is - * registered in find_slab_journal_entries(). + * process_fetched_page() - Process a fetched block map page for a missing + * decref. + * @completion: The page completion which has just finished loading. * - * @param completion The page completion which has just finished loading - **/ + * This callback is registered in find_slab_journal_entries(). + */ static void process_fetched_page(struct vdo_completion *completion) { struct missing_decref *current_decref = completion->parent; struct recovery_completion *recovery = current_decref->recovery; const struct block_map_page *page; struct data_location location; - assert_on_logical_zone_thread(recovery->vdo, 0, __func__); - page = dereference_readable_vdo_page(completion); + vdo_assert_on_logical_zone_thread(recovery->vdo, 0, __func__); + + page = vdo_dereference_readable_page(completion); location = - unpack_vdo_block_map_entry(&page->entries[current_decref->slot.slot]); - release_vdo_page_completion(completion); + vdo_unpack_block_map_entry(&page->entries[current_decref->slot.slot]); + vdo_release_page_completion(completion); record_missing_decref(current_decref, location, VDO_BAD_MAPPING); if (recovery->incomplete_decref_count == 0) { - complete_vdo_completion(&recovery->sub_task_completion); + vdo_complete_completion(&recovery->sub_task_completion); } } /** - * Handle an error fetching a block map page for a missing decref. - * This error handler is registered in find_slab_journal_entries(). + * handle_fetch_error() - Handle an error fetching a block map page for a + * missing decref. + * @completion: The page completion which has just finished loading. * - * @param completion The page completion which has just finished loading - **/ + * This error handler is registered in find_slab_journal_entries(). 
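+ *
+ * A VDO_OUT_OF_RANGE result from the page fetch means the pbn read
+ * from the journal was bad, so it is recorded as VDO_CORRUPT_JOURNAL
+ * on the sub-task completion.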
+ */ static void handle_fetch_error(struct vdo_completion *completion) { struct missing_decref *decref = completion->parent; struct recovery_completion *recovery = decref->recovery; - assert_on_logical_zone_thread(recovery->vdo, 0, __func__); - // If we got a VDO_OUT_OF_RANGE error, it is because the pbn we read - // from the journal was bad, so convert the error code - set_vdo_completion_result(&recovery->sub_task_completion, + vdo_assert_on_logical_zone_thread(recovery->vdo, 0, __func__); + + /* + * If we got a VDO_OUT_OF_RANGE error, it is because the pbn we read + * from the journal was bad, so convert the error code + */ + vdo_set_completion_result(&recovery->sub_task_completion, ((completion->result == VDO_OUT_OF_RANGE) ? VDO_CORRUPT_JOURNAL : completion->result)); - release_vdo_page_completion(completion); + vdo_release_page_completion(completion); if (--recovery->incomplete_decref_count == 0) { - complete_vdo_completion(&recovery->sub_task_completion); + vdo_complete_completion(&recovery->sub_task_completion); } } /** - * The waiter callback to requeue a missing decref and launch its page fetch. + * launch_fetch() - The waiter callback to requeue a missing decref and launch + * its page fetch. * * Implements waiter_callback. - **/ + */ static void launch_fetch(struct waiter *waiter, void *context) { struct missing_decref *decref = as_missing_decref(waiter); @@ -1094,26 +1244,29 @@ static void launch_fetch(struct waiter *waiter, void *context) } if (decref->complete) { - // We've already found the mapping for this decref, no fetch - // needed. + /* + * We've already found the mapping for this decref, no fetch + * needed. + */ return; } - init_vdo_page_completion(&decref->page_completion, + vdo_init_page_completion(&decref->page_completion, zone->page_cache, decref->slot.pbn, false, decref, process_fetched_page, handle_fetch_error); - get_vdo_page(&decref->page_completion.completion); + vdo_get_page(&decref->page_completion.completion); } /** - * Find all entries which need to be replayed into the slab journals. + * find_slab_journal_entries() - Find all entries which need to be replayed + * into the slab journals. * - * @param completion The sub-task completion - **/ + * @completion: The sub-task completion. + */ static void find_slab_journal_entries(struct vdo_completion *completion) { int result; @@ -1121,9 +1274,11 @@ static void find_slab_journal_entries(struct vdo_completion *completion) as_vdo_recovery_completion(completion->parent); struct vdo *vdo = recovery->vdo; - // We need to be on logical zone 0's thread since we are going to use - // its page cache. - assert_on_logical_zone_thread(vdo, 0, __func__); + /* + * We need to be on logical zone 0's thread since we are going to use + * its page cache. + */ + vdo_assert_on_logical_zone_thread(vdo, 0, __func__); result = find_missing_decrefs(recovery); if (abort_recovery_on_error(result, recovery)) { return; @@ -1131,8 +1286,8 @@ static void find_slab_journal_entries(struct vdo_completion *completion) prepare_sub_task(recovery, apply_to_depot, - finish_vdo_completion_parent_callback, - ZONE_TYPE_ADMIN); + vdo_finish_completion_parent_callback, + VDO_ZONE_TYPE_ADMIN); /* * Increment the incomplete_decref_count so that the fetch callback @@ -1140,25 +1295,26 @@ static void find_slab_journal_entries(struct vdo_completion *completion) * of missing decrefs. */ if (recovery->incomplete_decref_count++ > 0) { - // Fetch block map pages to fill in the incomplete missing - // decrefs. 
+ /* + * Fetch block map pages to fill in the incomplete missing + * decrefs. + */ notify_all_waiters(&recovery->missing_decrefs[0], launch_fetch, - vdo_get_block_map_zone(get_block_map(vdo), 0)); + &vdo->block_map->zones[0]); } if (--recovery->incomplete_decref_count == 0) { - complete_vdo_completion(completion); + vdo_complete_completion(completion); } } /** - * Find the contiguous range of journal blocks. - * - * @param recovery The recovery completion + * find_contiguous_range() - Find the contiguous range of journal blocks. + * @recovery: The recovery completion. * - * @return true if there were valid journal blocks - **/ + * Return: true if there were valid journal blocks. + */ static bool find_contiguous_range(struct recovery_completion *recovery) { struct recovery_journal *journal = recovery->vdo->recovery_journal; @@ -1167,6 +1323,7 @@ static bool find_contiguous_range(struct recovery_completion *recovery) bool found_entries = false; sequence_number_t i; + for (i = head; i <= recovery->highest_tail; i++) { struct packed_journal_header *packed_header; struct recovery_block_header header; @@ -1181,30 +1338,34 @@ static bool find_contiguous_range(struct recovery_completion *recovery) }; packed_header = - get_vdo_recovery_journal_block_header(journal, + vdo_get_recovery_journal_block_header(journal, recovery->journal_data, i); - unpack_vdo_recovery_block_header(packed_header, &header); + vdo_unpack_recovery_block_header(packed_header, &header); - if (!is_exact_vdo_recovery_journal_block(journal, &header, i) || + if (!vdo_is_exact_recovery_journal_block(journal, &header, i) || (header.entry_count > journal->entries_per_block)) { - // A bad block header was found so this must be the end - // of the journal. + /* + * A bad block header was found so this must be the end + * of the journal. + */ break; } block_entries = header.entry_count; - // Examine each sector in turn to determine the last valid - // sector. + /* + * Examine each sector in turn to determine the last valid + * sector. + */ for (j = 1; j < VDO_SECTORS_PER_BLOCK; j++) { struct packed_journal_sector *sector = - get_vdo_journal_block_sector(packed_header, j); + vdo_get_journal_block_sector(packed_header, j); journal_entry_count_t sector_entries = min((journal_entry_count_t) sector->entry_count, block_entries); - // A bad sector means that this block was torn. - if (!is_valid_vdo_recovery_journal_sector(&header, + /* A bad sector means that this block was torn. */ + if (!vdo_is_valid_recovery_journal_sector(&header, sector)) { break; } @@ -1217,8 +1378,10 @@ static bool find_contiguous_range(struct recovery_completion *recovery) block_entries -= sector_entries; } - // If this sector is short, the later sectors can't - // matter. + /* + * If this sector is short, the later sectors can't + * matter. + */ if ((sector_entries < RECOVERY_JOURNAL_ENTRIES_PER_SECTOR) || (block_entries == 0)) { @@ -1226,15 +1389,17 @@ static bool find_contiguous_range(struct recovery_completion *recovery) } } - // If this block was not filled, or if it tore, no later block - // can matter. + /* + * If this block was not filled, or if it tore, no later block + * can matter. + */ if ((header.entry_count != journal->entries_per_block) || (block_entries > 0)) { break; } } - // Set the tail to the last valid tail block, if there is one. + /* Set the tail to the last valid tail block, if there is one. 
*/ if (found_entries && (recovery->tail_recovery_point.sector_count == 0)) { recovery->tail--; @@ -1244,10 +1409,10 @@ static bool find_contiguous_range(struct recovery_completion *recovery) } /** - * Count the number of increment entries in the journal. - * - * @param recovery The recovery completion - **/ + * count_increment_entries() - Count the number of increment entries in the + * journal. + * @recovery: The recovery completion. + */ static int count_increment_entries(struct recovery_completion *recovery) { struct recovery_point recovery_point = { @@ -1260,14 +1425,14 @@ static int count_increment_entries(struct recovery_completion *recovery) struct recovery_journal_entry entry = get_entry(recovery, &recovery_point); int result = - validate_vdo_recovery_journal_entry(recovery->vdo, + vdo_validate_recovery_journal_entry(recovery->vdo, &entry); if (result != VDO_SUCCESS) { vdo_enter_read_only_mode(recovery->vdo->read_only_notifier, result); return result; } - if (is_vdo_journal_increment_operation(entry.operation)) { + if (vdo_is_journal_increment_operation(entry.operation)) { recovery->incref_count++; } increment_recovery_point(&recovery_point); @@ -1277,11 +1442,11 @@ static int count_increment_entries(struct recovery_completion *recovery) } /** - * Determine the limits of the valid recovery journal and prepare to replay - * into the slab journals and block map. - * - * @param completion The sub-task completion - **/ + * prepare_to_apply_journal_entries() - Determine the limits of the valid + * recovery journal and prepare to replay + * into the slab journals and block map. + * @completion: The sub-task completion. + */ static void prepare_to_apply_journal_entries(struct vdo_completion *completion) { bool found_entries; @@ -1290,9 +1455,10 @@ static void prepare_to_apply_journal_entries(struct vdo_completion *completion) as_vdo_recovery_completion(completion->parent); struct vdo *vdo = recovery->vdo; struct recovery_journal *journal = vdo->recovery_journal; + uds_log_info("Finished reading recovery journal"); found_entries = - find_vdo_recovery_journal_head_and_tail(journal, + vdo_find_recovery_journal_head_and_tail(journal, recovery->journal_data, &recovery->highest_tail, &recovery->block_map_head, @@ -1301,7 +1467,7 @@ static void prepare_to_apply_journal_entries(struct vdo_completion *completion) found_entries = find_contiguous_range(recovery); } - // Both reap heads must be behind the tail. + /* Both reap heads must be behind the tail. */ if ((recovery->block_map_head > recovery->tail) || (recovery->slab_journal_head > recovery->tail)) { result = uds_log_error_strerror(VDO_CORRUPT_JOURNAL, @@ -1309,21 +1475,21 @@ static void prepare_to_apply_journal_entries(struct vdo_completion *completion) (unsigned long long) recovery->block_map_head, (unsigned long long) recovery->slab_journal_head, (unsigned long long) recovery->tail); - finish_vdo_completion(&recovery->completion, result); + vdo_finish_completion(&recovery->completion, result); return; } if (!found_entries) { - // This message must be recognizable by VDOTest::RebuildBase. + /* This message must be recognizable by VDOTest::RebuildBase. */ uds_log_info("Replaying 0 recovery entries into block map"); - // We still need to load the slab_depot. + /* We still need to load the slab_depot. 
*/ UDS_FREE(recovery->journal_data); recovery->journal_data = NULL; prepare_sub_task(recovery, - finish_vdo_completion_parent_callback, - finish_vdo_completion_parent_callback, - ZONE_TYPE_ADMIN); - load_vdo_slab_depot(get_slab_depot(vdo), + vdo_finish_completion_parent_callback, + vdo_finish_completion_parent_callback, + VDO_ZONE_TYPE_ADMIN); + vdo_load_slab_depot(vdo->depot, VDO_ADMIN_STATE_LOADING_FOR_RECOVERY, completion, recovery); @@ -1335,20 +1501,22 @@ static void prepare_to_apply_journal_entries(struct vdo_completion *completion) (unsigned long long) recovery->tail); if (is_replaying(vdo)) { - // We need to know how many entries the block map rebuild - // completion will need to hold. + /* + * We need to know how many entries the block map rebuild + * completion will need to hold. + */ result = count_increment_entries(recovery); if (result != VDO_SUCCESS) { - finish_vdo_completion(&recovery->completion, result); + vdo_finish_completion(&recovery->completion, result); return; } - // We need to access the block map from a logical zone. + /* We need to access the block map from a logical zone. */ prepare_sub_task(recovery, launch_block_map_recovery, - finish_vdo_completion_parent_callback, - ZONE_TYPE_LOGICAL); - load_vdo_slab_depot(vdo->depot, + vdo_finish_completion_parent_callback, + VDO_ZONE_TYPE_LOGICAL); + vdo_load_slab_depot(vdo->depot, VDO_ADMIN_STATE_LOADING_FOR_RECOVERY, completion, recovery); @@ -1362,36 +1530,45 @@ static void prepare_to_apply_journal_entries(struct vdo_completion *completion) prepare_sub_task(recovery, find_slab_journal_entries, - finish_vdo_completion_parent_callback, - ZONE_TYPE_LOGICAL); - invoke_vdo_completion_callback(completion); + vdo_finish_completion_parent_callback, + VDO_ZONE_TYPE_LOGICAL); + vdo_invoke_completion_callback(completion); } -/**********************************************************************/ +/** + * vdo_launch_recovery() - Construct a recovery completion and launch it. + * @vdo: The vdo to recover. + * @parent: The completion to notify when the offline portion of the recovery + * is complete. + * + * Applies all valid journal block entries to all vdo structures. This + * function performs the offline portion of recovering a vdo from a crash. + */ + void vdo_launch_recovery(struct vdo *vdo, struct vdo_completion *parent) { struct recovery_completion *recovery; int result; - // Note: This message must be recognizable by Permabit::VDODeviceBase. + /* Note: This message must be recognizable by Permabit::VDODeviceBase. 
*/ uds_log_warning("Device was dirty, rebuilding reference counts"); - result = make_vdo_recovery_completion(vdo, &recovery); + result = vdo_make_recovery_completion(vdo, &recovery); if (result != VDO_SUCCESS) { - finish_vdo_completion(parent, result); + vdo_finish_completion(parent, result); return; } - prepare_vdo_completion(&recovery->completion, + vdo_prepare_completion(&recovery->completion, finish_recovery, abort_recovery, parent->callback_thread_id, parent); prepare_sub_task(recovery, prepare_to_apply_journal_entries, - finish_vdo_completion_parent_callback, - ZONE_TYPE_ADMIN); - load_vdo_recovery_journal(vdo->recovery_journal, + vdo_finish_completion_parent_callback, + VDO_ZONE_TYPE_ADMIN); + vdo_load_recovery_journal(vdo->recovery_journal, &recovery->sub_task_completion, &recovery->journal_data); } diff --git a/vdo/vdo-recovery.h b/vdo/vdo-recovery.h new file mode 100644 index 00000000..fdbe5293 --- /dev/null +++ b/vdo/vdo-recovery.h @@ -0,0 +1,18 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright Red Hat + */ + +#ifndef VDO_RECOVERY_H +#define VDO_RECOVERY_H + +#include "completion.h" +#include "vdo.h" + +void vdo_replay_into_slab_journals(struct block_allocator *allocator, + struct vdo_completion *completion, + void *context); + +void vdo_launch_recovery(struct vdo *vdo, struct vdo_completion *parent); + +#endif /* VDO_RECOVERY_H */ diff --git a/vdo/vdo-resize-logical.c b/vdo/vdo-resize-logical.c new file mode 100644 index 00000000..6a18e0b0 --- /dev/null +++ b/vdo/vdo-resize-logical.c @@ -0,0 +1,196 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright Red Hat + */ + +#include "vdo-resize-logical.h" + +#include "logger.h" + +#include "admin-completion.h" +#include "block-map.h" +#include "completion.h" +#include "kernel-types.h" +#include "thread-config.h" +#include "types.h" +#include "vdo.h" + +enum { + GROW_LOGICAL_PHASE_START, + GROW_LOGICAL_PHASE_GROW_BLOCK_MAP, + GROW_LOGICAL_PHASE_END, + GROW_LOGICAL_PHASE_ERROR, +}; + +static const char *GROW_LOGICAL_PHASE_NAMES[] = { + "GROW_LOGICAL_PHASE_START", + "GROW_LOGICAL_PHASE_GROW_BLOCK_MAP", + "GROW_LOGICAL_PHASE_END", + "GROW_LOGICAL_PHASE_ERROR", +}; + +/** + * get_thread_id_for_phase() - Implements vdo_thread_id_getter_for_phase. + */ +static thread_id_t __must_check +get_thread_id_for_phase(struct admin_completion *admin_completion) +{ + return admin_completion->vdo->thread_config->admin_thread; +} + +/** + * grow_logical_callback() - Callback to initiate a grow logical. + * @completion: The sub-task completion. + * + * Registered in vdo_perform_grow_logical(). 
+ */ +static void grow_logical_callback(struct vdo_completion *completion) +{ + struct admin_completion *admin_completion = + vdo_admin_completion_from_sub_task(completion); + struct vdo *vdo = admin_completion->vdo; + + vdo_assert_admin_operation_type(admin_completion, + VDO_ADMIN_OPERATION_GROW_LOGICAL); + vdo_assert_admin_phase_thread(admin_completion, __func__, + GROW_LOGICAL_PHASE_NAMES); + + switch (admin_completion->phase++) { + case GROW_LOGICAL_PHASE_START: + if (vdo_is_read_only(vdo->read_only_notifier)) { + uds_log_error_strerror(VDO_READ_ONLY, + "Can't grow logical size of a read-only VDO"); + vdo_finish_completion(vdo_reset_admin_sub_task(completion), + VDO_READ_ONLY); + return; + } + + if (vdo_start_operation_with_waiter(&vdo->admin_state, + VDO_ADMIN_STATE_SUSPENDED_OPERATION, + &admin_completion->completion, + NULL)) { + vdo->states.vdo.config.logical_blocks + = vdo->block_map->next_entry_count; + vdo_save_components(vdo, + vdo_reset_admin_sub_task(completion)); + } + + return; + + case GROW_LOGICAL_PHASE_GROW_BLOCK_MAP: + vdo_grow_block_map(vdo->block_map, + vdo_reset_admin_sub_task(completion)); + return; + + case GROW_LOGICAL_PHASE_END: + break; + + case GROW_LOGICAL_PHASE_ERROR: + vdo_enter_read_only_mode(vdo->read_only_notifier, + completion->result); + break; + + default: + vdo_set_completion_result(vdo_reset_admin_sub_task(completion), + UDS_BAD_STATE); + } + + vdo_finish_operation(&vdo->admin_state, completion->result); +} + +/** + * handle_growth_error() - Handle an error during the grow physical process. + * @completion: The sub-task completion. + */ +static void handle_growth_error(struct vdo_completion *completion) +{ + struct admin_completion *admin_completion = + vdo_admin_completion_from_sub_task(completion); + if (admin_completion->phase == GROW_LOGICAL_PHASE_GROW_BLOCK_MAP) { + /* + * We've failed to write the new size in the super block, so set + * our in memory config back to the old size. + */ + struct vdo *vdo = admin_completion->vdo; + + vdo->states.vdo.config.logical_blocks + = vdo->block_map->entry_count; + vdo_abandon_block_map_growth(vdo->block_map); + } + + admin_completion->phase = GROW_LOGICAL_PHASE_ERROR; + grow_logical_callback(completion); +} + +/** + * vdo_perform_grow_logical() - Grow the logical size of the vdo. + * @vdo: The vdo to grow. + * @new_logical_blocks: The size to which the vdo should be grown. + * + * Context: This method may only be called when the vdo has been suspended and + * must not be called from a base thread. + * + * Return: VDO_SUCCESS or an error. + */ +int vdo_perform_grow_logical(struct vdo *vdo, block_count_t new_logical_blocks) +{ + int result; + + if (vdo->device_config->logical_blocks == new_logical_blocks) { + /* + * A table was loaded for which we prepared to grow, but + * a table without that growth was what we are resuming with. + */ + vdo_abandon_block_map_growth(vdo->block_map); + return VDO_SUCCESS; + } + + uds_log_info("Resizing logical to %llu", + (unsigned long long) new_logical_blocks); + + if (vdo->block_map->next_entry_count != new_logical_blocks) { + return VDO_PARAMETER_MISMATCH; + } + + result = vdo_perform_admin_operation(vdo, + VDO_ADMIN_OPERATION_GROW_LOGICAL, + get_thread_id_for_phase, + grow_logical_callback, + handle_growth_error); + if (result != VDO_SUCCESS) { + return result; + } + + uds_log_info("Logical blocks now %llu", + (unsigned long long) new_logical_blocks); + return VDO_SUCCESS; +} + +/** + * vdo_prepare_to_grow_logical() - Prepare to grow the logical size of vdo. 
+ * @vdo: The vdo to prepare for growth. + * @new_logical_blocks: The size to which the vdo should be grown. + * + * Context: This method may only be called while the vdo is running. + * + * Return: VDO_SUCCESS or an error. + */ +int vdo_prepare_to_grow_logical(struct vdo *vdo, + block_count_t new_logical_blocks) +{ + block_count_t logical_blocks = vdo->states.vdo.config.logical_blocks; + int result; + + uds_log_info("Preparing to resize logical to %llu", + (unsigned long long) new_logical_blocks); + ASSERT_LOG_ONLY((new_logical_blocks > logical_blocks), + "New logical size is larger than current size"); + result = vdo_prepare_to_grow_block_map(vdo->block_map, + new_logical_blocks); + if (result != VDO_SUCCESS) { + return result; + } + + uds_log_info("Done preparing to resize logical"); + return VDO_SUCCESS; +} diff --git a/vdo/vdo-resize-logical.h b/vdo/vdo-resize-logical.h new file mode 100644 index 00000000..11a73b00 --- /dev/null +++ b/vdo/vdo-resize-logical.h @@ -0,0 +1,16 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright Red Hat + */ + +#ifndef VDO_RESIZE_LOGICAL_H +#define VDO_RESIZE_LOGICAL_H + +#include "kernel-types.h" +#include "types.h" + +int vdo_perform_grow_logical(struct vdo *vdo, block_count_t new_logical_blocks); + +int vdo_prepare_to_grow_logical(struct vdo *vdo, block_count_t new_logical_blocks); + +#endif /* VDO_RESIZE_LOGICAL_H */ diff --git a/vdo/vdo-resize.c b/vdo/vdo-resize.c new file mode 100644 index 00000000..e48f4cb1 --- /dev/null +++ b/vdo/vdo-resize.c @@ -0,0 +1,272 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright Red Hat + */ + +#include "vdo-resize.h" + +#include "logger.h" + +#include "admin-completion.h" +#include "completion.h" +#include "kernel-types.h" +#include "recovery-journal.h" +#include "slab-depot.h" +#include "slab-summary.h" +#include "thread-config.h" +#include "types.h" +#include "vdo.h" +#include "vdo-layout.h" + +enum { + GROW_PHYSICAL_PHASE_START, + GROW_PHYSICAL_PHASE_COPY_SUMMARY, + GROW_PHYSICAL_PHASE_UPDATE_COMPONENTS, + GROW_PHYSICAL_PHASE_USE_NEW_SLABS, + GROW_PHYSICAL_PHASE_END, + GROW_PHYSICAL_PHASE_ERROR, +}; + +static const char *GROW_PHYSICAL_PHASE_NAMES[] = { + "GROW_PHYSICAL_PHASE_START", + "GROW_PHYSICAL_PHASE_COPY_SUMMARY", + "GROW_PHYSICAL_PHASE_UPDATE_COMPONENTS", + "GROW_PHYSICAL_PHASE_USE_NEW_SLABS", + "GROW_PHYSICAL_PHASE_END", + "GROW_PHYSICAL_PHASE_ERROR", +}; + +/** + * get_thread_id_for_phase() - Implements vdo_thread_id_getter_for_phase. + */ +static thread_id_t __must_check +get_thread_id_for_phase(struct admin_completion *admin_completion) +{ + return admin_completion->vdo->thread_config->admin_thread; +} + +/** + * grow_physical_callback() - Callback to initiate a grow physical. + * @completion: The sub-task completion. + * + * Registered in vdo_perform_grow_physical(). 
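+ *
+ * The phases copy the recovery journal and slab summary partitions
+ * into the grown layout, update and save the vdo configuration,
+ * direct the slab depot to use the new slabs, and finally repoint
+ * the slab summary and recovery journal at their new partitions.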
+ */ +static void grow_physical_callback(struct vdo_completion *completion) +{ + struct admin_completion *admin_completion = + vdo_admin_completion_from_sub_task(completion); + struct vdo *vdo = admin_completion->vdo; + + vdo_assert_admin_operation_type(admin_completion, + VDO_ADMIN_OPERATION_GROW_PHYSICAL); + vdo_assert_admin_phase_thread(admin_completion, __func__, + GROW_PHYSICAL_PHASE_NAMES); + + switch (admin_completion->phase++) { + case GROW_PHYSICAL_PHASE_START: + if (vdo_is_read_only(vdo->read_only_notifier)) { + uds_log_error_strerror(VDO_READ_ONLY, + "Can't grow physical size of a read-only VDO"); + vdo_set_completion_result(vdo_reset_admin_sub_task(completion), + VDO_READ_ONLY); + break; + } + + if (vdo_start_operation_with_waiter(&vdo->admin_state, + VDO_ADMIN_STATE_SUSPENDED_OPERATION, + &admin_completion->completion, + NULL)) { + /* Copy the journal into the new layout. */ + vdo_copy_layout_partition(vdo->layout, + VDO_RECOVERY_JOURNAL_PARTITION, + vdo_reset_admin_sub_task(completion)); + } + return; + + case GROW_PHYSICAL_PHASE_COPY_SUMMARY: + vdo_copy_layout_partition(vdo->layout, + VDO_SLAB_SUMMARY_PARTITION, + vdo_reset_admin_sub_task(completion)); + return; + + case GROW_PHYSICAL_PHASE_UPDATE_COMPONENTS: + vdo->states.vdo.config.physical_blocks = + vdo_grow_layout(vdo->layout); + vdo_update_slab_depot_size(vdo->depot); + vdo_save_components(vdo, vdo_reset_admin_sub_task(completion)); + return; + + case GROW_PHYSICAL_PHASE_USE_NEW_SLABS: + vdo_use_new_slabs(vdo->depot, vdo_reset_admin_sub_task(completion)); + return; + + case GROW_PHYSICAL_PHASE_END: + vdo_set_slab_summary_origin(vdo_get_slab_summary(vdo->depot), + vdo_get_partition(vdo->layout, + VDO_SLAB_SUMMARY_PARTITION)); + vdo_set_recovery_journal_partition(vdo->recovery_journal, + vdo_get_partition(vdo->layout, + VDO_RECOVERY_JOURNAL_PARTITION)); + break; + + case GROW_PHYSICAL_PHASE_ERROR: + vdo_enter_read_only_mode(vdo->read_only_notifier, + completion->result); + break; + + default: + vdo_set_completion_result(vdo_reset_admin_sub_task(completion), + UDS_BAD_STATE); + } + + vdo_finish_layout_growth(vdo->layout); + vdo_finish_operation(&vdo->admin_state, completion->result); +} + +/** + * handle_growth_error() - Handle an error during the grow physical process. + * @completion: The sub-task completion. + */ +static void handle_growth_error(struct vdo_completion *completion) +{ + vdo_admin_completion_from_sub_task(completion)->phase = + GROW_PHYSICAL_PHASE_ERROR; + grow_physical_callback(completion); +} + +/** + * vdo_perform_grow_physical() - Grow the physical size of the vdo. + * @vdo: The vdo to resize. + * @new_physical_blocks: The new physical size in blocks. + * + * Context: This method may only be called when the vdo has been suspended and + * must not be called from a base thread. + * + * Return: VDO_SUCCESS or an error. + */ +int vdo_perform_grow_physical(struct vdo *vdo, + block_count_t new_physical_blocks) +{ + int result; + block_count_t new_depot_size, prepared_depot_size; + + block_count_t old_physical_blocks = + vdo->states.vdo.config.physical_blocks; + + /* Skip any noop grows. */ + if (old_physical_blocks == new_physical_blocks) { + return VDO_SUCCESS; + } + + if (new_physical_blocks != vdo_get_next_layout_size(vdo->layout)) { + /* + * Either the VDO isn't prepared to grow, or it was prepared to + * grow to a different size. Doing this check here relies on + * the fact that the call to this method is done under the + * dmsetup message lock. 
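+ * In that case the pending growth is abandoned and
+ * VDO_PARAMETER_MISMATCH is returned.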
+ */ + vdo_finish_layout_growth(vdo->layout); + vdo_abandon_new_slabs(vdo->depot); + return VDO_PARAMETER_MISMATCH; + } + + /* Validate that we are prepared to grow appropriately. */ + new_depot_size = + vdo_get_next_block_allocator_partition_size(vdo->layout); + prepared_depot_size = vdo_get_slab_depot_new_size(vdo->depot); + if (prepared_depot_size != new_depot_size) { + return VDO_PARAMETER_MISMATCH; + } + + result = vdo_perform_admin_operation(vdo, + VDO_ADMIN_OPERATION_GROW_PHYSICAL, + get_thread_id_for_phase, + grow_physical_callback, + handle_growth_error); + if (result != VDO_SUCCESS) { + return result; + } + + uds_log_info("Physical block count was %llu, now %llu", + (unsigned long long) old_physical_blocks, + (unsigned long long) new_physical_blocks); + return VDO_SUCCESS; +} + +/** + * check_may_grow_physical() - Callback to check that we're not in recovery + * mode, used in vdo_prepare_to_grow_physical(). + * @completion: The sub-task completion. + */ +static void check_may_grow_physical(struct vdo_completion *completion) +{ + struct admin_completion *admin_completion = + vdo_admin_completion_from_sub_task(completion); + struct vdo *vdo = admin_completion->vdo; + + vdo_assert_admin_operation_type(admin_completion, + VDO_ADMIN_OPERATION_PREPARE_GROW_PHYSICAL); + vdo_assert_on_admin_thread(vdo, __func__); + + vdo_reset_admin_sub_task(completion); + + /* This check can only be done from a base code thread. */ + if (vdo_is_read_only(vdo->read_only_notifier)) { + vdo_finish_completion(completion->parent, VDO_READ_ONLY); + return; + } + + /* This check should only be done from a base code thread. */ + if (vdo_in_recovery_mode(vdo)) { + vdo_finish_completion(completion->parent, VDO_RETRY_AFTER_REBUILD); + return; + } + + vdo_complete_completion(completion->parent); +} + +/** + * vdo_prepare_to_grow_physical() - Prepare to resize the vdo, allocating + * memory as needed. + * @vdo: The vdo. + * @new_physical_blocks: The new physical size in blocks. 
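+ *
+ * Checks that the vdo is neither read-only nor in recovery mode,
+ * then allocates the expanded layout and the new slabs in the depot
+ * ahead of the actual grow.
+ *
+ * Return: VDO_SUCCESS or an error.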
+ */ +int vdo_prepare_to_grow_physical(struct vdo *vdo, + block_count_t new_physical_blocks) +{ + int result; + block_count_t new_depot_size; + block_count_t current_physical_blocks = + vdo->states.vdo.config.physical_blocks; + + uds_log_info("Preparing to resize physical to %llu", + new_physical_blocks); + ASSERT_LOG_ONLY((new_physical_blocks > current_physical_blocks), + "New physical size is larger than current physical size"); + result = vdo_perform_admin_operation(vdo, + VDO_ADMIN_OPERATION_PREPARE_GROW_PHYSICAL, + get_thread_id_for_phase, + check_may_grow_physical, + vdo_finish_completion_parent_callback); + if (result != VDO_SUCCESS) { + return result; + } + + result = prepare_to_vdo_grow_layout(vdo->layout, + current_physical_blocks, + new_physical_blocks); + if (result != VDO_SUCCESS) { + return result; + } + + new_depot_size = + vdo_get_next_block_allocator_partition_size(vdo->layout); + result = vdo_prepare_to_grow_slab_depot(vdo->depot, new_depot_size); + if (result != VDO_SUCCESS) { + vdo_finish_layout_growth(vdo->layout); + return result; + } + + uds_log_info("Done preparing to resize physical"); + return VDO_SUCCESS; +} diff --git a/vdo/vdo-resize.h b/vdo/vdo-resize.h new file mode 100644 index 00000000..145e8411 --- /dev/null +++ b/vdo/vdo-resize.h @@ -0,0 +1,19 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright Red Hat + */ + +#ifndef VDO_RESIZE_H +#define VDO_RESIZE_H + +#include "kernel-types.h" +#include "types.h" + +int vdo_perform_grow_physical(struct vdo *vdo, + block_count_t new_physical_blocks); + +int __must_check +vdo_prepare_to_grow_physical(struct vdo *vdo, + block_count_t new_physical_blocks); + +#endif /* VDO_RESIZE_H */ diff --git a/vdo/vdo-resume.c b/vdo/vdo-resume.c new file mode 100644 index 00000000..030ee382 --- /dev/null +++ b/vdo/vdo-resume.c @@ -0,0 +1,298 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright Red Hat + */ + +#include "vdo-resume.h" + +#include + +#include "logger.h" + +#include "admin-completion.h" +#include "block-map.h" +#include "completion.h" +#include "data-vio-pool.h" +#include "dedupe-index.h" +#include "kernel-types.h" +#include "logical-zone.h" +#include "recovery-journal.h" +#include "slab-depot.h" +#include "slab-summary.h" +#include "thread-config.h" +#include "types.h" +#include "vdo.h" +#include "vdo-resize.h" +#include "vdo-resize-logical.h" + +enum { + RESUME_PHASE_START, + RESUME_PHASE_ALLOW_READ_ONLY_MODE, + RESUME_PHASE_INDEX, + RESUME_PHASE_DEPOT, + RESUME_PHASE_JOURNAL, + RESUME_PHASE_BLOCK_MAP, + RESUME_PHASE_LOGICAL_ZONES, + RESUME_PHASE_PACKER, + RESUME_PHASE_FLUSHER, + RESUME_PHASE_DATA_VIOS, + RESUME_PHASE_END, +}; + +static const char *RESUME_PHASE_NAMES[] = { + "RESUME_PHASE_START", + "RESUME_PHASE_ALLOW_READ_ONLY_MODE", + "RESUME_PHASE_INDEX", + "RESUME_PHASE_DEPOT", + "RESUME_PHASE_JOURNAL", + "RESUME_PHASE_BLOCK_MAP", + "RESUME_PHASE_LOGICAL_ZONES", + "RESUME_PHASE_PACKER", + "RESUME_PHASE_FLUSHER", + "RESUME_PHASE_DATA_VIOS", + "RESUME_PHASE_END", +}; + +/** + * get_thread_id_for_phase() - Implements vdo_thread_id_getter_for_phase. 
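+ *
+ * The journal phase runs on the journal thread, the packer and
+ * flusher phases on the packer thread, the data_vio phase on the cpu
+ * thread, and all other phases on the admin thread.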
+ */ +static thread_id_t __must_check +get_thread_id_for_phase(struct admin_completion *admin_completion) +{ + const struct thread_config *thread_config = + admin_completion->vdo->thread_config; + switch (admin_completion->phase) { + case RESUME_PHASE_JOURNAL: + return thread_config->journal_thread; + + case RESUME_PHASE_PACKER: + case RESUME_PHASE_FLUSHER: + return thread_config->packer_thread; + + case RESUME_PHASE_DATA_VIOS: + return thread_config->cpu_thread; + default: + return thread_config->admin_thread; + } +} + +/** + * write_super_block() - Update the VDO state and save the super block. + * @vdo: The vdo being resumed. + * @completion: The admin_completion's sub-task completion. + */ +static void write_super_block(struct vdo *vdo, + struct vdo_completion *completion) +{ + switch (vdo_get_state(vdo)) { + case VDO_CLEAN: + case VDO_NEW: + vdo_set_state(vdo, VDO_DIRTY); + vdo_save_components(vdo, completion); + return; + + case VDO_DIRTY: + case VDO_READ_ONLY_MODE: + case VDO_FORCE_REBUILD: + case VDO_RECOVERING: + case VDO_REBUILD_FOR_UPGRADE: + /* No need to write the super block in these cases */ + vdo_complete_completion(completion); + return; + + case VDO_REPLAYING: + default: + vdo_finish_completion(completion, UDS_BAD_STATE); + } +} + +/** + * resume_callback() - Callback to resume a VDO. + * @completion: The sub-task completion. + */ +static void resume_callback(struct vdo_completion *completion) +{ + struct admin_completion *admin_completion = + vdo_admin_completion_from_sub_task(completion); + struct vdo *vdo = admin_completion->vdo; + + vdo_assert_admin_operation_type(admin_completion, + VDO_ADMIN_OPERATION_RESUME); + vdo_assert_admin_phase_thread(admin_completion, __func__, + RESUME_PHASE_NAMES); + + switch (admin_completion->phase++) { + case RESUME_PHASE_START: + if (vdo_start_resuming(&vdo->admin_state, + VDO_ADMIN_STATE_RESUMING, + &admin_completion->completion, + NULL)) { + write_super_block(vdo, completion); + } + return; + + case RESUME_PHASE_ALLOW_READ_ONLY_MODE: + vdo_allow_read_only_mode_entry(vdo->read_only_notifier, + vdo_reset_admin_sub_task(completion)); + return; + + case RESUME_PHASE_INDEX: + if (!vdo_is_read_only(vdo->read_only_notifier)) { + vdo_resume_dedupe_index(vdo->dedupe_index, + vdo->device_config); + } + + vdo_complete_completion(vdo_reset_admin_sub_task(completion)); + return; + + case RESUME_PHASE_DEPOT: + vdo_resume_slab_depot(vdo->depot, vdo_reset_admin_sub_task(completion)); + return; + + case RESUME_PHASE_JOURNAL: + vdo_resume_recovery_journal(vdo->recovery_journal, + vdo_reset_admin_sub_task(completion)); + return; + + case RESUME_PHASE_BLOCK_MAP: + vdo_resume_block_map(vdo->block_map, + vdo_reset_admin_sub_task(completion)); + return; + + case RESUME_PHASE_LOGICAL_ZONES: + vdo_resume_logical_zones(vdo->logical_zones, + vdo_reset_admin_sub_task(completion)); + return; + + case RESUME_PHASE_PACKER: + { + bool was_enabled = vdo_get_compressing(vdo); + bool enable = vdo->device_config->compression; + + if (enable != was_enabled) { + WRITE_ONCE(vdo->compressing, enable); + } + uds_log_info("compression is %s", + (enable ? 
"enabled" : "disabled")); + + vdo_resume_packer(vdo->packer, + vdo_reset_admin_sub_task(completion)); + return; + } + + case RESUME_PHASE_FLUSHER: + vdo_resume_flusher(vdo->flusher, + vdo_reset_admin_sub_task(completion)); + return; + + case RESUME_PHASE_DATA_VIOS: + resume_data_vio_pool(vdo->data_vio_pool, + vdo_reset_admin_sub_task(completion)); + return; + + case RESUME_PHASE_END: + break; + + default: + vdo_set_completion_result(vdo_reset_admin_sub_task(completion), + UDS_BAD_STATE); + } + + vdo_finish_resuming_with_result(&vdo->admin_state, completion->result); +} + +/** + * apply_new_vdo_configuration() - Attempt to make any configuration changes + * from the table being resumed. + * @vdo: The vdo being resumed. + * @config: The new device configuration derived from the table with which + * the vdo is being resumed. + * + * Return: VDO_SUCCESS or an error. + */ +static int __must_check +apply_new_vdo_configuration(struct vdo *vdo, struct device_config *config) +{ + int result; + + result = vdo_perform_grow_logical(vdo, config->logical_blocks); + if (result != VDO_SUCCESS) { + uds_log_error("grow logical operation failed, result = %d", + result); + return result; + } + + result = vdo_perform_grow_physical(vdo, config->physical_blocks); + if (result != VDO_SUCCESS) { + uds_log_error("resize operation failed, result = %d", result); + } + + return result; +} + +/** + * vdo_preresume_internal() - Resume a suspended vdo (technically preresume + * because resume can't fail). + * @vdo: The vdo being resumed. + * @config: The device config derived from the table with which the vdo is + * being resumed. + * @device_name: The vdo device name (for logging). + * + * Return: VDO_SUCCESS or an error. + */ +int vdo_preresume_internal(struct vdo *vdo, + struct device_config *config, + const char *device_name) +{ + int result; + + /* + * If this fails, the VDO was not in a state to be resumed. This should + * never happen. + */ + result = apply_new_vdo_configuration(vdo, config); + BUG_ON(result == VDO_INVALID_ADMIN_STATE); + + /* + * Now that we've tried to modify the vdo, the new config *is* the + * config, whether the modifications worked or not. + */ + vdo->device_config = config; + + /* + * Any error here is highly unexpected and the state of the vdo is + * questionable, so we mark it read-only in memory. Because we are + * suspended, the read-only state will not be written to disk. + */ + if (result != VDO_SUCCESS) { + uds_log_error_strerror(result, + "Commit of modifications to device '%s' failed", + device_name); + vdo_enter_read_only_mode(vdo->read_only_notifier, result); + return result; + } + + if (vdo_get_admin_state(vdo)->normal) { + /* The VDO was just started, so we don't need to resume it. */ + return VDO_SUCCESS; + } + + result = vdo_perform_admin_operation(vdo, + VDO_ADMIN_OPERATION_RESUME, + get_thread_id_for_phase, + resume_callback, + vdo_preserve_completion_error_and_continue); + BUG_ON(result == VDO_INVALID_ADMIN_STATE); + if (result == VDO_READ_ONLY) { + /* Even if the vdo is read-only, it has still resumed. 
*/ + result = VDO_SUCCESS; + } + + if (result != VDO_SUCCESS) { + uds_log_error("resume of device '%s' failed with error: %d", + device_name, + result); + } + + return result; +} diff --git a/vdo/vdo-resume.h b/vdo/vdo-resume.h new file mode 100644 index 00000000..9df60381 --- /dev/null +++ b/vdo/vdo-resume.h @@ -0,0 +1,16 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright Red Hat + */ + +#ifndef VDO_RESUME_H +#define VDO_RESUME_H + +#include "kernel-types.h" +#include "types.h" + +int vdo_preresume_internal(struct vdo *vdo, + struct device_config *config, + const char *device_name); + +#endif /* VDO_RESUME_H */ diff --git a/vdo/vdo-suspend.c b/vdo/vdo-suspend.c new file mode 100644 index 00000000..a37f373e --- /dev/null +++ b/vdo/vdo-suspend.c @@ -0,0 +1,279 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright Red Hat + */ + +#include "vdo-suspend.h" + +#include "logger.h" +#include "permassert.h" + +#include "admin-completion.h" +#include "block-map.h" +#include "completion.h" +#include "data-vio-pool.h" +#include "dedupe-index.h" +#include "kernel-types.h" +#include "logical-zone.h" +#include "recovery-journal.h" +#include "slab-depot.h" +#include "slab-summary.h" +#include "thread-config.h" +#include "types.h" +#include "vdo.h" + +enum { + SUSPEND_PHASE_START, + SUSPEND_PHASE_PACKER, + SUSPEND_PHASE_DATA_VIOS, + SUSPEND_PHASE_FLUSHES, + SUSPEND_PHASE_LOGICAL_ZONES, + SUSPEND_PHASE_BLOCK_MAP, + SUSPEND_PHASE_JOURNAL, + SUSPEND_PHASE_DEPOT, + SUSPEND_PHASE_READ_ONLY_WAIT, + SUSPEND_PHASE_WRITE_SUPER_BLOCK, + SUSPEND_PHASE_END, +}; + +static const char *SUSPEND_PHASE_NAMES[] = { + "SUSPEND_PHASE_START", + "SUSPEND_PHASE_PACKER", + "SUSPEND_PHASE_DATA_VIOS", + "SUSPEND_PHASE_FLUSHES", + "SUSPEND_PHASE_LOGICAL_ZONES", + "SUSPEND_PHASE_BLOCK_MAP", + "SUSPEND_PHASE_JOURNAL", + "SUSPEND_PHASE_DEPOT", + "SUSPEND_PHASE_READ_ONLY_WAIT", + "SUSPEND_PHASE_WRITE_SUPER_BLOCK", + "SUSPEND_PHASE_END", +}; + +/** + * get_thread_id_for_phase() - Implements vdo_thread_id_getter_for_phase. + */ +static thread_id_t __must_check +get_thread_id_for_phase(struct admin_completion *admin_completion) +{ + const struct thread_config *thread_config = + admin_completion->vdo->thread_config; + switch (admin_completion->phase) { + case SUSPEND_PHASE_PACKER: + case SUSPEND_PHASE_FLUSHES: + return thread_config->packer_thread; + + case SUSPEND_PHASE_DATA_VIOS: + return thread_config->cpu_thread; + + case SUSPEND_PHASE_JOURNAL: + return thread_config->journal_thread; + + default: + return thread_config->admin_thread; + } +} + +/** + * write_super_block() - Update the VDO state and save the super block. + * @vdo: The vdo being suspended. + * @completion: The admin_completion's sub-task completion. + */ +static void write_super_block(struct vdo *vdo, + struct vdo_completion *completion) +{ + switch (vdo_get_state(vdo)) { + case VDO_DIRTY: + case VDO_NEW: + vdo_set_state(vdo, VDO_CLEAN); + break; + + case VDO_CLEAN: + case VDO_READ_ONLY_MODE: + case VDO_FORCE_REBUILD: + case VDO_RECOVERING: + case VDO_REBUILD_FOR_UPGRADE: + break; + + case VDO_REPLAYING: + default: + vdo_finish_completion(completion, UDS_BAD_STATE); + return; + } + + vdo_save_components(vdo, completion); +} + +/** + * suspend_callback() - Callback to initiate a suspend, registered in + * vdo_suspend(). + * @completion: The sub-task completion. 
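+ *
+ * The phases drain the components in order: the packer and data_vio
+ * pool first, then the flusher, logical zones, block map, recovery
+ * journal, and slab depot, before waiting out any read-only entry
+ * and writing the super block.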
+ */ +static void suspend_callback(struct vdo_completion *completion) +{ + struct admin_completion *admin_completion = + vdo_admin_completion_from_sub_task(completion); + struct vdo *vdo = admin_completion->vdo; + struct admin_state *admin_state = &vdo->admin_state; + int result; + + vdo_assert_admin_operation_type(admin_completion, + VDO_ADMIN_OPERATION_SUSPEND); + vdo_assert_admin_phase_thread(admin_completion, __func__, + SUSPEND_PHASE_NAMES); + + switch (admin_completion->phase++) { + case SUSPEND_PHASE_START: + if (vdo_start_draining(admin_state, + vdo->suspend_type, + &admin_completion->completion, + NULL)) { + vdo_complete_completion(vdo_reset_admin_sub_task(completion)); + } + return; + + case SUSPEND_PHASE_PACKER: + /* + * If the VDO was already resumed from a prior suspend while + * read-only, some of the components may not have been resumed. + * By setting a read-only error here, we guarantee that the + * result of this suspend will be VDO_READ_ONLY and not + * VDO_INVALID_ADMIN_STATE in that case. + */ + if (vdo_in_read_only_mode(vdo)) { + vdo_set_completion_result(&admin_completion->completion, + VDO_READ_ONLY); + } + + vdo_drain_packer(vdo->packer, + vdo_reset_admin_sub_task(completion)); + return; + + case SUSPEND_PHASE_DATA_VIOS: + drain_data_vio_pool(vdo->data_vio_pool, + vdo_reset_admin_sub_task(completion)); + return; + + case SUSPEND_PHASE_FLUSHES: + /* + * Now that we know there are no active data_vios, we can + * suspend the index. We can do this from any thread, so here + * is as good a place as any. + */ + vdo_suspend_dedupe_index(vdo->dedupe_index, + (vdo->suspend_type + == VDO_ADMIN_STATE_SAVING)); + vdo_drain_flusher(vdo->flusher, + vdo_reset_admin_sub_task(completion)); + return; + + case SUSPEND_PHASE_LOGICAL_ZONES: + /* + * Attempt to flush all I/O before completing post suspend + * work. We believe a suspended device is expected to have + * persisted all data written before the suspend, even if it + * hasn't been flushed yet. + */ + result = vdo_synchronous_flush(vdo); + if (result != VDO_SUCCESS) { + vdo_enter_read_only_mode(vdo->read_only_notifier, + result); + } + + vdo_drain_logical_zones(vdo->logical_zones, + vdo_get_admin_state_code(admin_state), + vdo_reset_admin_sub_task(completion)); + return; + + case SUSPEND_PHASE_BLOCK_MAP: + vdo_drain_block_map(vdo->block_map, + vdo_get_admin_state_code(admin_state), + vdo_reset_admin_sub_task(completion)); + return; + + case SUSPEND_PHASE_JOURNAL: + vdo_drain_recovery_journal(vdo->recovery_journal, + vdo_get_admin_state_code(admin_state), + vdo_reset_admin_sub_task(completion)); + return; + + case SUSPEND_PHASE_DEPOT: + vdo_drain_slab_depot(vdo->depot, + vdo_get_admin_state_code(admin_state), + vdo_reset_admin_sub_task(completion)); + return; + + case SUSPEND_PHASE_READ_ONLY_WAIT: + vdo_wait_until_not_entering_read_only_mode(vdo->read_only_notifier, + vdo_reset_admin_sub_task(completion)); + return; + + case SUSPEND_PHASE_WRITE_SUPER_BLOCK: + if (vdo_is_state_suspending(admin_state) || + (admin_completion->completion.result != VDO_SUCCESS)) { + /* + * If we didn't save the VDO or there was an error, + * we're done. + */ + break; + } + + write_super_block(vdo, vdo_reset_admin_sub_task(completion)); + return; + + case SUSPEND_PHASE_END: + break; + + default: + vdo_set_completion_result(completion, UDS_BAD_STATE); + } + + vdo_finish_draining_with_result(admin_state, completion->result); +} + +/** + * vdo_suspend() - Ensure that the vdo has no outstanding I/O and will issue + * none until it is resumed. 
+ * @vdo: The vdo to suspend. + * + * Return: VDO_SUCCESS or an error. + */ +int vdo_suspend(struct vdo *vdo) +{ + const char *device_name; + int result; + + device_name = vdo_get_device_name(vdo->device_config->owning_target); + uds_log_info("suspending device '%s'", device_name); + + /* + * It's important to note any error here does not actually stop + * device-mapper from suspending the device. All this work is done + * post suspend. + */ + result = vdo_perform_admin_operation(vdo, + VDO_ADMIN_OPERATION_SUSPEND, + get_thread_id_for_phase, + suspend_callback, + vdo_preserve_completion_error_and_continue); + + /* + * Treat VDO_READ_ONLY as a success since a read-only suspension still + * leaves the VDO suspended. + */ + if ((result == VDO_SUCCESS) || (result == VDO_READ_ONLY)) { + uds_log_info("device '%s' suspended", device_name); + return VDO_SUCCESS; + } + + if (result == VDO_INVALID_ADMIN_STATE) { + uds_log_error("Suspend invoked while in unexpected state: %s", + vdo_get_admin_state(vdo)->name); + result = -EINVAL; + } + + uds_log_error_strerror(result, + "Suspend of device '%s' failed", + device_name); + return result; +} diff --git a/vdo/vdo-suspend.h b/vdo/vdo-suspend.h new file mode 100644 index 00000000..50ceeae2 --- /dev/null +++ b/vdo/vdo-suspend.h @@ -0,0 +1,13 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright Red Hat + */ + +#ifndef VDO_SUSPEND_H +#define VDO_SUSPEND_H + +#include "kernel-types.h" + +int vdo_suspend(struct vdo *vdo); + +#endif /* VDO_SUSPEND_H */ diff --git a/vdo/vdo.c b/vdo/vdo.c index 9af9cca2..005810e6 100644 --- a/vdo/vdo.c +++ b/vdo/vdo.c @@ -1,22 +1,6 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * Copyright Red Hat - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/sulfur/src/c++/vdo/base/vdo.c#111 $ */ /* @@ -24,72 +8,382 @@ * well as functions for constructing and destroying vdo instances (in memory). 
*/ -#include "vdoInternal.h" +#include "vdo.h" #include +#include +#include +#include +#include #include "logger.h" -#include "memoryAlloc.h" +#include "memory-alloc.h" #include "permassert.h" -#include "blockMap.h" -#include "deviceRegistry.h" -#include "hashZone.h" +#include "bio.h" +#include "block-map.h" +#include "data-vio-pool.h" +#include "dedupe-index.h" +#include "device-registry.h" +#include "hash-zone.h" #include "header.h" -#include "instanceNumber.h" -#include "logicalZone.h" -#include "numUtils.h" +#include "instance-number.h" +#include "io-submitter.h" +#include "logical-zone.h" +#include "num-utils.h" #include "packer.h" -#include "physicalZone.h" -#include "poolSysfs.h" -#include "readOnlyNotifier.h" -#include "recoveryJournal.h" -#include "releaseVersions.h" -#include "slabDepot.h" -#include "slabSummary.h" +#include "physical-zone.h" +#include "pool-sysfs.h" +#include "read-only-notifier.h" +#include "recovery-journal.h" +#include "release-versions.h" +#include "slab-depot.h" +#include "slab-summary.h" #include "statistics.h" -#include "statusCodes.h" -#include "superBlock.h" -#include "superBlockCodec.h" -#include "syncCompletion.h" -#include "threadConfig.h" -#include "vdoComponentStates.h" -#include "vdoLayout.h" - -#include "bio.h" -#include "dedupeIndex.h" -#include "ioSubmitter.h" -#include "kernelVDO.h" -#include "vdoCommon.h" +#include "status-codes.h" +#include "super-block.h" +#include "super-block-codec.h" +#include "sync-completion.h" +#include "thread-config.h" +#include "vdo-component-states.h" +#include "vdo-layout.h" +#include "vdo-resize.h" +#include "vdo-resize-logical.h" #include "workQueue.h" -/**********************************************************************/ + +enum { PARANOID_THREAD_CONSISTENCY_CHECKS = 0 }; + +static void start_vdo_request_queue(void *ptr) +{ + struct vdo_thread *thread + = get_work_queue_owner(get_current_work_queue()); + + uds_register_allocating_thread(&thread->allocating_thread, + &thread->vdo->allocations_allowed); +} + +static void finish_vdo_request_queue(void *ptr) +{ + uds_unregister_allocating_thread(); +} + +#ifdef MODULE +#define MODULE_NAME THIS_MODULE->name +#else +#define MODULE_NAME "dm-vdo" +#endif /* MODULE */ + +static const struct vdo_work_queue_type default_queue_type = { + .start = start_vdo_request_queue, + .finish = finish_vdo_request_queue, + .max_priority = VDO_DEFAULT_Q_MAX_PRIORITY, + .default_priority = VDO_DEFAULT_Q_COMPLETION_PRIORITY, +}; + +static const struct vdo_work_queue_type bio_ack_q_type = { + .start = NULL, + .finish = NULL, + .max_priority = BIO_ACK_Q_MAX_PRIORITY, + .default_priority = BIO_ACK_Q_ACK_PRIORITY, +}; + +static const struct vdo_work_queue_type cpu_q_type = { + .start = NULL, + .finish = NULL, + .max_priority = CPU_Q_MAX_PRIORITY, + .default_priority = CPU_Q_MAX_PRIORITY, +}; + +/** + * vdo_make_thread() - Construct a single vdo work_queue and its associated + * thread (or threads for round-robin queues). + * @vdo: The vdo which owns the thread. + * @thread_id: The id of the thread to create (as determined by the + * thread_config). + * @type: The description of the work queue for this thread. + * @queue_count: The number of actual threads/queues contained in the "thread". + * @contexts: An array of queue_count contexts, one for each individual queue; + * may be NULL. + * + * Each "thread" constructed by this method is represented by a unique thread + * id in the thread config, and completions can be enqueued to the queue and + * run on the threads comprising this entity. 
+ * + * Return: VDO_SUCCESS or an error. + */ +int vdo_make_thread(struct vdo *vdo, + thread_id_t thread_id, + const struct vdo_work_queue_type *type, + unsigned int queue_count, + void *contexts[]) +{ + struct vdo_thread *thread = &vdo->threads[thread_id]; + char queue_name[MAX_VDO_WORK_QUEUE_NAME_LEN]; + + if (type == NULL) { + type = &default_queue_type; + } + + if (thread->queue != NULL) { + return ASSERT(vdo_work_queue_type_is(thread->queue, type), + "already constructed vdo thread %u is of the correct type", + thread_id); + } + + thread->vdo = vdo; + thread->thread_id = thread_id; + vdo_get_thread_name(vdo->thread_config, + thread_id, + queue_name, + sizeof(queue_name)); + return make_work_queue(vdo->thread_name_prefix, + queue_name, + thread, + type, + queue_count, + contexts, + &thread->queue); +} + +/** + * initialize_vdo() - Do the portion of initializing a vdo which will clean + * up after itself on error. + * @vdo: The vdo being initialized + * @config: The configuration of the vdo + * @instance: The instance number of the vdo + * @reason: The buffer to hold the failure reason on error + **/ +static int initialize_vdo(struct vdo *vdo, + struct device_config *config, + unsigned int instance, + char **reason) +{ + int result; + zone_count_t i; + + vdo->device_config = config; + vdo->starting_sector_offset = config->owning_target->begin; + vdo->instance = instance; + vdo->allocations_allowed = true; + vdo_set_admin_state_code(&vdo->admin_state, VDO_ADMIN_STATE_NEW); + INIT_LIST_HEAD(&vdo->device_config_list); + vdo_initialize_admin_completion(vdo, &vdo->admin_completion); + mutex_init(&vdo->stats_mutex); + result = vdo_read_geometry_block(vdo_get_backing_device(vdo), + &vdo->geometry); + if (result != VDO_SUCCESS) { + *reason = "Could not load geometry block"; + return result; + } + + result = vdo_make_thread_config(config->thread_counts, + &vdo->thread_config); + if (result != VDO_SUCCESS) { + *reason = "Cannot create thread configuration"; + return result; + } + + uds_log_info("zones: %d logical, %d physical, %d hash; total threads: %d", + config->thread_counts.logical_zones, + config->thread_counts.physical_zones, + config->thread_counts.hash_zones, + vdo->thread_config->thread_count); + + /* Compression context storage */ + result = UDS_ALLOCATE(config->thread_counts.cpu_threads, + char *, + "LZ4 context", + &vdo->compression_context); + if (result != VDO_SUCCESS) { + *reason = "cannot allocate LZ4 context"; + return result; + } + + for (i = 0; i < config->thread_counts.cpu_threads; i++) { + result = UDS_ALLOCATE(LZ4_MEM_COMPRESS, + char, + "LZ4 context", + &vdo->compression_context[i]); + if (result != VDO_SUCCESS) { + *reason = "cannot allocate LZ4 context"; + return result; + } + } + + result = vdo_register(vdo); + if (result != VDO_SUCCESS) { + *reason = "Cannot add VDO to device registry"; + return result; + } + + vdo_set_admin_state_code(&vdo->admin_state, + VDO_ADMIN_STATE_INITIALIZED); + return result; +} + +/** + * vdo_make() - Allocate and initialize a vdo. + * @instance: Device instantiation counter. + * @config: The device configuration. + * @reason: The reason for any failure during this call. + * @vdo_ptr: A pointer to hold the created vdo. + * + * Return: VDO_SUCCESS or an error. + */ +int vdo_make(unsigned int instance, + struct device_config *config, + char **reason, + struct vdo **vdo_ptr) +{ + int result; + struct vdo *vdo; + + /* VDO-3769 - Set a generic reason so we don't ever return garbage. 
*/ + *reason = "Unspecified error"; + + result = UDS_ALLOCATE(1, struct vdo, __func__, &vdo); + if (result != UDS_SUCCESS) { + *reason = "Cannot allocate VDO"; + vdo_release_instance(instance); + return result; + } + + result = initialize_vdo(vdo, config, instance, reason); + if (result != VDO_SUCCESS) { + vdo_destroy(vdo); + return result; + } + + /* From here on, the caller will clean up if there is an error. */ + *vdo_ptr = vdo; + + snprintf(vdo->thread_name_prefix, + sizeof(vdo->thread_name_prefix), + "%s%u", + MODULE_NAME, + instance); + BUG_ON(vdo->thread_name_prefix[0] == '\0'); + result = UDS_ALLOCATE(vdo->thread_config->thread_count, + struct vdo_thread, + __func__, + &vdo->threads); + if (result != VDO_SUCCESS) { + *reason = "Cannot allocate thread structures"; + return result; + } + + result = vdo_make_thread(vdo, + vdo->thread_config->admin_thread, + &default_queue_type, + 1, + NULL); + if (result != VDO_SUCCESS) { + *reason = "Cannot make admin thread"; + return result; + } + + result = vdo_make_flusher(vdo); + if (result != VDO_SUCCESS) { + *reason = "Cannot make flusher zones"; + return result; + } + + result = vdo_make_packer(vdo, DEFAULT_PACKER_BINS, &vdo->packer); + if (result != VDO_SUCCESS) { + *reason = "Cannot make packer zones"; + return result; + } + + result = vdo_make_dedupe_index(vdo, &vdo->dedupe_index); + if (result != UDS_SUCCESS) { + *reason = "Cannot initialize dedupe index"; + return result; + } + + BUG_ON(vdo->device_config->logical_block_size <= 0); + BUG_ON(vdo->device_config->owned_device == NULL); + result = make_data_vio_pool(vdo, + MAXIMUM_VDO_USER_VIOS, + MAXIMUM_VDO_USER_VIOS * 3 / 4, + &vdo->data_vio_pool); + if (result != VDO_SUCCESS) { + *reason = "Cannot allocate data_vio pool"; + return result; + } + + result = vdo_make_io_submitter(config->thread_counts.bio_threads, + config->thread_counts.bio_rotation_interval, + get_data_vio_pool_request_limit(vdo->data_vio_pool), + vdo, + &vdo->io_submitter); + if (result != VDO_SUCCESS) { + *reason = "bio submission initialization failed"; + return result; + } + + if (vdo_uses_bio_ack_queue(vdo)) { + result = vdo_make_thread(vdo, + vdo->thread_config->bio_ack_thread, + &bio_ack_q_type, + config->thread_counts.bio_ack_threads, + NULL); + if (result != VDO_SUCCESS) { + *reason = "bio ack queue initialization failed"; + return result; + } + } + + result = vdo_make_thread(vdo, + vdo->thread_config->cpu_thread, + &cpu_q_type, + config->thread_counts.cpu_threads, + (void **) vdo->compression_context); + if (result != VDO_SUCCESS) { + *reason = "CPU queue initialization failed"; + return result; + } + + return VDO_SUCCESS; +} + static void finish_vdo(struct vdo *vdo) { int i; - finish_work_queue(vdo->cpu_queue); - finish_work_queue(vdo->bio_ack_queue); - cleanup_vdo_io_submitter(vdo->io_submitter); - for (i = 0; i < vdo->initialized_thread_count; i++) { - finish_work_queue(vdo->threads[i].request_queue); + if (vdo->threads == NULL) { + return; } - free_buffer_pool(UDS_FORGET(vdo->data_vio_pool)); - finish_vdo_dedupe_index(vdo->dedupe_index); - free_batch_processor(UDS_FORGET(vdo->data_vio_releaser)); + vdo_cleanup_io_submitter(vdo->io_submitter); + vdo_finish_dedupe_index(vdo->dedupe_index); + + for (i = 0; i < vdo->thread_config->thread_count; i++) { + finish_work_queue(vdo->threads[i].queue); + } } -/**********************************************************************/ -void destroy_vdo(struct vdo *vdo) +/** + * vdo_destroy() - Destroy a vdo instance. + * @vdo: The vdo to destroy (may be NULL). 
+ */ +void vdo_destroy(struct vdo *vdo) { int i; - const struct thread_config *thread_config = vdo->thread_config; + + if (vdo == NULL) { + return; + } + + /* A running VDO should never be destroyed without suspending first. */ + BUG_ON(vdo_get_admin_state(vdo)->normal); vdo->allocations_allowed = true; - // Stop services that need to gather VDO statistics from the worker - // threads. + /* + * Stop services that need to gather VDO statistics from the worker + * threads. + */ if (vdo->sysfs_added) { init_completion(&vdo->stats_shutdown); kobject_put(&vdo->stats_directory); @@ -97,47 +391,30 @@ void destroy_vdo(struct vdo *vdo) } finish_vdo(vdo); - unregister_vdo(vdo); - free_work_queue(UDS_FORGET(vdo->cpu_queue)); - free_work_queue(UDS_FORGET(vdo->bio_ack_queue)); - free_vdo_io_submitter(UDS_FORGET(vdo->io_submitter)); - free_vdo_dedupe_index(UDS_FORGET(vdo->dedupe_index)); - free_vdo_flusher(UDS_FORGET(vdo->flusher)); - free_vdo_packer(UDS_FORGET(vdo->packer)); - free_vdo_recovery_journal(UDS_FORGET(vdo->recovery_journal)); - free_vdo_slab_depot(UDS_FORGET(vdo->depot)); - free_vdo_layout(UDS_FORGET(vdo->layout)); - free_vdo_super_block(UDS_FORGET(vdo->super_block)); - free_vdo_block_map(UDS_FORGET(vdo->block_map)); - - if (vdo->hash_zones != NULL) { - zone_count_t zone; - for (zone = 0; zone < thread_config->hash_zone_count; zone++) { - free_vdo_hash_zone(UDS_FORGET(vdo->hash_zones[zone])); - } - } - UDS_FREE(vdo->hash_zones); - vdo->hash_zones = NULL; - - free_vdo_logical_zones(UDS_FORGET(vdo->logical_zones)); - - if (vdo->physical_zones != NULL) { - zone_count_t zone; - for (zone = 0; zone < thread_config->physical_zone_count; zone++) { - free_vdo_physical_zone(UDS_FORGET(vdo->physical_zones[zone])); + vdo_unregister(vdo); + free_data_vio_pool(vdo->data_vio_pool); + vdo_free_io_submitter(UDS_FORGET(vdo->io_submitter)); + vdo_free_dedupe_index(UDS_FORGET(vdo->dedupe_index)); + vdo_free_flusher(UDS_FORGET(vdo->flusher)); + vdo_free_packer(UDS_FORGET(vdo->packer)); + vdo_free_recovery_journal(UDS_FORGET(vdo->recovery_journal)); + vdo_free_slab_depot(UDS_FORGET(vdo->depot)); + vdo_free_layout(UDS_FORGET(vdo->layout)); + vdo_free_super_block(UDS_FORGET(vdo->super_block)); + vdo_free_block_map(UDS_FORGET(vdo->block_map)); + vdo_free_hash_zones(UDS_FORGET(vdo->hash_zones)); + vdo_free_physical_zones(UDS_FORGET(vdo->physical_zones)); + vdo_free_logical_zones(UDS_FORGET(vdo->logical_zones)); + vdo_free_read_only_notifier(UDS_FORGET(vdo->read_only_notifier)); + + if (vdo->threads != NULL) { + for (i = 0; i < vdo->thread_config->thread_count; i++) { + free_work_queue(UDS_FORGET(vdo->threads[i].queue)); } + UDS_FREE(UDS_FORGET(vdo->threads)); } - UDS_FREE(vdo->physical_zones); - vdo->physical_zones = NULL; - free_vdo_read_only_notifier(UDS_FORGET(vdo->read_only_notifier)); - free_vdo_thread_config(UDS_FORGET(vdo->thread_config)); - - for (i = 0; i < vdo->initialized_thread_count; i++) { - free_work_queue(UDS_FORGET(vdo->threads[i].request_queue)); - } - UDS_FREE(vdo->threads); - vdo->threads = NULL; + vdo_free_thread_config(UDS_FORGET(vdo->thread_config)); if (vdo->compression_context != NULL) { for (i = 0; @@ -149,7 +426,7 @@ void destroy_vdo(struct vdo *vdo) UDS_FREE(UDS_FORGET(vdo->compression_context)); } - release_vdo_instance(vdo->instance); + vdo_release_instance(vdo->instance); /* * The call to kobject_put on the kobj sysfs node will decrement its @@ -164,25 +441,33 @@ void destroy_vdo(struct vdo *vdo) } /** - * Signal that sysfs stats have been shut down. 
- * - * @param directory The vdo stats directory - **/ + * pool_stats_release() - Signal that sysfs stats have been shut down. + * @directory: The vdo stats directory. + */ static void pool_stats_release(struct kobject *directory) { struct vdo *vdo = container_of(directory, struct vdo, stats_directory); + complete(&vdo->stats_shutdown); } -/**********************************************************************/ -int add_vdo_sysfs_stats_dir(struct vdo *vdo) +ATTRIBUTE_GROUPS(vdo_pool_stats); +static struct kobj_type stats_directory_type = { + .release = pool_stats_release, + .sysfs_ops = &vdo_pool_stats_sysfs_ops, + .default_groups = vdo_pool_stats_groups, +}; + +/** + * vdo_add_sysfs_stats_dir() - Add the stats directory to the vdo sysfs + * directory. + * @vdo: The vdo. + * + * Return: VDO_SUCCESS or an error. + */ +int vdo_add_sysfs_stats_dir(struct vdo *vdo) { int result; - static struct kobj_type stats_directory_type = { - .release = pool_stats_release, - .sysfs_ops = &vdo_pool_stats_sysfs_ops, - .default_attrs = vdo_pool_stats_attrs, - }; kobject_init(&vdo->stats_directory, &stats_directory_type); result = kobject_add(&vdo->stats_directory, @@ -195,21 +480,124 @@ int add_vdo_sysfs_stats_dir(struct vdo *vdo) return VDO_SUCCESS; } -/**********************************************************************/ -struct block_device *get_vdo_backing_device(const struct vdo *vdo) +/** + * vdo_prepare_to_modify() - Prepare to modify a vdo. + * @vdo: The vdo being resumed. + * @config: The new device configuration. + * @may_grow: Set to true if growing the logical and physical size of + * the vdo is currently permitted. + * @error_ptr: A pointer to store the reason for any failure. + * + * This method is called during preresume to prepare for modifications which + * could result if the table has changed. + * + * Return: VDO_SUCCESS or an error. + */ +int vdo_prepare_to_modify(struct vdo *vdo, + struct device_config *config, + bool may_grow, + char **error_ptr) +{ + int result = vdo_validate_new_device_config(config, + vdo->device_config, + may_grow, + error_ptr); + if (result != VDO_SUCCESS) { + return -EINVAL; + } + + if (config->logical_blocks > vdo->device_config->logical_blocks) { + result = vdo_prepare_to_grow_logical(vdo, + config->logical_blocks); + if (result != VDO_SUCCESS) { + *error_ptr = "Device vdo_prepare_to_grow_logical failed"; + return result; + } + } + + if (config->physical_blocks > vdo->device_config->physical_blocks) { + result = vdo_prepare_to_grow_physical(vdo, + config->physical_blocks); + if (result != VDO_SUCCESS) { + if (result == VDO_PARAMETER_MISMATCH) { + /* + * If we don't trap this case, + * vdo_map_to_system_error() will remap it to + * -EIO, which is misleading and ahistorical. + */ + result = -EINVAL; + } + + if (result == VDO_TOO_MANY_SLABS) { + *error_ptr = "Device vdo_prepare_to_grow_physical failed (specified physical size too big based on formatted slab size)"; + } else { + *error_ptr = "Device vdo_prepare_to_grow_physical failed"; + } + return result; + } + } + + if (strcmp(config->parent_device_name, + vdo->device_config->parent_device_name) != 0) { + const char *device_name + = vdo_get_device_name(config->owning_target); + uds_log_info("Updating backing device of %s from %s to %s", + device_name, + vdo->device_config->parent_device_name, + config->parent_device_name); + } + + return VDO_SUCCESS; +} + +/** + * vdo_get_backing_device() - Get the block device object underlying a vdo. + * @vdo: The vdo. + * + * Return: The vdo's current block device. 
+ */ +struct block_device *vdo_get_backing_device(const struct vdo *vdo) { return vdo->device_config->owned_device->bdev; } -/**********************************************************************/ +/** + * vdo_get_device_name() - Get the device name associated with the vdo target. + * @target: The target device interface. + * + * Return: The block device name. + */ +const char *vdo_get_device_name(const struct dm_target *target) +{ + return dm_device_name(dm_table_get_md(target->table)); +} + +/** + * vdo_synchronous_flush() - Issue a flush request and wait for it to + * complete. + * @vdo: The vdo. + * + * Return: VDO_SUCCESS or an error. + */ int vdo_synchronous_flush(struct vdo *vdo) { int result; struct bio bio; +#ifdef RHEL_RELEASE_CODE +#define USE_ALTERNATE (RHEL_RELEASE_CODE < RHEL_RELEASE_VERSION(9,1)) +#else +#define USE_ALTERNATE (LINUX_VERSION_CODE < KERNEL_VERSION(5,18,0)) +#endif + +#if USE_ALTERNATE bio_init(&bio, 0, 0); - bio_set_dev(&bio, get_vdo_backing_device(vdo)); + bio_set_dev(&bio, vdo_get_backing_device(vdo)); bio.bi_opf = REQ_OP_WRITE | REQ_PREFLUSH; +#else + bio_init(&bio, vdo_get_backing_device(vdo), 0, 0, + REQ_OP_WRITE | REQ_PREFLUSH); +#endif submit_bio_wait(&bio); result = blk_status_to_errno(bio.bi_status); @@ -223,222 +611,239 @@ int vdo_synchronous_flush(struct vdo *vdo) return result; } -/**********************************************************************/ -enum vdo_state get_vdo_state(const struct vdo *vdo) +/** + * vdo_get_state() - Get the current state of the vdo. + * @vdo: The vdo. + + * Context: This method may be called from any thread. + * + * Return: The current state of the vdo. + */ +enum vdo_state vdo_get_state(const struct vdo *vdo) { enum vdo_state state = atomic_read(&vdo->state); + smp_rmb(); return state; } -/**********************************************************************/ -void set_vdo_state(struct vdo *vdo, enum vdo_state state) +/** + * vdo_set_state() - Set the current state of the vdo. + * @vdo: The vdo whose state is to be set. + * @state: The new state of the vdo. + * + * Context: This method may be called from any thread. + */ +void vdo_set_state(struct vdo *vdo, enum vdo_state state) { smp_wmb(); atomic_set(&vdo->state, state); } -/**********************************************************************/ -const struct admin_state_code *get_vdo_admin_state(const struct vdo *vdo) +/** + * vdo_get_admin_state() - Get the admin state of the vdo. + * @vdo: The vdo. + * + * Return: The code for the vdo's current admin state. + */ +const struct admin_state_code *vdo_get_admin_state(const struct vdo *vdo) { - return get_vdo_admin_state_code(&vdo->admin_state); + return vdo_get_admin_state_code(&vdo->admin_state); } /** - * Record the state of the VDO for encoding in the super block. - **/ + * record_vdo() - Record the state of the VDO for encoding in the super block. 
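/*
 * Illustrative aside, not part of this patch: the smp_wmb() in
 * vdo_set_state() pairs with the smp_rmb() in vdo_get_state() above. A
 * thread which stores other fields and then publishes a new state is
 * guaranteed that any thread observing that state afterwards also sees
 * those earlier stores, which is what allows either function to be
 * called from any thread without additional locking.
 */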
+ */ static void record_vdo(struct vdo *vdo) { vdo->states.release_version = vdo->geometry.release_version; - vdo->states.vdo.state = get_vdo_state(vdo); - vdo->states.block_map = record_vdo_block_map(vdo->block_map); + vdo->states.vdo.state = vdo_get_state(vdo); + vdo->states.block_map = vdo_record_block_map(vdo->block_map); vdo->states.recovery_journal = - record_vdo_recovery_journal(vdo->recovery_journal); - vdo->states.slab_depot = record_vdo_slab_depot(vdo->depot); - vdo->states.layout = get_vdo_fixed_layout(vdo->layout); + vdo_record_recovery_journal(vdo->recovery_journal); + vdo->states.slab_depot = vdo_record_slab_depot(vdo->depot); + vdo->states.layout = vdo_get_fixed_layout(vdo->layout); } -/**********************************************************************/ -void save_vdo_components(struct vdo *vdo, struct vdo_completion *parent) +/** + * vdo_save_components() - Encode the vdo and save the super block + * asynchronously. + * @vdo: The vdo whose state is being saved. + * @parent: The completion to notify when the save is complete. + * + * All non-user mode super block savers should use this bottle neck instead of + * calling vdo_save_super_block() directly. + */ +void vdo_save_components(struct vdo *vdo, struct vdo_completion *parent) { int result; struct buffer *buffer - = get_vdo_super_block_codec(vdo->super_block)->component_buffer; + = vdo_get_super_block_codec(vdo->super_block)->component_buffer; record_vdo(vdo); - result = encode_vdo_component_states(buffer, &vdo->states); + result = vdo_encode_component_states(buffer, &vdo->states); if (result != VDO_SUCCESS) { - finish_vdo_completion(parent, result); + vdo_finish_completion(parent, result); return; } - save_vdo_super_block(vdo->super_block, get_vdo_first_block_offset(vdo), + vdo_save_super_block(vdo->super_block, + vdo_get_data_region_start(vdo->geometry), parent); } /** - * Notify a vdo that it is going read-only. This will save the read-only state - * to the super block. + * notify_vdo_of_read_only_mode() - Notify a vdo that it is going read-only. + * @listener: The vdo. + * @parent: The completion to notify in order to acknowledge the notification. * - *
    Implements vdo_read_only_notification. + * This will save the read-only state to the super block. * - * @param listener The vdo - * @param parent The completion to notify in order to acknowledge the - * notification - **/ + * Implements vdo_read_only_notification. + */ static void notify_vdo_of_read_only_mode(void *listener, struct vdo_completion *parent) { struct vdo *vdo = listener; - if (in_read_only_mode(vdo)) { - complete_vdo_completion(parent); + + if (vdo_in_read_only_mode(vdo)) { + vdo_complete_completion(parent); } - set_vdo_state(vdo, VDO_READ_ONLY_MODE); - save_vdo_components(vdo, parent); + vdo_set_state(vdo, VDO_READ_ONLY_MODE); + vdo_save_components(vdo, parent); } -/**********************************************************************/ -int enable_read_only_entry(struct vdo *vdo) +/** + * vdo_enable_read_only_entry() - Enable a vdo to enter read-only mode on + * errors. + * @vdo: The vdo to enable. + * + * Return: VDO_SUCCESS or an error. + */ +int vdo_enable_read_only_entry(struct vdo *vdo) { - return register_vdo_read_only_listener(vdo->read_only_notifier, + return vdo_register_read_only_listener(vdo->read_only_notifier, vdo, notify_vdo_of_read_only_mode, vdo->thread_config->admin_thread); } -/**********************************************************************/ -bool in_read_only_mode(const struct vdo *vdo) -{ - return (get_vdo_state(vdo) == VDO_READ_ONLY_MODE); -} - -/**********************************************************************/ -bool vdo_was_new(const struct vdo *vdo) -{ - return (vdo->load_state == VDO_NEW); -} - -/**********************************************************************/ -bool requires_read_only_rebuild(const struct vdo *vdo) -{ - return ((vdo->load_state == VDO_FORCE_REBUILD) || - (vdo->load_state == VDO_REBUILD_FOR_UPGRADE)); -} - -/**********************************************************************/ -bool requires_rebuild(const struct vdo *vdo) -{ - switch (get_vdo_state(vdo)) { - case VDO_DIRTY: - case VDO_FORCE_REBUILD: - case VDO_REPLAYING: - case VDO_REBUILD_FOR_UPGRADE: - return true; - - default: - return false; - } -} - -/**********************************************************************/ -bool requires_recovery(const struct vdo *vdo) -{ - return ((vdo->load_state == VDO_DIRTY) || - (vdo->load_state == VDO_REPLAYING) || - (vdo->load_state == VDO_RECOVERING)); -} - -/**********************************************************************/ -bool is_replaying(const struct vdo *vdo) +/** + * vdo_in_read_only_mode() - Check whether a vdo is in read-only mode. + * @vdo: The vdo to query. + * + * Return: true if the vdo is in read-only mode. + */ +bool vdo_in_read_only_mode(const struct vdo *vdo) { - return (get_vdo_state(vdo) == VDO_REPLAYING); + return (vdo_get_state(vdo) == VDO_READ_ONLY_MODE); } -/**********************************************************************/ -bool in_recovery_mode(const struct vdo *vdo) +/** + * vdo_in_recovery_mode() - Check whether the vdo is in recovery mode. + * @vdo: The vdo to query. + * + * Return: true if the vdo is in recovery mode. + */ +bool vdo_in_recovery_mode(const struct vdo *vdo) { - return (get_vdo_state(vdo) == VDO_RECOVERING); + return (vdo_get_state(vdo) == VDO_RECOVERING); } -/**********************************************************************/ -void enter_recovery_mode(struct vdo *vdo) +/** + * vdo_enter_recovery_mode() - Put the vdo into recovery mode. + * @vdo: The vdo. 
+ */ +void vdo_enter_recovery_mode(struct vdo *vdo) { - assert_on_admin_thread(vdo, __func__); + vdo_assert_on_admin_thread(vdo, __func__); - if (in_read_only_mode(vdo)) { + if (vdo_in_read_only_mode(vdo)) { return; } uds_log_info("Entering recovery mode"); - set_vdo_state(vdo, VDO_RECOVERING); + vdo_set_state(vdo, VDO_RECOVERING); } /** - * Callback to turn compression on or off. - * - * @param completion The completion - **/ + * set_compression_callback() - Callback to turn compression on or off. + * @completion: The completion. + */ static void set_compression_callback(struct vdo_completion *completion) { struct vdo *vdo = completion->vdo; bool *enable = completion->parent; - bool was_enabled = get_vdo_compressing(vdo); + bool was_enabled = vdo_get_compressing(vdo); if (*enable != was_enabled) { WRITE_ONCE(vdo->compressing, *enable); if (was_enabled) { - // Signal the packer to flush since compression has - // been disabled. - flush_vdo_packer(vdo->packer); + /* + * Signal the packer to flush since compression has + * been disabled. + */ + vdo_flush_packer(vdo->packer); } } uds_log_info("compression is %s", (*enable ? "enabled" : "disabled")); *enable = was_enabled; - complete_vdo_completion(completion); + vdo_complete_completion(completion); } -/**********************************************************************/ -bool set_vdo_compressing(struct vdo *vdo, bool enable) +/** + * vdo_set_compressing() - Turn compression on or off. + * @vdo: The vdo. + * @enable: Whether to enable or disable compression. + * + * Return: Whether compression was previously on or off. + */ +bool vdo_set_compressing(struct vdo *vdo, bool enable) { - perform_synchronous_vdo_action(vdo, + vdo_perform_synchronous_action(vdo, set_compression_callback, vdo->thread_config->packer_thread, &enable); return enable; } -/**********************************************************************/ -bool get_vdo_compressing(struct vdo *vdo) +/** + * vdo_get_compressing() - Get whether compression is enabled in a vdo. + * @vdo: The vdo. + * + * Return: State of compression. + */ +bool vdo_get_compressing(struct vdo *vdo) { return READ_ONCE(vdo->compressing); } -/**********************************************************************/ static size_t get_block_map_cache_size(const struct vdo *vdo) { return ((size_t) vdo->device_config->cache_size) * VDO_BLOCK_SIZE; } /** - * Tally the hash lock statistics from all the hash zones. + * get_hash_lock_statistics() - Tally the hash lock statistics from all the + * hash zones. + * @hash_zones: The hash zones to query * - * @param vdo The vdo to query - * - * @return The sum of the hash lock statistics from all hash zones - **/ + * Return: The sum of the hash lock statistics from all hash zones. 
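/*
 * Illustrative aside, not part of this patch (the handler name is
 * hypothetical): vdo_set_compressing() above runs its callback
 * synchronously on the packer thread and returns the previous setting,
 * so a caller can distinguish a real change from a no-op.
 */
static void example_handle_compression_message(struct vdo *vdo, bool enable)
{
	bool was_enabled = vdo_set_compressing(vdo, enable);

	if (was_enabled == enable)
		uds_log_info("compression was already %s",
			     enable ? "enabled" : "disabled");
}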
+ */ static struct hash_lock_statistics -get_hash_lock_statistics(const struct vdo *vdo) +get_hash_lock_statistics(const struct hash_zones *zones) { - zone_count_t zone_count = vdo->thread_config->hash_zone_count; zone_count_t zone; struct hash_lock_statistics totals; + memset(&totals, 0, sizeof(totals)); - for (zone = 0; zone < zone_count; zone++) { + for (zone = 0; zone < zones->zone_count; zone++) { struct hash_lock_statistics stats = - get_vdo_hash_zone_statistics(vdo->hash_zones[zone]); + vdo_get_hash_zone_statistics(&zones->zones[zone]); + totals.dedupe_advice_valid += stats.dedupe_advice_valid; totals.dedupe_advice_stale += stats.dedupe_advice_stale; totals.concurrent_data_matches += @@ -450,7 +855,6 @@ get_hash_lock_statistics(const struct vdo *vdo) return totals; } -/**********************************************************************/ static struct error_statistics __must_check get_vdo_error_statistics(const struct vdo *vdo) { @@ -461,6 +865,7 @@ get_vdo_error_statistics(const struct vdo *vdo) * sufficient. */ const struct atomic_statistics *atoms = &vdo->stats; + return (struct error_statistics) { .invalid_advice_pbn_count = atomic64_read(&atoms->invalid_advice_pbn_count), @@ -471,7 +876,6 @@ get_vdo_error_statistics(const struct vdo *vdo) }; } -/**********************************************************************/ static void copy_bio_stat(struct bio_stats *b, const struct atomic_bio_stats *a) { @@ -483,7 +887,6 @@ static void copy_bio_stat(struct bio_stats *b, b->fua = atomic64_read(&a->fua); } -/**********************************************************************/ static struct bio_stats subtract_bio_stats(struct bio_stats minuend, struct bio_stats subtrahend) { @@ -497,67 +900,120 @@ static struct bio_stats subtract_bio_stats(struct bio_stats minuend, }; } +/** + * vdo_get_physical_blocks_allocated() - Get the number of physical blocks in + * use by user data. + * @vdo: The vdo. + * + * Return: The number of blocks allocated for user data. + */ +static block_count_t __must_check +vdo_get_physical_blocks_allocated(const struct vdo *vdo) +{ + return (vdo_get_slab_depot_allocated_blocks(vdo->depot) - + vdo_get_journal_block_map_data_blocks_used(vdo->recovery_journal)); +} /** - * Populate a vdo_statistics structure on the admin thread. + * vdo_get_physical_blocks_overhead() - Get the number of physical blocks used + * by vdo metadata. + * @vdo: The vdo. * - * @param vdo The vdo - * @param stats The statistics structure to populate - **/ + * Return: The number of overhead blocks. + */ +static block_count_t __must_check +vdo_get_physical_blocks_overhead(const struct vdo *vdo) +{ + /* + * XXX config.physical_blocks is actually mutated during resize and is in + * a packed structure, but resize runs on admin thread so we're usually + * OK. + */ + return (vdo->states.vdo.config.physical_blocks - + vdo_get_slab_depot_data_blocks(vdo->depot) + + vdo_get_journal_block_map_data_blocks_used(vdo->recovery_journal)); +} + +static const char *vdo_describe_state(enum vdo_state state) +{ + /* These strings should all fit in the 15 chars of VDOStatistics.mode. */ + switch (state) { + case VDO_RECOVERING: + return "recovering"; + + case VDO_READ_ONLY_MODE: + return "read-only"; + + default: + return "normal"; + } +} + +/** + * get_vdo_statistics() - Populate a vdo_statistics structure on the admin + * thread. + * @vdo: The vdo. + * @stats: The statistics structure to populate. 
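/*
 * Illustrative aside, not part of this patch, with made-up numbers: if
 * the slab depot reports 1000 allocated blocks and the recovery journal
 * reports that 40 of them hold block map pages, data_blocks_used is
 * 1000 - 40 = 960. With 100000 physical blocks of which 98000 are slab
 * data blocks, overhead_blocks_used is 100000 - 98000 + 40 = 2040.
 */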
+ */ static void get_vdo_statistics(const struct vdo *vdo, struct vdo_statistics *stats) { struct recovery_journal *journal = vdo->recovery_journal; - enum vdo_state state = get_vdo_state(vdo); + enum vdo_state state = vdo_get_state(vdo); - assert_on_admin_thread(vdo, __func__); + vdo_assert_on_admin_thread(vdo, __func__); - // start with a clean slate + /* start with a clean slate */ memset(stats, 0, sizeof(struct vdo_statistics)); - // These are immutable properties of the vdo object, so it is safe to - // query them from any thread. + /* + * These are immutable properties of the vdo object, so it is safe to + * query them from any thread. + */ stats->version = STATISTICS_VERSION; stats->release_version = VDO_CURRENT_RELEASE_VERSION_NUMBER; stats->logical_blocks = vdo->states.vdo.config.logical_blocks; - // XXX config.physical_blocks is actually mutated during resize and is - // in a packed structure, but resize runs on the admin thread so we're - // usually OK. + /* + * XXX config.physical_blocks is actually mutated during resize and is + * in a packed structure, but resize runs on the admin thread so we're + * usually OK. + */ stats->physical_blocks = vdo->states.vdo.config.physical_blocks; stats->block_size = VDO_BLOCK_SIZE; stats->complete_recoveries = vdo->states.vdo.complete_recoveries; stats->read_only_recoveries = vdo->states.vdo.read_only_recoveries; stats->block_map_cache_size = get_block_map_cache_size(vdo); - // The callees are responsible for thread-safety. - stats->data_blocks_used = get_vdo_physical_blocks_allocated(vdo); - stats->overhead_blocks_used = get_vdo_physical_blocks_overhead(vdo); + /* The callees are responsible for thread-safety. */ + stats->data_blocks_used = vdo_get_physical_blocks_allocated(vdo); + stats->overhead_blocks_used = vdo_get_physical_blocks_overhead(vdo); stats->logical_blocks_used = - get_vdo_recovery_journal_logical_blocks_used(journal); - get_vdo_slab_depot_statistics(vdo->depot, stats); - stats->journal = get_vdo_recovery_journal_statistics(journal); - stats->packer = get_vdo_packer_statistics(vdo->packer); - stats->block_map = get_vdo_block_map_statistics(vdo->block_map); - stats->hash_lock = get_hash_lock_statistics(vdo); + vdo_get_recovery_journal_logical_blocks_used(journal); + vdo_get_slab_depot_statistics(vdo->depot, stats); + stats->journal = vdo_get_recovery_journal_statistics(journal); + stats->packer = vdo_get_packer_statistics(vdo->packer); + stats->block_map = vdo_get_block_map_statistics(vdo->block_map); + stats->hash_lock = get_hash_lock_statistics(vdo->hash_zones); stats->errors = get_vdo_error_statistics(vdo); stats->in_recovery_mode = (state == VDO_RECOVERING); snprintf(stats->mode, sizeof(stats->mode), "%s", - describe_vdo_state(state)); - stats->version = STATISTICS_VERSION; - stats->release_version = VDO_CURRENT_RELEASE_VERSION_NUMBER; - stats->instance = vdo->instance; + vdo_describe_state(state)); + stats->instance = vdo->instance; stats->current_vios_in_progress = - READ_ONCE(vdo->request_limiter.active); - stats->max_vios = READ_ONCE(vdo->request_limiter.maximum); + get_data_vio_pool_active_requests(vdo->data_vio_pool); + stats->max_vios = + get_data_vio_pool_maximum_requests(vdo->data_vio_pool); - // get_vdo_dedupe_index_timeout_count() gives the number of timeouts, - // and dedupe_context_busy gives the number of queries not made because - // of earlier timeouts. 
+ /* + * vdo_get_dedupe_index_timeout_count() gives the number of timeouts, + * and dedupe_context_busy gives the number of queries not made because + * of earlier timeouts. + */ stats->dedupe_advice_timeouts = - (get_vdo_dedupe_index_timeout_count(vdo->dedupe_index) + + (vdo_get_dedupe_index_timeout_count(vdo->dedupe_index) + atomic64_read(&vdo->stats.dedupe_context_busy)); stats->flush_out = atomic64_read(&vdo->stats.flush_out); stats->logical_block_size = @@ -583,123 +1039,100 @@ static void get_vdo_statistics(const struct vdo *vdo, subtract_bio_stats(stats->bios_in, stats->bios_acknowledged); get_uds_memory_stats(&stats->memory_usage.bytes_used, &stats->memory_usage.peak_bytes_used); - get_vdo_dedupe_index_statistics(vdo->dedupe_index, &stats->index); + vdo_get_dedupe_index_statistics(vdo->dedupe_index, &stats->index); } /** - * Action to populate a vdo_statistics structure on the admin thread; - * registered in fetch_vdo_statistics(). + * vdo_fetch_statistics_callback() - Action to populate a vdo_statistics + * structure on the admin thread. + * @completion: The completion. * - * @param completion The completion - **/ -static void fetch_vdo_statistics_callback(struct vdo_completion *completion) + * This callback is registered in vdo_fetch_statistics(). + */ +static void vdo_fetch_statistics_callback(struct vdo_completion *completion) { get_vdo_statistics(completion->vdo, completion->parent); - complete_vdo_completion(completion); + vdo_complete_completion(completion); } -/***********************************************************************/ -void fetch_vdo_statistics(struct vdo *vdo, struct vdo_statistics *stats) +/** + * vdo_fetch_statistics() - Fetch statistics on the correct thread. + * @vdo: The vdo. + * @stats: The vdo statistics are returned here. + */ +void vdo_fetch_statistics(struct vdo *vdo, struct vdo_statistics *stats) { - perform_synchronous_vdo_action(vdo, - fetch_vdo_statistics_callback, + vdo_perform_synchronous_action(vdo, + vdo_fetch_statistics_callback, vdo->thread_config->admin_thread, stats); } -/**********************************************************************/ -block_count_t get_vdo_physical_blocks_allocated(const struct vdo *vdo) -{ - return (get_vdo_slab_depot_allocated_blocks(vdo->depot) - - vdo_get_journal_block_map_data_blocks_used(vdo->recovery_journal)); -} - -/**********************************************************************/ -block_count_t get_vdo_physical_blocks_free(const struct vdo *vdo) +/** + * vdo_get_callback_thread_id() - Get the id of the callback thread on which a + * completion is currently running. + * + * Return: The current thread ID, or -1 if no such thread. + */ +thread_id_t vdo_get_callback_thread_id(void) { - return get_vdo_slab_depot_free_blocks(vdo->depot); -} + struct vdo_work_queue *queue = get_current_work_queue(); + struct vdo_thread *thread; + thread_id_t thread_id; -/**********************************************************************/ -block_count_t get_vdo_physical_blocks_overhead(const struct vdo *vdo) -{ - // XXX config.physical_blocks is actually mutated during resize and is in - // a packed structure, but resize runs on admin thread so we're usually - // OK. 
- return (vdo->states.vdo.config.physical_blocks - - get_vdo_slab_depot_data_blocks(vdo->depot) + - vdo_get_journal_block_map_data_blocks_used(vdo->recovery_journal)); -} - -/**********************************************************************/ -const struct thread_config *get_vdo_thread_config(const struct vdo *vdo) -{ - return vdo->thread_config; -} - -/**********************************************************************/ -block_count_t get_vdo_configured_block_map_maximum_age(const struct vdo *vdo) -{ - return vdo->device_config->block_map_maximum_age; -} + if (queue == NULL) { + return VDO_INVALID_THREAD_ID; + } -/**********************************************************************/ -page_count_t get_vdo_configured_cache_size(const struct vdo *vdo) -{ - return vdo->device_config->cache_size; -} + thread = get_work_queue_owner(queue); + thread_id = thread->thread_id; -/**********************************************************************/ -physical_block_number_t get_vdo_first_block_offset(const struct vdo *vdo) -{ - return vdo_get_data_region_start(vdo->geometry); -} + if (PARANOID_THREAD_CONSISTENCY_CHECKS) { + struct vdo *vdo = thread->vdo; -/**********************************************************************/ -struct block_map *get_block_map(const struct vdo *vdo) -{ - return vdo->block_map; -} - -/**********************************************************************/ -struct slab_depot *get_slab_depot(struct vdo *vdo) -{ - return vdo->depot; -} + BUG_ON(thread_id >= vdo->thread_config->thread_count); + BUG_ON(thread != &vdo->threads[thread_id]); + } -/**********************************************************************/ -struct recovery_journal *get_recovery_journal(struct vdo *vdo) -{ - return vdo->recovery_journal; + return thread_id; } -/**********************************************************************/ -void dump_vdo_status(const struct vdo *vdo) +/** + * vdo_dump_status() - Dump status information about a vdo to the log for + * debugging. + * @vdo: The vdo to dump. + */ +void vdo_dump_status(const struct vdo *vdo) { const struct thread_config *thread_config = vdo->thread_config; zone_count_t zone; - dump_vdo_flusher(vdo->flusher); - dump_vdo_recovery_journal_statistics(vdo->recovery_journal); - dump_vdo_packer(vdo->packer); - dump_vdo_slab_depot(vdo->depot); + vdo_dump_flusher(vdo->flusher); + vdo_dump_recovery_journal_statistics(vdo->recovery_journal); + vdo_dump_packer(vdo->packer); + vdo_dump_slab_depot(vdo->depot); for (zone = 0; zone < thread_config->logical_zone_count; zone++) { - dump_vdo_logical_zone(get_vdo_logical_zone(vdo->logical_zones, - zone)); + vdo_dump_logical_zone(&vdo->logical_zones->zones[zone]); } for (zone = 0; zone < thread_config->physical_zone_count; zone++) { - dump_vdo_physical_zone(vdo->physical_zones[zone]); + vdo_dump_physical_zone(&vdo->physical_zones->zones[zone]); } for (zone = 0; zone < thread_config->hash_zone_count; zone++) { - dump_vdo_hash_zone(vdo->hash_zones[zone]); + vdo_dump_hash_zone(&vdo->hash_zones->zones[zone]); } } -/**********************************************************************/ -void assert_on_admin_thread(const struct vdo *vdo, const char *name) +/** + * vdo_assert_on_admin_thread() - Assert that we are running on the admin + * thread. + * @vdo: The vdo. + * @name: The name of the function which should be running on the admin + * thread (for logging). 
+ */ +void vdo_assert_on_admin_thread(const struct vdo *vdo, const char *name) { ASSERT_LOG_ONLY((vdo_get_callback_thread_id() == vdo->thread_config->admin_thread), @@ -707,10 +1140,16 @@ void assert_on_admin_thread(const struct vdo *vdo, const char *name) name); } -/**********************************************************************/ -void assert_on_logical_zone_thread(const struct vdo *vdo, - zone_count_t logical_zone, - const char *name) +/** + * vdo_assert_on_logical_zone_thread() - Assert that this function was called + * on the specified logical zone thread. + * @vdo: The vdo. + * @logical_zone: The number of the logical zone. + * @name: The name of the calling function. + */ +void vdo_assert_on_logical_zone_thread(const struct vdo *vdo, + zone_count_t logical_zone, + const char *name) { ASSERT_LOG_ONLY((vdo_get_callback_thread_id() == vdo_get_logical_zone_thread(vdo->thread_config, @@ -719,10 +1158,17 @@ void assert_on_logical_zone_thread(const struct vdo *vdo, name); } -/**********************************************************************/ -void assert_on_physical_zone_thread(const struct vdo *vdo, - zone_count_t physical_zone, - const char *name) +/** + * vdo_assert_on_physical_zone_thread() - Assert that this function was called + * on the specified physical zone + * thread. + * @vdo: The vdo. + * @physical_zone: The number of the physical zone. + * @name: The name of the calling function. + */ +void vdo_assert_on_physical_zone_thread(const struct vdo *vdo, + zone_count_t physical_zone, + const char *name) { ASSERT_LOG_ONLY((vdo_get_callback_thread_id() == vdo_get_physical_zone_thread(vdo->thread_config, @@ -731,18 +1177,34 @@ void assert_on_physical_zone_thread(const struct vdo *vdo, name); } -/**********************************************************************/ -struct hash_zone *select_hash_zone(const struct vdo *vdo, - const struct uds_chunk_name *name) +void assert_on_vdo_cpu_thread(const struct vdo *vdo, const char *name) +{ + ASSERT_LOG_ONLY((vdo_get_callback_thread_id() == + vdo->thread_config->cpu_thread), + "%s called on cpu thread", + name); +} + +/** + * vdo_select_hash_zone() - Select the hash zone responsible for locking a + * given chunk name. + * @vdo: The vdo containing the hash zones. + * @name: The chunk name. + * + * Return: The hash zone responsible for the chunk name. + */ +struct hash_zone *vdo_select_hash_zone(const struct vdo *vdo, + const struct uds_chunk_name *name) { /* * Use a fragment of the chunk name as a hash code. To ensure uniform * distributions, it must not overlap with fragments used elsewhere. * Eight bits of hash should suffice since the number of hash zones is * small. + * + * XXX Make a central repository for these offsets ala hashUtils. + * XXX Verify that the first byte is independent enough. */ - // XXX Make a central repository for these offsets ala hashUtils. - // XXX Verify that the first byte is independent enough. uint32_t hash = name->name[0]; /* @@ -752,14 +1214,30 @@ struct hash_zone *select_hash_zone(const struct vdo *vdo, * should be uniformly distributed over [0 .. count-1]. The multiply and * shift is much faster than a divide (modulus) on X86 CPUs. 
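	 *
	 * Illustrative aside, not part of this patch, with made-up numbers:
	 * with hash_zone_count = 5 and a first name byte of 200, the line
	 * below computes hash = (200 * 5) >> 8 = 3, so zone 3 owns that
	 * chunk name, and every byte value in [0..255] lands in [0..4]
	 * without a divide.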
*/ - return vdo->hash_zones[(hash * vdo->thread_config->hash_zone_count) - >> 8]; + hash = (hash * vdo->thread_config->hash_zone_count) >> 8; + return &vdo->hash_zones->zones[hash]; } -/**********************************************************************/ -int get_physical_zone(const struct vdo *vdo, - physical_block_number_t pbn, - struct physical_zone **zone_ptr) +/** + * vdo_get_physical_zone() - Get the physical zone responsible for a given + * physical block number. + * @vdo: The vdo containing the physical zones. + * @pbn: The PBN of the data block. + * @zone_ptr: A pointer to return the physical zone. + * + * Gets the physical zone responsible for a given physical block number of a + * data block in this vdo instance, or of the zero block (for which a NULL + * zone is returned). For any other block number that is not in the range of + * valid data block numbers in any slab, an error will be returned. This + * function is safe to call on invalid block numbers; it will not put the vdo + * into read-only mode. + * + * Return: VDO_SUCCESS or VDO_OUT_OF_RANGE if the block number is invalid + * or an error code for any other failure. + */ +int vdo_get_physical_zone(const struct vdo *vdo, + physical_block_number_t pbn, + struct physical_zone **zone_ptr) { struct vdo_slab *slab; int result; @@ -769,62 +1247,43 @@ int get_physical_zone(const struct vdo *vdo, return VDO_SUCCESS; } - // Used because it does a more restrictive bounds check than - // get_vdo_slab(), and done first because it won't trigger read-only - // mode on an invalid PBN. + /* + * Used because it does a more restrictive bounds check than + * vdo_get_slab(), and done first because it won't trigger read-only + * mode on an invalid PBN. + */ if (!vdo_is_physical_data_block(vdo->depot, pbn)) { return VDO_OUT_OF_RANGE; } - // With the PBN already checked, we should always succeed in finding a - // slab. - slab = get_vdo_slab(vdo->depot, pbn); + /* + * With the PBN already checked, we should always succeed in finding a + * slab. + */ + slab = vdo_get_slab(vdo->depot, pbn); result = - ASSERT(slab != NULL, "get_vdo_slab must succeed on all valid PBNs"); + ASSERT(slab != NULL, "vdo_get_slab must succeed on all valid PBNs"); if (result != VDO_SUCCESS) { return result; } - *zone_ptr = vdo->physical_zones[get_vdo_slab_zone_number(slab)]; + *zone_ptr = + &vdo->physical_zones->zones[vdo_get_slab_zone_number(slab)]; return VDO_SUCCESS; } -/**********************************************************************/ -struct zoned_pbn -vdo_validate_dedupe_advice(struct vdo *vdo, - const struct data_location *advice, - logical_block_number_t lbn) +/** + * vdo_get_bio_zone() - Get the bio queue zone for submitting I/O to a given + * physical block number. + * @vdo: The vdo to query. + * @pbn: The physical block number of the I/O to be sent. + * + * Return: The bio queue zone number for submitting I/O to the specified pbn. + */ +zone_count_t +vdo_get_bio_zone(const struct vdo *vdo, physical_block_number_t pbn) { - struct zoned_pbn no_advice = { .pbn = VDO_ZERO_BLOCK }; - struct physical_zone *zone; - int result; - - if (advice == NULL) { - return no_advice; - } - - // Don't use advice that's clearly meaningless. - if ((advice->state == VDO_MAPPING_STATE_UNMAPPED) || - (advice->pbn == VDO_ZERO_BLOCK)) { - uds_log_debug("Invalid advice from deduplication server: pbn %llu, state %u. 
Giving up on deduplication of logical block %llu", - (unsigned long long) advice->pbn, advice->state, - (unsigned long long) lbn); - atomic64_inc(&vdo->stats.invalid_advice_pbn_count); - return no_advice; - } - - result = get_physical_zone(vdo, advice->pbn, &zone); - if ((result != VDO_SUCCESS) || (zone == NULL)) { - uds_log_debug("Invalid physical block number from deduplication server: %llu, giving up on deduplication of logical block %llu", - (unsigned long long) advice->pbn, - (unsigned long long) lbn); - atomic64_inc(&vdo->stats.invalid_advice_pbn_count); - return no_advice; - } - - return (struct zoned_pbn) { - .pbn = advice->pbn, - .state = advice->state, - .zone = zone, - }; + return ((pbn + / vdo->device_config->thread_counts.bio_rotation_interval) + % vdo->thread_config->bio_thread_count); } diff --git a/vdo/vdo.h b/vdo/vdo.h index 45d79d2d..09a31625 100644 --- a/vdo/vdo.h +++ b/vdo/vdo.h @@ -1,214 +1,280 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ /* * Copyright Red Hat - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/sulfur/src/c++/vdo/base/vdo.h#49 $ */ #ifndef VDO_H #define VDO_H +#include #include +#include +#include +#include +#include "thread-registry.h" + +#include "admin-completion.h" +#include "admin-state.h" +#include "atomic-stats.h" +#include "data-vio-pool.h" +#include "device-config.h" +#include "header.h" +#include "packer.h" +#include "statistics.h" +#include "super-block.h" +#include "read-only-notifier.h" +#include "thread-config.h" #include "types.h" +#include "uds.h" +#include "vdo-component.h" +#include "vdo-component-states.h" +#include "vdo-layout.h" +#include "volume-geometry.h" -/** - * Destroy a vdo instance. - * - * @param vdo The vdo to destroy - **/ -void destroy_vdo(struct vdo *vdo); -/** - * Add the stats directory to the vdo sysfs directory. - * - * @param vdo The vdo - * - * @return VDO_SUCCESS or an error - **/ -int __must_check add_vdo_sysfs_stats_dir(struct vdo *vdo); +struct vdo_thread { + struct vdo *vdo; + thread_id_t thread_id; + struct vdo_work_queue *queue; + struct registered_thread allocating_thread; +}; + +struct vdo { + char thread_name_prefix[MAX_VDO_WORK_QUEUE_NAME_LEN]; + struct vdo_thread *threads; + vdo_action *action; + struct vdo_completion *completion; + struct vio_tracer *vio_tracer; + + /* The connection to the UDS index */ + struct dedupe_index *dedupe_index; + + /* The atomic version of the state of this vdo */ + atomic_t state; + /* The full state of all components */ + struct vdo_component_states states; + /* + * A counter value to attach to thread names and log messages to + * identify the individual device. 
+ */ + unsigned int instance; + /* The read-only notifier */ + struct read_only_notifier *read_only_notifier; + /* The load-time configuration of this vdo */ + struct device_config *device_config; + /* The thread mapping */ + struct thread_config *thread_config; + + /* The super block */ + struct vdo_super_block *super_block; + + /* Our partitioning of the physical layer's storage */ + struct vdo_layout *layout; + + /* The block map */ + struct block_map *block_map; + + /* The journal for block map recovery */ + struct recovery_journal *recovery_journal; + + /* The slab depot */ + struct slab_depot *depot; + + /* The compressed-block packer */ + struct packer *packer; + /* Whether incoming data should be compressed */ + bool compressing; + + /* The handler for flush requests */ + struct flusher *flusher; + + /* The state the vdo was in when loaded (primarily for unit tests) */ + enum vdo_state load_state; + + /* The logical zones of this vdo */ + struct logical_zones *logical_zones; + + /* The physical zones of this vdo */ + struct physical_zones *physical_zones; + + /* The hash lock zones of this vdo */ + struct hash_zones *hash_zones; + + /* + * Bio submission manager used for sending bios to the storage + * device. + */ + struct io_submitter *io_submitter; + + /* The pool of data_vios for servicing incoming bios */ + struct data_vio_pool *data_vio_pool; + + /* The completion for administrative operations */ + struct admin_completion admin_completion; + + /* The administrative state of the vdo */ + struct admin_state admin_state; + + /* Flags controlling administrative operations */ + const struct admin_state_code *suspend_type; + bool allocations_allowed; + bool dump_on_shutdown; + atomic_t processing_message; + + /* + * Statistics + * Atomic stats counters + */ + struct atomic_statistics stats; + /* Used to gather statistics without allocating memory */ + struct vdo_statistics stats_buffer; + /* Protects the stats_buffer */ + struct mutex stats_mutex; + /* true if sysfs directory is set up */ + bool sysfs_added; + /* Used when shutting down the sysfs statistics */ + struct completion stats_shutdown; + + + /* A list of all device_configs referencing this vdo */ + struct list_head device_config_list; + + /* This VDO's list entry for the device registry */ + struct list_head registration; + + /* Underlying block device info. */ + uint64_t starting_sector_offset; + struct volume_geometry geometry; + + /* For sysfs */ + struct kobject vdo_directory; + struct kobject stats_directory; + + /* N blobs of context data for LZ4 code, one per CPU thread. */ + char **compression_context; +}; -/** - * Get the block device object underlying a vdo. - * - * @param vdo The vdo - * - * @return The vdo's current block device - **/ -struct block_device * __must_check -get_vdo_backing_device(const struct vdo *vdo); /** - * Issue a flush request and wait for it to complete. + * vdo_uses_bio_ack_queue() - Indicate whether the vdo is configured to use a + * separate work queue for acknowledging received + * and processed bios. + * @vdo: The vdo. * - * @param vdo The vdo + * Note that this directly controls the handling of write operations, but the + * compile-time flag VDO_USE_BIO_ACK_QUEUE_FOR_READ is also checked for read + * operations. * - * @return VDO_SUCCESS or an error + * Return: Whether a bio-acknowledgement work queue is in use. 
*/ +static inline bool vdo_uses_bio_ack_queue(struct vdo *vdo) +{ + return vdo->device_config->thread_counts.bio_ack_threads > 0; +} + +int __must_check +vdo_make_thread(struct vdo *vdo, + thread_id_t thread_id, + const struct vdo_work_queue_type *type, + unsigned int queue_count, + void *contexts[]); + +static inline int __must_check +vdo_make_default_thread(struct vdo *vdo, thread_id_t thread_id) +{ + return vdo_make_thread(vdo, thread_id, NULL, 1, NULL); +} + +int __must_check +vdo_make(unsigned int instance, + struct device_config *config, + char **reason, + struct vdo **vdo_ptr); + +void vdo_destroy(struct vdo *vdo); + +int __must_check vdo_add_sysfs_stats_dir(struct vdo *vdo); + +int __must_check +vdo_prepare_to_modify(struct vdo *vdo, + struct device_config *config, + bool may_grow, + char **error_ptr); + +struct block_device * __must_check +vdo_get_backing_device(const struct vdo *vdo); + +const char * __must_check +vdo_get_device_name(const struct dm_target *target); + int __must_check vdo_synchronous_flush(struct vdo *vdo); -/** - * Get the admin state of the vdo. - * - * @param vdo The vdo - * - * @return The code for the vdo's current admin state - **/ const struct admin_state_code * __must_check -get_vdo_admin_state(const struct vdo *vdo); +vdo_get_admin_state(const struct vdo *vdo); -/** - * Turn compression on or off. - * - * @param vdo The vdo - * @param enable Whether to enable or disable compression - * - * @return Whether compression was previously on or off - **/ -bool set_vdo_compressing(struct vdo *vdo, bool enable); +bool vdo_set_compressing(struct vdo *vdo, bool enable); -/** - * Get whether compression is enabled in a vdo. - * - * @param vdo The vdo - * - * @return State of compression - **/ -bool get_vdo_compressing(struct vdo *vdo); +bool vdo_get_compressing(struct vdo *vdo); -/** - * Fetch statistics on the correct thread. - * - * @param [in] vdo The vdo - * @param [out] stats The vdo statistics are returned here - **/ -void fetch_vdo_statistics(struct vdo *vdo, struct vdo_statistics *stats); +void vdo_fetch_statistics(struct vdo *vdo, struct vdo_statistics *stats); -/** - * Get the number of physical blocks in use by user data. - * - * @param vdo The vdo - * - * @return The number of blocks allocated for user data - **/ -block_count_t __must_check -get_vdo_physical_blocks_allocated(const struct vdo *vdo); +thread_id_t vdo_get_callback_thread_id(void); -/** - * Get the number of unallocated physical blocks. - * - * @param vdo The vdo - * - * @return The number of free blocks - **/ -block_count_t __must_check get_vdo_physical_blocks_free(const struct vdo *vdo); +enum vdo_state __must_check vdo_get_state(const struct vdo *vdo); -/** - * Get the number of physical blocks used by vdo metadata. - * - * @param vdo The vdo - * - * @return The number of overhead blocks - **/ -block_count_t __must_check get_vdo_physical_blocks_overhead(const struct vdo *vdo); +void vdo_set_state(struct vdo *vdo, enum vdo_state state); -/** - * Get the thread config of the vdo. - * - * @param vdo The vdo - * - * @return The thread config - **/ -const struct thread_config * __must_check -get_vdo_thread_config(const struct vdo *vdo); +void vdo_save_components(struct vdo *vdo, struct vdo_completion *parent); -/** - * Get the id of the callback thread on which a completion is currently - * running, or -1 if no such thread. 
- * - * @return the current thread ID - **/ -thread_id_t vdo_get_callback_thread_id(void); +int vdo_enable_read_only_entry(struct vdo *vdo); -/** - * Get the configured maximum age of a dirty block map page. - * - * @param vdo The vdo - * - * @return The block map era length - **/ -block_count_t __must_check -get_vdo_configured_block_map_maximum_age(const struct vdo *vdo); +bool __must_check vdo_in_read_only_mode(const struct vdo *vdo); -/** - * Get the configured page cache size of the vdo. - * - * @param vdo The vdo - * - * @return The number of pages for the page cache - **/ -page_count_t __must_check get_vdo_configured_cache_size(const struct vdo *vdo); +bool __must_check vdo_in_recovery_mode(const struct vdo *vdo); -/** - * Get the location of the first block of the vdo. - * - * @param vdo The vdo - * - * @return The location of the first block managed by the vdo - **/ -physical_block_number_t __must_check -get_vdo_first_block_offset(const struct vdo *vdo); +void vdo_enter_recovery_mode(struct vdo *vdo); -/** - * Check whether the vdo was new when it was loaded. - * - * @param vdo The vdo to query - * - * @return true if the vdo was new - **/ -bool __must_check vdo_was_new(const struct vdo *vdo); +void vdo_assert_on_admin_thread(const struct vdo *vdo, const char *name); -/** - * Check whether a data_location containing potential dedupe advice is - * well-formed and addresses a data block in one of the configured physical - * zones of the vdo. If it is, return the location and zone as a zoned_pbn; - * otherwise increment statistics tracking invalid advice and return an - * unmapped zoned_pbn. - * - * @param vdo The vdo - * @param advice The advice to validate (NULL indicates no advice) - * @param lbn The logical block number of the write that requested advice, - * which is only used for debug-level logging of invalid advice - * - * @return The zoned_pbn representing the advice, if valid, otherwise an - * unmapped zoned_pbn if the advice was invalid or NULL - **/ -struct zoned_pbn __must_check -vdo_validate_dedupe_advice(struct vdo *vdo, - const struct data_location *advice, - logical_block_number_t lbn); +void vdo_assert_on_logical_zone_thread(const struct vdo *vdo, + zone_count_t logical_zone, + const char *name); -// TEST SUPPORT ONLY BEYOND THIS POINT +void vdo_assert_on_physical_zone_thread(const struct vdo *vdo, + zone_count_t physical_zone, + const char *name); -/** - * Dump status information about a vdo to the log for debugging. - * - * @param vdo The vdo to dump - **/ -void dump_vdo_status(const struct vdo *vdo); +static inline void vdo_assert_on_dedupe_thread(const struct vdo *vdo, + const char *name) { + ASSERT_LOG_ONLY((vdo_get_callback_thread_id() == + vdo->thread_config->dedupe_thread), + "%s called on dedupe index thread", + name); +} + +void assert_on_vdo_cpu_thread(const struct vdo *vdo, const char *name); + +struct hash_zone * __must_check +vdo_select_hash_zone(const struct vdo *vdo, const struct uds_chunk_name *name); + +int __must_check vdo_get_physical_zone(const struct vdo *vdo, + physical_block_number_t pbn, + struct physical_zone **zone_ptr); + +zone_count_t __must_check +vdo_get_bio_zone(const struct vdo *vdo, physical_block_number_t pbn); + +void vdo_dump_status(const struct vdo *vdo); + + +/* + * We start with 0L and postcondition with ~0L to match our historical usage + * in userspace. 
+ */ +static inline u32 vdo_crc32(const void *buf, unsigned long len) +{ + return (crc32(0L, buf, len) ^ ~0L); +} #endif /* VDO_H */ diff --git a/vdo/vdoCommon.h b/vdo/vdoCommon.h deleted file mode 100644 index 01e029e7..00000000 --- a/vdo/vdoCommon.h +++ /dev/null @@ -1,30 +0,0 @@ -/* - * Copyright Red Hat - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/sulfur/src/c++/vdo/kernel/vdoCommon.h#3 $ - */ - -#ifndef VDO_COMMON_H -#define VDO_COMMON_H - -enum { - // Whether the bio acknowledgement queue is used for acks of reads. - VDO_USE_BIO_ACK_QUEUE_FOR_READ = 0, -}; - -#endif /* VDO_COMMON_H */ diff --git a/vdo/vdoComponent.h b/vdo/vdoComponent.h deleted file mode 100644 index 47b5e616..00000000 --- a/vdo/vdoComponent.h +++ /dev/null @@ -1,98 +0,0 @@ -/* - * Copyright Red Hat - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/sulfur/src/c++/vdo/base/vdoComponent.h#9 $ - */ - -#ifndef VDO_COMPONENT_H -#define VDO_COMPONENT_H - -#include "buffer.h" - -#include "types.h" -#include "vdoState.h" - -/** - * The configuration of the VDO service. - **/ -struct vdo_config { - block_count_t logical_blocks; ///< number of logical blocks - block_count_t physical_blocks; ///< number of physical blocks - block_count_t slab_size; ///< number of blocks in a slab - block_count_t recovery_journal_size; ///< number of recovery journal blocks - block_count_t slab_journal_blocks; ///< number of slab journal blocks -}; - -/** - * This is the structure that captures the vdo fields saved as a super block - * component. - **/ -struct vdo_component { - enum vdo_state state; - uint64_t complete_recoveries; - uint64_t read_only_recoveries; - struct vdo_config config; - nonce_t nonce; -}; - -/** - * Get the size of the encoded state of the vdo itself. - * - * @return the encoded size of the vdo's state - **/ -size_t __must_check get_vdo_component_encoded_size(void); - -/** - * Encode the component data for the vdo itself. 
- * - * @param component The component structure - * @param buffer The buffer in which to encode the vdo - * - * @return VDO_SUCCESS or an error - **/ -int __must_check -encode_vdo_component(struct vdo_component component, struct buffer *buffer); - -/** - * Decode the component data for the vdo itself from the component data buffer - * in the super block. - * - * @param buffer The buffer being decoded - * @param component The component structure in which to store - * the result of a successful decode - * - * @return VDO_SUCCESS or an error - **/ -int __must_check -decode_vdo_component(struct buffer *buffer, struct vdo_component *component); - -/** - * Validate constraints on a VDO config. - * - * @param config The VDO config - * @param block_count The block count of the VDO - * @param require_logical Set to true if the number logical blocks - * must be configured (otherwise, it may be zero) - * - * @return a success or error code - **/ -int __must_check validate_vdo_config(const struct vdo_config *config, - block_count_t block_count, - bool require_logical); - -#endif /* VDO_COMPONENT_H */ diff --git a/vdo/vdoComponentStates.c b/vdo/vdoComponentStates.c deleted file mode 100644 index ae75098d..00000000 --- a/vdo/vdoComponentStates.c +++ /dev/null @@ -1,220 +0,0 @@ -/* - * Copyright Red Hat - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/sulfur/src/c++/vdo/base/vdoComponentStates.c#22 $ - */ - -#include "vdoComponentStates.h" - -#include "logger.h" -#include "memoryAlloc.h" -#include "permassert.h" - -#include "blockMapFormat.h" -#include "constants.h" -#include "fixedLayout.h" -#include "numUtils.h" -#include "recoveryJournalFormat.h" -#include "slabDepotFormat.h" -#include "statusCodes.h" -#include "types.h" -#include "vdoComponent.h" - -const struct version_number VDO_VOLUME_VERSION_67_0 = { - .major_version = 67, - .minor_version = 0, -}; - -/**********************************************************************/ -void destroy_vdo_component_states(struct vdo_component_states *states) -{ - if (states == NULL) { - return; - } - - free_vdo_fixed_layout(UDS_FORGET(states->layout)); -} - -/** - * Decode the components now that we know the component data is a version we - * understand. 
- * - * @param buffer The buffer being decoded - * @param states An object to hold the successfully decoded state - * - * @return VDO_SUCCESS or an error - **/ -static int __must_check -decode_components(struct buffer *buffer, struct vdo_component_states *states) -{ - int result = decode_vdo_component(buffer, &states->vdo); - if (result != VDO_SUCCESS) { - return result; - } - - result = decode_vdo_fixed_layout(buffer, &states->layout); - if (result != VDO_SUCCESS) { - return result; - } - - result = decode_vdo_recovery_journal_state_7_0(buffer, - &states->recovery_journal); - if (result != VDO_SUCCESS) { - return result; - } - - result = decode_vdo_slab_depot_state_2_0(buffer, &states->slab_depot); - if (result != VDO_SUCCESS) { - return result; - } - - result = decode_vdo_block_map_state_2_0(buffer, &states->block_map); - if (result != VDO_SUCCESS) { - return result; - } - - ASSERT_LOG_ONLY((content_length(buffer) == 0), - "All decoded component data was used"); - return VDO_SUCCESS; -} - -/**********************************************************************/ -int decode_vdo_component_states(struct buffer *buffer, - release_version_number_t expected_release_version, - struct vdo_component_states *states) -{ - // Check the release version against the one from the geometry. - int result = get_uint32_le_from_buffer(buffer, - &states->release_version); - if (result != VDO_SUCCESS) { - return result; - } - - if (states->release_version != expected_release_version) { - return uds_log_error_strerror(VDO_UNSUPPORTED_VERSION, - "Geometry release version %u does not match super block release version %u", - expected_release_version, - states->release_version); - } - - // Check the VDO volume version - result = decode_vdo_version_number(buffer, &states->volume_version); - if (result != VDO_SUCCESS) { - return result; - } - - result = validate_vdo_version(VDO_VOLUME_VERSION_67_0, - states->volume_version, - "volume"); - if (result != VDO_SUCCESS) { - return result; - } - - result = decode_components(buffer, states); - if (result != VDO_SUCCESS) { - free_vdo_fixed_layout(UDS_FORGET(states->layout)); - return result; - } - - return VDO_SUCCESS; -} - -/**********************************************************************/ -int validate_vdo_component_states(struct vdo_component_states *states, - nonce_t geometry_nonce, - block_count_t size) -{ - if (geometry_nonce != states->vdo.nonce) { - return uds_log_error_strerror(VDO_BAD_NONCE, - "Geometry nonce %llu does not match superblock nonce %llu", - (unsigned long long) geometry_nonce, - (unsigned long long) states->vdo.nonce); - } - - return validate_vdo_config(&states->vdo.config, size, true); -} - -/** - * Get the component data size of a vdo. 
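
decode_vdo_component_states() and validate_vdo_component_states() above cross-check the super block against the geometry block: the release version recorded in each must match, and so must the nonce, or the load is refused. A self-contained sketch of those two checks, in which the struct layouts, field values, and error codes are all stand-ins rather than the VDO definitions:

::

    #include <stdint.h>
    #include <stdio.h>

    enum {
        SKETCH_SUCCESS             =  0,
        SKETCH_UNSUPPORTED_VERSION = -1,   /* stand-in for VDO_UNSUPPORTED_VERSION */
        SKETCH_BAD_NONCE           = -2,   /* stand-in for VDO_BAD_NONCE */
    };

    struct geometry    { uint32_t release_version; uint64_t nonce; };
    struct super_block { uint32_t release_version; uint64_t nonce; };

    static int validate(const struct geometry *g, const struct super_block *sb)
    {
        if (sb->release_version != g->release_version) {
            fprintf(stderr, "geometry release %u != super block release %u\n",
                    (unsigned) g->release_version,
                    (unsigned) sb->release_version);
            return SKETCH_UNSUPPORTED_VERSION;
        }
        if (sb->nonce != g->nonce) {
            fprintf(stderr, "geometry nonce %llu != super block nonce %llu\n",
                    (unsigned long long) g->nonce,
                    (unsigned long long) sb->nonce);
            return SKETCH_BAD_NONCE;
        }
        return SKETCH_SUCCESS;
    }

    int main(void)
    {
        struct geometry    g  = { .release_version = 100, .nonce = 42 };
        struct super_block sb = { .release_version = 100, .nonce = 42 };

        return validate(&g, &sb) == SKETCH_SUCCESS ? 0 : 1;
    }
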
- * - * @param layout The layout of the vdo - * - * @return the component data size of the vdo - **/ -static size_t __must_check get_component_data_size(struct fixed_layout *layout) -{ - return (sizeof(release_version_number_t) + - sizeof(struct packed_version_number) + - get_vdo_component_encoded_size() + - get_vdo_fixed_layout_encoded_size(layout) + - get_vdo_recovery_journal_encoded_size() + - get_vdo_slab_depot_encoded_size() + - get_vdo_block_map_encoded_size()); -} - -/**********************************************************************/ -int encode_vdo_component_states(struct buffer *buffer, - const struct vdo_component_states *states) -{ - size_t expected_size; - int result = reset_buffer_end(buffer, 0); - if (result != UDS_SUCCESS) { - return result; - } - - result = put_uint32_le_into_buffer(buffer, states->release_version); - if (result != UDS_SUCCESS) { - return result; - } - - result = encode_vdo_version_number(states->volume_version, buffer); - if (result != VDO_SUCCESS) { - return result; - } - - result = encode_vdo_component(states->vdo, buffer); - if (result != VDO_SUCCESS) { - return result; - } - - result = encode_vdo_fixed_layout(states->layout, buffer); - if (result != VDO_SUCCESS) { - return result; - } - - result = encode_vdo_recovery_journal_state_7_0(states->recovery_journal, - buffer); - if (result != VDO_SUCCESS) { - return result; - } - - result = encode_vdo_slab_depot_state_2_0(states->slab_depot, buffer); - if (result != VDO_SUCCESS) { - return result; - } - - result = encode_vdo_block_map_state_2_0(states->block_map, buffer); - if (result != VDO_SUCCESS) { - return result; - } - - expected_size = get_component_data_size(states->layout); - ASSERT_LOG_ONLY((content_length(buffer) == expected_size), - "All super block component data was encoded"); - return VDO_SUCCESS; -} diff --git a/vdo/vdoComponentStates.h b/vdo/vdoComponentStates.h deleted file mode 100644 index 368528db..00000000 --- a/vdo/vdoComponentStates.h +++ /dev/null @@ -1,117 +0,0 @@ -/* - * Copyright Red Hat - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/sulfur/src/c++/vdo/base/vdoComponentStates.h#12 $ - */ - -#ifndef VDO_COMPONENT_STATES_H -#define VDO_COMPONENT_STATES_H - -#include "blockMapFormat.h" -#include "fixedLayout.h" -#include "recoveryJournalFormat.h" -#include "slabDepotFormat.h" -#include "types.h" -#include "vdoComponent.h" -#include "vdoState.h" - -/** - * The version of the on-disk format of a VDO volume. This should be - * incremented any time the on-disk representation of any VDO structure - * changes. Changes which require only online upgrade steps should increment - * the minor version. Changes which require an offline upgrade or which can not - * be upgraded to at all should increment the major version and set the minor - * version to 0. 
- **/ -extern const struct version_number VDO_VOLUME_VERSION_67_0; - -/** - * The entirety of the component data encoded in the VDO super block. - **/ -struct vdo_component_states { - /* The release version */ - release_version_number_t release_version; - - /* The VDO volume version */ - struct version_number volume_version; - - /* Components */ - struct vdo_component vdo; - struct block_map_state_2_0 block_map; - struct recovery_journal_state_7_0 recovery_journal; - struct slab_depot_state_2_0 slab_depot; - - /* Our partitioning of the underlying storage */ - struct fixed_layout *layout; -}; - -/** - * Clean up any allocations in a vdo_component_states. - * - * @param states The component states to destroy - **/ -void destroy_vdo_component_states(struct vdo_component_states *states); - -/** - * Decode the payload of a super block. - * - * @param buffer The buffer containing the encoded super - * block contents - * @param expected_release_version The required release version - * @param states A pointer to hold the decoded states - * - * @return VDO_SUCCESS or an error - **/ -int __must_check -decode_vdo_component_states(struct buffer *buffer, - release_version_number_t expected_release_version, - struct vdo_component_states *states); - -/** - * Validate the decoded super block configuration. - * - * @param states The state decoded from the super block - * @param geometry_nonce The nonce from the geometry block - * @param size The size of underlying storage - * - * @return VDO_SUCCESS or an error if the configuration is invalid - **/ -int __must_check -validate_vdo_component_states(struct vdo_component_states *states, - nonce_t geometry_nonce, - block_count_t size); - -/** - * Encode a VDO super block into a buffer for writing in the super block. - * - * @param buffer The buffer to encode into - * @param states The states of the vdo to be encoded - **/ -int __must_check -encode_vdo(struct buffer *buffer, struct vdo_component_states *states); - -/** - * Encode the state of all vdo components for writing in the super block. - * - * @param buffer The buffer to encode into - * @param states The states to encode - **/ -int encode_vdo_component_states(struct buffer *buffer, - const struct vdo_component_states *states); - -#endif /* VDO_COMPONENT_STATES_H */ diff --git a/vdo/vdoInit.c b/vdo/vdoInit.c deleted file mode 100644 index 4f144a75..00000000 --- a/vdo/vdoInit.c +++ /dev/null @@ -1,146 +0,0 @@ -/* - * Copyright Red Hat - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. 
- * - * $Id: //eng/vdo-releases/sulfur/src/c++/vdo/base/vdoInit.c#32 $ - */ - -#include "vdoInit.h" - -#include -#include -#include -#include -#include - -#include "logger.h" -#include "memoryAlloc.h" - -#include "adminCompletion.h" -#include "deviceRegistry.h" -#include "instanceNumber.h" -#include "limiter.h" -#include "poolSysfs.h" -#include "types.h" -#include "vdoInternal.h" -#include "volumeGeometry.h" - -/**********************************************************************/ -const char *get_vdo_device_name(const struct dm_target *target) -{ - return dm_device_name(dm_table_get_md(target->table)); -} - -/** - * Allocate a vdos threads, queues, and other structures which scale with the - * thread config. - * - * @param vdo The vdo being initialized - * @param reason A pointer to hold an error message on failure - * - * @return VDO_SUCCESS or an error - **/ -static int allocate_vdo_threads(struct vdo *vdo, char **reason) -{ - int i; - struct device_config *config = vdo->device_config; - int result - = make_vdo_thread_config(config->thread_counts.logical_zones, - config->thread_counts.physical_zones, - config->thread_counts.hash_zones, - &vdo->thread_config); - if (result != VDO_SUCCESS) { - *reason = "Cannot create thread configuration"; - return result; - } - - uds_log_info("zones: %d logical, %d physical, %d hash; base threads: %d", - config->thread_counts.logical_zones, - config->thread_counts.physical_zones, - config->thread_counts.hash_zones, - vdo->thread_config->base_thread_count); - - // Compression context storage - result = UDS_ALLOCATE(config->thread_counts.cpu_threads, - char *, - "LZ4 context", - &vdo->compression_context); - if (result != VDO_SUCCESS) { - *reason = "cannot allocate LZ4 context"; - return result; - } - - for (i = 0; i < config->thread_counts.cpu_threads; i++) { - result = UDS_ALLOCATE(LZ4_MEM_COMPRESS, - char, - "LZ4 context", - &vdo->compression_context[i]); - if (result != VDO_SUCCESS) { - *reason = "cannot allocate LZ4 context"; - return result; - } - } - - return VDO_SUCCESS; -} - -/**********************************************************************/ -int initialize_vdo(struct vdo *vdo, - struct device_config *config, - unsigned int instance, - char **reason) -{ - int result; - - vdo->device_config = config; - vdo->starting_sector_offset = config->owning_target->begin; - vdo->instance = instance; - vdo->allocations_allowed = true; - set_vdo_admin_state_code(&vdo->admin_state, VDO_ADMIN_STATE_NEW); - INIT_LIST_HEAD(&vdo->device_config_list); - initialize_vdo_admin_completion(vdo, &vdo->admin_completion); - mutex_init(&vdo->stats_mutex); - initialize_limiter(&vdo->request_limiter, MAXIMUM_VDO_USER_VIOS); - initialize_limiter(&vdo->discard_limiter, - MAXIMUM_VDO_USER_VIOS * 3 / 4); - - initialize_vdo_deadlock_queue(&vdo->deadlock_queue); - result = vdo_read_geometry_block(get_vdo_backing_device(vdo), - &vdo->geometry); - if (result != VDO_SUCCESS) { - *reason = "Could not load geometry block"; - destroy_vdo(vdo); - return result; - } - - result = allocate_vdo_threads(vdo, reason); - if (result != VDO_SUCCESS) { - destroy_vdo(vdo); - return result; - } - - result = register_vdo(vdo); - if (result != VDO_SUCCESS) { - *reason = "Cannot add VDO to device registry"; - destroy_vdo(vdo); - return result; - } - - set_vdo_admin_state_code(&vdo->admin_state, - VDO_ADMIN_STATE_INITIALIZED); - return result; -} diff --git a/vdo/vdoInit.h b/vdo/vdoInit.h deleted file mode 100644 index 2dab6dcf..00000000 --- a/vdo/vdoInit.h +++ /dev/null @@ -1,58 +0,0 @@ -/* - * 
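
allocate_vdo_threads() above sizes one LZ4 scratch buffer per configured CPU thread, so compression work memory is never shared between threads and needs no locking. A userspace sketch of that per-thread context pattern; the thread count, context size, and helper names are stand-ins, not the kernel LZ4 constants:

::

    #include <stdlib.h>
    #include <string.h>

    #define SKETCH_CPU_THREADS 4
    #define SKETCH_CTX_SIZE    16384   /* stand-in for LZ4_MEM_COMPRESS */

    static char *contexts[SKETCH_CPU_THREADS];

    /* Allocate one private scratch buffer per CPU thread, or none at all. */
    static int allocate_contexts(void)
    {
        for (int i = 0; i < SKETCH_CPU_THREADS; i++) {
            contexts[i] = calloc(1, SKETCH_CTX_SIZE);
            if (contexts[i] == NULL) {
                while (i-- > 0)
                    free(contexts[i]);
                return -1;
            }
        }
        return 0;
    }

    /* Each CPU thread indexes by its own number; buffers are never shared. */
    static char *context_for_thread(int cpu_thread)
    {
        return contexts[cpu_thread];
    }

    int main(void)
    {
        if (allocate_contexts() != 0)
            return 1;
        memset(context_for_thread(2), 0, SKETCH_CTX_SIZE);
        for (int i = 0; i < SKETCH_CPU_THREADS; i++)
            free(contexts[i]);
        return 0;
    }
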
Copyright Red Hat - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/sulfur/src/c++/vdo/base/vdoInit.h#7 $ - */ - -#ifndef VDO_INIT_H -#define VDO_INIT_H - -#include -#include - -#include "deviceConfig.h" -#include "types.h" - - -/** - * Get the device name associated with the vdo target - * - * @param target The target device interface - * - * @return The block device name - **/ -const char * __must_check -get_vdo_device_name(const struct dm_target *target); - -/** - * Perform the first steps in initializing a vdo as part of device creation. - * - * @param vdo The vdo being initialized - * @param config The configuration of the vdo being initialized - * @param instance The device instantiation counter - * @param reason A pointer to hold an error message on failure - * - * @return VDO_SUCCESS or an error code - **/ -int __must_check -initialize_vdo(struct vdo *vdo, - struct device_config *config, - unsigned int instance, - char **reason); - -#endif // VDO_INIT_H diff --git a/vdo/vdoInternal.h b/vdo/vdoInternal.h deleted file mode 100644 index 5b44ce8a..00000000 --- a/vdo/vdoInternal.h +++ /dev/null @@ -1,409 +0,0 @@ -/* - * Copyright Red Hat - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. 
- * - * $Id: //eng/vdo-releases/sulfur/src/c++/vdo/base/vdoInternal.h#21 $ - */ - -#ifndef VDO_INTERNAL_H -#define VDO_INTERNAL_H - -#include "vdo.h" - -#include -#include -#include - -#include "deadlockQueue.h" -#include "threadRegistry.h" - -#include "adminCompletion.h" -#include "adminState.h" -#include "atomicStats.h" -#include "deviceConfig.h" -#include "header.h" -#include "limiter.h" -#include "packer.h" -#include "statistics.h" -#include "superBlock.h" -#include "readOnlyNotifier.h" -#include "types.h" -#include "uds.h" -#include "vdoComponent.h" -#include "vdoComponentStates.h" -#include "vdoLayout.h" -#include "vdoState.h" -#include "volumeGeometry.h" - - -struct vdo_thread { - struct vdo *vdo; - thread_id_t thread_id; - struct vdo_work_queue *request_queue; - struct registered_thread allocating_thread; -}; - -struct vdo { - struct vdo_thread *threads; - thread_id_t initialized_thread_count; - struct vdo_work_item work_item; - vdo_action *action; - struct vdo_completion *completion; - - /** Incoming bios we've had to buffer to avoid deadlock. */ - struct deadlock_queue deadlock_queue; - - /** - * Bio submission manager used for sending bios to the storage - * device. - **/ - struct io_submitter *io_submitter; - /** - * Work queue (possibly with multiple threads) for miscellaneous - * CPU-intensive, non-blocking work. - **/ - struct vdo_work_queue *cpu_queue; - /** Optional work queue for calling bio_endio. */ - struct vdo_work_queue *bio_ack_queue; - /** The connection to the UDS index */ - struct dedupe_index *dedupe_index; - /** The pool of data_vios for handling incoming bios */ - struct buffer_pool *data_vio_pool; - /* For returning batches of data_vios to their pool */ - struct batch_processor *data_vio_releaser; - - /* The atomic version of the state of this vdo */ - atomic_t state; - /* The full state of all components */ - struct vdo_component_states states; - /** - * A counter value to attach to thread names and log messages to - * identify the individual device. 
- **/ - unsigned int instance; - /* The read-only notifier */ - struct read_only_notifier *read_only_notifier; - /* The load-time configuration of this vdo */ - struct device_config *device_config; - /* The thread mapping */ - struct thread_config *thread_config; - - /* The super block */ - struct vdo_super_block *super_block; - - /* Our partitioning of the physical layer's storage */ - struct vdo_layout *layout; - - /* The block map */ - struct block_map *block_map; - - /* The journal for block map recovery */ - struct recovery_journal *recovery_journal; - - /* The slab depot */ - struct slab_depot *depot; - - /* The compressed-block packer */ - struct packer *packer; - /* Whether incoming data should be compressed */ - bool compressing; - - /* The handler for flush requests */ - struct flusher *flusher; - - /* The state the vdo was in when loaded (primarily for unit tests) */ - enum vdo_state load_state; - - /* The logical zones of this vdo */ - struct logical_zones *logical_zones; - - /* The physical zones of this vdo */ - struct physical_zone **physical_zones; - - /* The hash lock zones of this vdo */ - struct hash_zone **hash_zones; - - /* The completion for administrative operations */ - struct admin_completion admin_completion; - - /* The administrative state of the vdo */ - struct admin_state admin_state; - - /* Whether a close is required */ - bool no_flush_suspend; - bool allocations_allowed; - bool dump_on_shutdown; - atomic_t processing_message; - - // Statistics - /* Atomic stats counters */ - struct atomic_statistics stats; - /* Used to gather statistics without allocating memory */ - struct vdo_statistics stats_buffer; - /* Protects the stats_buffer */ - struct mutex stats_mutex; - /* true if sysfs directory is set up */ - bool sysfs_added; - /* Used when shutting down the sysfs statistics */ - struct completion stats_shutdown; - - - /** A list of all device_configs referencing this vdo */ - struct list_head device_config_list; - - /** This VDO's list entry for the device registry */ - struct list_head registration; - - /** Underlying block device info. */ - uint64_t starting_sector_offset; - struct volume_geometry geometry; - - // For sysfs - struct kobject vdo_directory; - struct kobject stats_directory; - - /** Limit the number of requests that are being processed. */ - struct limiter request_limiter; - struct limiter discard_limiter; - - /** N blobs of context data for LZ4 code, one per CPU thread. */ - char **compression_context; -}; - -/** - * Set a vdo's active config. - * - * @param vdo The vdo in question - * @param config The config to set - **/ -static inline void -set_vdo_active_config(struct vdo *vdo, struct device_config *config) -{ - vdo->device_config = config; -} - -/** - * Indicate whether the vdonis configured to use a separate work queue for - * acknowledging received and processed bios. - * - * Note that this directly controls the handling of write operations, but the - * compile-time flag VDO_USE_BIO_ACK_QUEUE_FOR_READ is also checked for read - * operations. - * - * @param vdo The vdo - * - * @return Whether a bio-acknowledgement work queue is in use - **/ -static inline bool use_bio_ack_queue(struct vdo *vdo) -{ - return vdo->device_config->thread_counts.bio_ack_threads > 0; -} - - -/** - * Get the current state of the vdo. This method may be called from any thread. 
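
use_bio_ack_queue() above gates the dedicated acknowledgement queue on the configured bio_ack thread count, and its comment notes that read completions additionally consult the compile-time VDO_USE_BIO_ACK_QUEUE_FOR_READ flag from the deleted vdoCommon.h. A small sketch of how those two switches combine; the combining helper is hypothetical and only follows the behavior the comment describes:

::

    #include <stdbool.h>
    #include <stdio.h>

    enum { VDO_USE_BIO_ACK_QUEUE_FOR_READ = 0 };   /* from the deleted vdoCommon.h */

    static bool use_bio_ack_queue(int bio_ack_threads)
    {
        return bio_ack_threads > 0;
    }

    /* Writes use the ack queue whenever it exists; reads only if the
     * compile-time flag is also set. */
    static bool ack_on_dedicated_queue(int bio_ack_threads, bool is_read)
    {
        if (!use_bio_ack_queue(bio_ack_threads))
            return false;
        return is_read ? VDO_USE_BIO_ACK_QUEUE_FOR_READ : true;
    }

    int main(void)
    {
        printf("write with 1 ack thread: %d\n", ack_on_dedicated_queue(1, false));
        printf("read  with 1 ack thread: %d\n", ack_on_dedicated_queue(1, true));
        return 0;
    }
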
- * - * @param vdo The vdo - * - * @return the current state of the vdo - **/ -enum vdo_state __must_check get_vdo_state(const struct vdo *vdo); - -/** - * Set the current state of the vdo. This method may be called from any thread. - * - * @param vdo The vdo whose state is to be set - * @param state The new state of the vdo - **/ -void set_vdo_state(struct vdo *vdo, enum vdo_state state); - -/** - * Encode the vdo and save the super block asynchronously. All non-user mode - * super block savers should use this bottle neck instead of calling - * save_vdo_super_block() directly. - * - * @param vdo The vdo whose state is being saved - * @param parent The completion to notify when the save is complete - **/ -void save_vdo_components(struct vdo *vdo, struct vdo_completion *parent); - -/** - * Enable a vdo to enter read-only mode on errors. - * - * @param vdo The vdo to enable - * - * @return VDO_SUCCESS or an error - **/ -int enable_read_only_entry(struct vdo *vdo); - -/** - * Get the block map. - * - * @param vdo The vdo whose block map is desired - * - * @return the block map from the vdo - **/ -struct block_map * __must_check get_block_map(const struct vdo *vdo); - -/** - * Get the slab depot from a vdo. - * - * @param vdo The vdo whose slab depot is desired - * - * @return the slab depot from the vdo - **/ -struct slab_depot * __must_check get_slab_depot(struct vdo *vdo); - -/** - * Get the recovery journal from a vdo. - * - * @param vdo The vdo whose recovery journal is desired - * - * @return the recovery journal from the vdo - **/ -struct recovery_journal * __must_check get_recovery_journal(struct vdo *vdo); - -/** - * Check whether a vdo is in read-only mode. - * - * @param vdo The vdo to query - * - * @return true if the vdo is in read-only mode - **/ -bool __must_check in_read_only_mode(const struct vdo *vdo); - -/** - * Check whether the vdo requires a read-only mode rebuild. - * - * @param vdo The vdo to query - * - * @return true if the vdo requires a read-only rebuild - **/ -bool __must_check requires_read_only_rebuild(const struct vdo *vdo); - -/** - * Check whether a vdo requires rebuilding. - * - * @param vdo The vdo to query - * - * @return true if the vdo must be rebuilt - **/ -bool __must_check requires_rebuild(const struct vdo *vdo); - -/** - * Check whether a vdo should enter recovery mode. - * - * @param vdo The vdo to query - * - * @return true if the vdo requires recovery - **/ -bool __must_check requires_recovery(const struct vdo *vdo); - -/** - * Check whether a vdo was replaying the recovery journal into the block map - * when it crashed. - * - * @param vdo The vdo to query - * - * @return true if the vdo crashed while reconstructing the - * block map - **/ -bool __must_check is_replaying(const struct vdo *vdo); - -/** - * Check whether the vdo is in recovery mode. - * - * @param vdo The vdo to query - * - * @return true if the vdo is in recovery mode - **/ -bool __must_check in_recovery_mode(const struct vdo *vdo); - -/** - * Put the vdo into recovery mode - * - * @param vdo The vdo - **/ -void enter_recovery_mode(struct vdo *vdo); - -/** - * Assert that we are running on the admin thread. - * - * @param vdo The vdo - * @param name The name of the function which should be running on the admin - * thread (for logging). - **/ -void assert_on_admin_thread(const struct vdo *vdo, const char *name); - -/** - * Assert that this function was called on the specified logical zone thread. 
- * - * @param vdo The vdo - * @param logical_zone The number of the logical zone - * @param name The name of the calling function - **/ -void assert_on_logical_zone_thread(const struct vdo *vdo, - zone_count_t logical_zone, - const char *name); - -/** - * Assert that this function was called on the specified physical zone thread. - * - * @param vdo The vdo - * @param physical_zone The number of the physical zone - * @param name The name of the calling function - **/ -void assert_on_physical_zone_thread(const struct vdo *vdo, - zone_count_t physical_zone, - const char *name); - -/** - * Select the hash zone responsible for locking a given chunk name. - * - * @param vdo The vdo containing the hash zones - * @param name The chunk name - * - * @return The hash zone responsible for the chunk name - **/ -struct hash_zone * __must_check -select_hash_zone(const struct vdo *vdo, const struct uds_chunk_name *name); - -/** - * Get the physical zone responsible for a given physical block number of a - * data block in this vdo instance, or of the zero block (for which a NULL - * zone is returned). For any other block number that is not in the range of - * valid data block numbers in any slab, an error will be returned. This - * function is safe to call on invalid block numbers; it will not put the vdo - * into read-only mode. - * - * @param [in] vdo The vdo containing the physical zones - * @param [in] pbn The PBN of the data block - * @param [out] zone_ptr A pointer to return the physical zone - * - * @return VDO_SUCCESS or VDO_OUT_OF_RANGE if the block number is invalid - * or an error code for any other failure - **/ -int __must_check get_physical_zone(const struct vdo *vdo, - physical_block_number_t pbn, - struct physical_zone **zone_ptr); - -/**********************************************************************/ -// Asynchronous callback to share a duplicate block. This is only public so -// test code may compare it against the current callback in the completion. -void share_block(struct vdo_completion *completion); - -#endif /* VDO_INTERNAL_H */ diff --git a/vdo/vdoLayout.c b/vdo/vdoLayout.c deleted file mode 100644 index 5de7b82c..00000000 --- a/vdo/vdoLayout.c +++ /dev/null @@ -1,316 +0,0 @@ -/* - * Copyright Red Hat - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. 
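
select_hash_zone() above spreads hash locks across the configured hash zones based on the chunk name, but only its declaration is shown here. The sketch below picks a zone by taking the leading bytes of the name modulo the zone count purely as an illustration of a stable name-to-zone mapping; it is an assumption, not the VDO implementation, and the local struct definition is likewise a stand-in:

::

    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    #define CHUNK_NAME_SIZE 16

    struct uds_chunk_name { unsigned char name[CHUNK_NAME_SIZE]; };

    /* Any stable function of the name that distributes work evenly will do;
     * modulo of the leading bytes is just the simplest example. */
    static unsigned int select_zone(const struct uds_chunk_name *name,
                                    unsigned int zone_count)
    {
        uint32_t prefix;

        memcpy(&prefix, name->name, sizeof(prefix));
        return prefix % zone_count;
    }

    int main(void)
    {
        struct uds_chunk_name name = { .name = "0123456789abcde" };

        printf("zone %u of 3\n", select_zone(&name, 3));
        return 0;
    }
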
- * - * $Id: //eng/vdo-releases/sulfur/src/c++/vdo/base/vdoLayout.c#27 $ - */ - -#include "vdoLayout.h" -#include "vdoLayoutInternals.h" - -#include "logger.h" -#include "memoryAlloc.h" -#include "permassert.h" - -#include "blockMap.h" -#include "partitionCopy.h" -#include "slab.h" -#include "slabSummary.h" -#include "statusCodes.h" -#include "types.h" -#include "vdoInternal.h" - -static const enum partition_id REQUIRED_PARTITIONS[] = { - BLOCK_MAP_PARTITION, - BLOCK_ALLOCATOR_PARTITION, - RECOVERY_JOURNAL_PARTITION, - SLAB_SUMMARY_PARTITION, -}; - -static const uint8_t REQUIRED_PARTITION_COUNT = 4; - -/** - * Get the offset of a given partition. - * - * @param layout The layout containing the partition - * @param id The ID of the partition whose offset is desired - * - * @return The offset of the partition (in blocks) - **/ -static block_count_t __must_check -get_partition_offset(struct vdo_layout *layout, enum partition_id id) -{ - return get_vdo_fixed_layout_partition_offset(get_vdo_partition(layout, - id)); -} - -/**********************************************************************/ -int decode_vdo_layout(struct fixed_layout *layout, - struct vdo_layout **vdo_layout_ptr) -{ - // Check that all the expected partitions exist - struct vdo_layout *vdo_layout; - struct partition *partition; - uint8_t i; - int result; - for (i = 0; i < REQUIRED_PARTITION_COUNT; i++) { - result = vdo_get_partition(layout, REQUIRED_PARTITIONS[i], - &partition); - if (result != VDO_SUCCESS) { - return uds_log_error_strerror(result, - "VDO layout is missing required partition %u", - REQUIRED_PARTITIONS[i]); - } - } - - result = UDS_ALLOCATE(1, struct vdo_layout, __func__, &vdo_layout); - if (result != VDO_SUCCESS) { - return result; - } - - vdo_layout->layout = layout; - - // XXX Assert this is the same as where we loaded the super block. - vdo_layout->starting_offset = - get_partition_offset(vdo_layout, BLOCK_MAP_PARTITION); - - *vdo_layout_ptr = vdo_layout; - return VDO_SUCCESS; -} - -/**********************************************************************/ -void free_vdo_layout(struct vdo_layout *vdo_layout) -{ - if (vdo_layout == NULL) { - return; - } - - free_vdo_copy_completion(UDS_FORGET(vdo_layout->copy_completion)); - free_vdo_fixed_layout(UDS_FORGET(vdo_layout->next_layout)); - free_vdo_fixed_layout(UDS_FORGET(vdo_layout->layout)); - free_vdo_fixed_layout(UDS_FORGET(vdo_layout->previous_layout)); - UDS_FREE(vdo_layout); -} - -/** - * Get a partition from a fixed_layout in conditions where we expect that it can - * not fail. - * - * @param layout The fixed_layout from which to get the partition - * @param id The ID of the partition to retrieve - * - * @return The desired partition - **/ -static struct partition * __must_check -retrieve_partition(struct fixed_layout *layout, enum partition_id id) -{ - struct partition *partition; - int result = vdo_get_partition(layout, id, &partition); - ASSERT_LOG_ONLY(result == VDO_SUCCESS, - "vdo_layout has expected partition"); - return partition; -} - -/**********************************************************************/ -struct partition *get_vdo_partition(struct vdo_layout *vdo_layout, - enum partition_id id) -{ - return retrieve_partition(vdo_layout->layout, id); -} - -/** - * Get a partition from a vdo_layout's next fixed_layout. This method should - * only be called when the vdo_layout is prepared to grow. 
- * - * @param vdo_layout The vdo_layout from which to get the partition - * @param id The ID of the desired partition - * - * @return The requested partition - **/ -static struct partition * __must_check -get_partition_from_next_layout(struct vdo_layout *vdo_layout, - enum partition_id id) -{ - ASSERT_LOG_ONLY(vdo_layout->next_layout != NULL, - "vdo_layout is prepared to grow"); - return retrieve_partition(vdo_layout->next_layout, id); -} - -/** - * Get the size of a given partition. - * - * @param layout The layout containing the partition - * @param id The partition ID whose size to find - * - * @return The size of the partition (in blocks) - **/ -static block_count_t __must_check -get_partition_size(struct vdo_layout *layout, enum partition_id id) -{ - return get_vdo_fixed_layout_partition_size(get_vdo_partition(layout, id)); -} - -/**********************************************************************/ -int prepare_to_grow_vdo_layout(struct vdo_layout *vdo_layout, - block_count_t old_physical_blocks, - block_count_t new_physical_blocks, - struct vdo *vdo) -{ - int result; - struct partition *slab_summary_partition, *recovery_journal_partition; - block_count_t min_new_size; - - if (get_next_vdo_layout_size(vdo_layout) == new_physical_blocks) { - // We are already prepared to grow to the new size, so we're - // done. - return VDO_SUCCESS; - } - - // Make a copy completion if there isn't one - if (vdo_layout->copy_completion == NULL) { - int result = - make_vdo_copy_completion(vdo, - &vdo_layout->copy_completion); - if (result != VDO_SUCCESS) { - return result; - } - } - - // Free any unused preparation. - free_vdo_fixed_layout(UDS_FORGET(vdo_layout->next_layout)); - - // Make a new layout with the existing partition sizes for everything - // but the block allocator partition. - result = make_partitioned_vdo_fixed_layout(new_physical_blocks, - vdo_layout->starting_offset, - get_partition_size(vdo_layout, - BLOCK_MAP_PARTITION), - get_partition_size(vdo_layout, - RECOVERY_JOURNAL_PARTITION), - get_partition_size(vdo_layout, - SLAB_SUMMARY_PARTITION), - &vdo_layout->next_layout); - if (result != VDO_SUCCESS) { - free_vdo_copy_completion(UDS_FORGET(vdo_layout->copy_completion)); - return result; - } - - // Ensure the new journal and summary are entirely within the added - // blocks. - slab_summary_partition = - get_partition_from_next_layout(vdo_layout, - SLAB_SUMMARY_PARTITION); - recovery_journal_partition = - get_partition_from_next_layout(vdo_layout, - RECOVERY_JOURNAL_PARTITION); - min_new_size = - (old_physical_blocks + - get_vdo_fixed_layout_partition_size(slab_summary_partition) + - get_vdo_fixed_layout_partition_size(recovery_journal_partition)); - if (min_new_size > new_physical_blocks) { - // Copying the journal and summary would destroy some old - // metadata. - free_vdo_fixed_layout(UDS_FORGET(vdo_layout->next_layout)); - free_vdo_copy_completion(UDS_FORGET(vdo_layout->copy_completion)); - return VDO_INCREMENT_TOO_SMALL; - } - - return VDO_SUCCESS; -} - -/** - * Get the size of a VDO from the specified fixed_layout and the - * starting offset thereof. 
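
prepare_to_grow_vdo_layout() above refuses a growth that could not hold the relocated recovery journal and slab summary entirely within the added space: the requested physical size must be at least the old size plus both partition sizes. A standalone sketch of that check, using arbitrary example block counts rather than real VDO defaults:

::

    #include <stdio.h>

    int main(void)
    {
        unsigned long old_physical = 262144;  /* current size, in 4k blocks */
        unsigned long journal      = 32768;   /* recovery journal partition */
        unsigned long summary      = 64;      /* slab summary partition     */
        unsigned long min_new_size = old_physical + journal + summary;

        unsigned long requests[] = { 262208, 294976 };
        for (int i = 0; i < 2; i++)
            printf("grow to %lu blocks: %s\n", requests[i],
                   requests[i] < min_new_size
                   ? "increment too small" : "acceptable");
        return 0;
    }
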
- * - * @param layout The fixed layout whose size to use - * @param starting_offset The starting offset of the layout - * - * @return The total size of a VDO (in blocks) with the given layout - **/ -static block_count_t __must_check -get_vdo_size(struct fixed_layout *layout, block_count_t starting_offset) -{ - // The fixed_layout does not include the super block or any earlier - // metadata; all that is captured in the vdo_layout's starting offset - return get_total_vdo_fixed_layout_size(layout) + starting_offset; -} - -/**********************************************************************/ -block_count_t get_next_vdo_layout_size(struct vdo_layout *vdo_layout) -{ - return ((vdo_layout->next_layout == NULL) ? - 0 : - get_vdo_size(vdo_layout->next_layout, - vdo_layout->starting_offset)); -} - -/**********************************************************************/ -block_count_t -vdo_get_next_block_allocator_partition_size(struct vdo_layout *vdo_layout) -{ - struct partition *partition; - if (vdo_layout->next_layout == NULL) { - return 0; - } - - partition = get_partition_from_next_layout(vdo_layout, - BLOCK_ALLOCATOR_PARTITION); - return get_vdo_fixed_layout_partition_size(partition); -} - -/**********************************************************************/ -block_count_t grow_vdo_layout(struct vdo_layout *vdo_layout) -{ - ASSERT_LOG_ONLY(vdo_layout->next_layout != NULL, - "VDO prepared to grow physical"); - vdo_layout->previous_layout = vdo_layout->layout; - vdo_layout->layout = vdo_layout->next_layout; - vdo_layout->next_layout = NULL; - - return get_vdo_size(vdo_layout->layout, vdo_layout->starting_offset); -} - -/**********************************************************************/ -void finish_vdo_layout_growth(struct vdo_layout *vdo_layout) -{ - if (vdo_layout->layout != vdo_layout->previous_layout) { - free_vdo_fixed_layout(UDS_FORGET(vdo_layout->previous_layout)); - } - - if (vdo_layout->layout != vdo_layout->next_layout) { - free_vdo_fixed_layout(UDS_FORGET(vdo_layout->next_layout)); - } - - free_vdo_copy_completion(UDS_FORGET(vdo_layout->copy_completion)); -} - -/**********************************************************************/ -void copy_vdo_layout_partition(struct vdo_layout *layout, - enum partition_id id, - struct vdo_completion *parent) -{ - copy_vdo_partition(layout->copy_completion, - get_vdo_partition(layout, id), - get_partition_from_next_layout(layout, id), - parent); -} - -/**********************************************************************/ -struct fixed_layout *get_vdo_fixed_layout(const struct vdo_layout *vdo_layout) -{ - return vdo_layout->layout; -} diff --git a/vdo/vdoLayout.h b/vdo/vdoLayout.h deleted file mode 100644 index f8c94a01..00000000 --- a/vdo/vdoLayout.h +++ /dev/null @@ -1,146 +0,0 @@ -/* - * Copyright Red Hat - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. 
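
The vdoLayout functions in this hunk are driven in a fixed order during a grow-physical operation: prepare a next layout, copy the partitions that move while the old layout is still live, swap the layouts, then free whatever was left over. The trace below only sketches that ordering, on the assumption (suggested by the copy helper's description) that the copies happen between preparation and the layout swap; the stubs are not the VDO functions:

::

    #include <stdio.h>

    static int  prepare_to_grow(void) { puts("prepare: build next_layout");       return 0; }
    static void copy_partitions(void) { puts("copy: journal and slab summary");   }
    static void grow(void)            { puts("grow: next_layout becomes layout"); }
    static void finish_growth(void)   { puts("finish: free previous/next layouts"); }

    int main(void)
    {
        if (prepare_to_grow() != 0)
            return 1;          /* preparation failed; nothing became live */
        copy_partitions();     /* done while the old layout is still current */
        grow();
        finish_growth();
        return 0;
    }
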
- * - * $Id: //eng/vdo-releases/sulfur/src/c++/vdo/base/vdoLayout.h#15 $ - */ - -/** - * vdo_layout is an object which manages the layout of a VDO. It wraps - * fixed_layout, but includes the knowledge of exactly which partitions a VDO - * is expected to have. Because of this knowledge, the vdo_layout validates - * the fixed_layout encoded in the super block at load time, obviating the - * need for subsequent error checking when other modules need to get - * partitions from the layout. - * - * The vdo_layout also manages the preparation and growth of the layout for - * grow physical operations. - **/ - -#ifndef VDO_LAYOUT_H -#define VDO_LAYOUT_H - -#include "fixedLayout.h" -#include "types.h" - -/** - * Make a vdo_layout from the fixed_layout decoded from the super block. - * - * @param [in] layout The fixed_layout from the super block - * @param [out] vdo_layout_ptr A pointer to hold the vdo_layout - * - * @return VDO_SUCCESS or an error - **/ -int __must_check decode_vdo_layout(struct fixed_layout *layout, - struct vdo_layout **vdo_layout_ptr); - -/** - * Free a vdo_layout. - * - * @param vdo_layout The vdo_layout to free - **/ -void free_vdo_layout(struct vdo_layout *vdo_layout); - -/** - * Get a partition from a vdo_layout. Because the layout's fixed_layout has - * already been validated, this can not fail. - * - * @param vdo_layout The vdo_layout from which to get the partition - * @param id The ID of the desired partition - * - * @return The requested partition - **/ -struct partition * __must_check -get_vdo_partition(struct vdo_layout *vdo_layout, enum partition_id id); - -/** - * Prepare the layout to be grown. - * - * @param vdo_layout The layout to grow - * @param old_physical_blocks The current size of the VDO - * @param new_physical_blocks The size to which the VDO will be grown - * @param vdo The VDO being grown - * - * @return VDO_SUCCESS or an error code - **/ -int __must_check -prepare_to_grow_vdo_layout(struct vdo_layout *vdo_layout, - block_count_t old_physical_blocks, - block_count_t new_physical_blocks, - struct vdo *vdo); - -/** - * Get the size of the next layout. - * - * @param vdo_layout The layout to check - * - * @return The size which was specified when the layout was prepared for growth - * or 0 if the layout is not prepared to grow - **/ -block_count_t __must_check -get_next_vdo_layout_size(struct vdo_layout *vdo_layout); - -/** - * Get the size of the next block allocator partition. - * - * @param vdo_layout The vdo_layout which has been prepared to grow - * - * @return The size of the block allocator partition in the next layout or 0 - * if the layout is not prepared to grow - **/ -block_count_t __must_check -vdo_get_next_block_allocator_partition_size(struct vdo_layout *vdo_layout); - -/** - * Grow the layout by swapping in the prepared layout. - * - * @param vdo_layout The layout to grow - * - * @return The new size of the VDO - **/ -block_count_t __must_check grow_vdo_layout(struct vdo_layout *vdo_layout); - -/** - * Clean up any unused resources once an attempt to grow has completed. - * - * @param vdo_layout The layout - **/ -void finish_vdo_layout_growth(struct vdo_layout *vdo_layout); - -/** - * Copy a partition from the location specified in the current layout to that in - * the next layout. 
- * - * @param layout The vdo_layout which is prepared to grow - * @param id The ID of the partition to copy - * @param parent The completion to notify when the copy is complete - **/ -void copy_vdo_layout_partition(struct vdo_layout *layout, - enum partition_id id, - struct vdo_completion *parent); - -/** - * Get the current fixed layout of the vdo. - * - * @param vdo_layout The layout - * - * @return The layout's current fixed layout - **/ -struct fixed_layout * __must_check -get_vdo_fixed_layout(const struct vdo_layout *vdo_layout); - -#endif // VDO_LAYOUT_H diff --git a/vdo/vdoLayoutInternals.h b/vdo/vdoLayoutInternals.h deleted file mode 100644 index c219840f..00000000 --- a/vdo/vdoLayoutInternals.h +++ /dev/null @@ -1,41 +0,0 @@ -/* - * Copyright Red Hat - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/sulfur/src/c++/vdo/base/vdoLayoutInternals.h#2 $ - */ - -#ifndef VDO_LAYOUT_INTERNALS_H -#define VDO_LAYOUT_INTERNALS_H - -#include "fixedLayout.h" -#include "types.h" - -struct vdo_layout { - // The current layout of the VDO - struct fixed_layout *layout; - // The next layout of the VDO - struct fixed_layout *next_layout; - // The previous layout of the VDO - struct fixed_layout *previous_layout; - // The first block in the layouts - physical_block_number_t starting_offset; - // A pointer to the copy completion (if there is one) - struct vdo_completion *copy_completion; -}; - -#endif // VDO_LAYOUT_INTERNALS_H diff --git a/vdo/vdoLoad.c b/vdo/vdoLoad.c deleted file mode 100644 index 2ac72cbb..00000000 --- a/vdo/vdoLoad.c +++ /dev/null @@ -1,547 +0,0 @@ -/* - * Copyright Red Hat - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. 
- * - * $Id: //eng/vdo-releases/sulfur/src/c++/vdo/base/vdoLoad.c#59 $ - */ - -#include "vdoLoad.h" - -#include "logger.h" -#include "memoryAlloc.h" - -#include "adminCompletion.h" -#include "blockMap.h" -#include "completion.h" -#include "constants.h" -#include "dedupeIndex.h" -#include "deviceConfig.h" -#include "hashZone.h" -#include "header.h" -#include "logicalZone.h" -#include "physicalZone.h" -#include "poolSysfs.h" -#include "readOnlyRebuild.h" -#include "recoveryJournal.h" -#include "releaseVersions.h" -#include "slabDepot.h" -#include "slabSummary.h" -#include "superBlockCodec.h" -#include "threadConfig.h" -#include "types.h" -#include "vdoInternal.h" -#include "vdoRecovery.h" - -enum { - LOAD_PHASE_START = 0, - LOAD_PHASE_STATS, - LOAD_PHASE_LOAD_DEPOT, - LOAD_PHASE_MAKE_DIRTY, - LOAD_PHASE_PREPARE_TO_ALLOCATE, - LOAD_PHASE_SCRUB_SLABS, - LOAD_PHASE_DATA_REDUCTION, - LOAD_PHASE_FINISHED, - LOAD_PHASE_DRAIN_JOURNAL, - LOAD_PHASE_WAIT_FOR_READ_ONLY, -}; - -static const char *LOAD_PHASE_NAMES[] = { - "LOAD_PHASE_START", - "LOAD_PHASE_STATS", - "LOAD_PHASE_LOAD_DEPOT", - "LOAD_PHASE_MAKE_DIRTY", - "LOAD_PHASE_PREPARE_TO_ALLOCATE", - "LOAD_PHASE_SCRUB_SLABS", - "LOAD_PHASE_DATA_REDUCTION", - "LOAD_PHASE_FINISHED", - "LOAD_PHASE_DRAIN_JOURNAL", - "LOAD_PHASE_WAIT_FOR_READ_ONLY", -}; - - -/** - * Implements vdo_thread_id_getter_for_phase. - **/ -static thread_id_t __must_check -get_thread_id_for_phase(struct admin_completion *admin_completion) -{ - const struct thread_config *thread_config = - get_vdo_thread_config(admin_completion->vdo); - switch (admin_completion->phase) { - case LOAD_PHASE_DRAIN_JOURNAL: - return thread_config->journal_thread; - - default: - return thread_config->admin_thread; - } -} - -/** - * Extract the vdo from an admin_completion, checking that the current - * operation is a load. - * - * @param completion The admin_completion's sub-task completion - * - * @return The vdo - **/ -static inline struct vdo * -vdo_from_load_sub_task(struct vdo_completion *completion) -{ - return vdo_from_admin_sub_task(completion, VDO_ADMIN_OPERATION_LOAD); -} - -/** - * Determine how the slab depot was loaded. - * - * @param vdo The vdo - * - * @return How the depot was loaded - **/ -static enum slab_depot_load_type get_load_type(struct vdo *vdo) -{ - if (requires_read_only_rebuild(vdo)) { - return VDO_SLAB_DEPOT_REBUILD_LOAD; - } - - if (requires_recovery(vdo)) { - return VDO_SLAB_DEPOT_RECOVERY_LOAD; - } - - return VDO_SLAB_DEPOT_NORMAL_LOAD; -} - -/** - * Initialize the vdo sysfs directory. - * - * @param vdo The vdo being initialized - * - * @return VDO_SUCCESS or an error code - **/ -static int initialize_vdo_kobjects(struct vdo *vdo) -{ - int result; - struct dm_target *target = vdo->device_config->owning_target; - struct mapped_device *md = dm_table_get_md(target->table); - - kobject_init(&vdo->vdo_directory, &vdo_directory_type); - vdo->sysfs_added = true; - result = kobject_add(&vdo->vdo_directory, - &disk_to_dev(dm_disk(md))->kobj, - "vdo"); - if (result != 0) { - return VDO_CANT_ADD_SYSFS_NODE; - } - - result = add_vdo_dedupe_index_sysfs(vdo->dedupe_index, - &vdo->vdo_directory); - if (result != 0) { - return VDO_CANT_ADD_SYSFS_NODE; - } - - return add_vdo_sysfs_stats_dir(vdo); -} - -/** - * Callback to do the destructive parts of loading a VDO. 
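
load_callback() steps a phase counter through the LOAD_PHASE_* values and uses LOAD_PHASE_NAMES for the per-phase assertions and logging, so the enum and the name table above must stay in step. A small self-contained sketch of that idiom, with a compile-time length check added purely as an illustration; the phases shown are a trimmed, hypothetical subset:

::

    #include <stdio.h>

    enum {
        PHASE_START = 0,
        PHASE_LOAD_DEPOT,
        PHASE_FINISHED,
        PHASE_COUNT,
    };

    static const char *PHASE_NAMES[] = {
        "PHASE_START",
        "PHASE_LOAD_DEPOT",
        "PHASE_FINISHED",
    };

    /* Catch a name table that has drifted out of step with the enum. */
    _Static_assert(sizeof(PHASE_NAMES) / sizeof(PHASE_NAMES[0]) == PHASE_COUNT,
                   "phase names match phase enum");

    int main(void)
    {
        for (int phase = 0; phase < PHASE_COUNT; phase++)
            printf("%d -> %s\n", phase, PHASE_NAMES[phase]);
        return 0;
    }
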
- * - * @param completion The sub-task completion - **/ -static void load_callback(struct vdo_completion *completion) -{ - struct admin_completion *admin_completion = - vdo_admin_completion_from_sub_task(completion); - struct vdo *vdo = vdo_from_load_sub_task(completion); - assert_vdo_admin_operation_type(admin_completion, - VDO_ADMIN_OPERATION_LOAD); - assert_vdo_admin_phase_thread(admin_completion, - __func__, - LOAD_PHASE_NAMES); - - switch (admin_completion->phase++) { - case LOAD_PHASE_START: - if (!start_vdo_operation_with_waiter(&vdo->admin_state, - VDO_ADMIN_STATE_LOADING, - &admin_completion->completion, - NULL)) { - return; - } - - // Prepare the recovery journal for new entries. - open_vdo_recovery_journal(vdo->recovery_journal, - vdo->depot, - vdo->block_map); - vdo_allow_read_only_mode_entry(vdo->read_only_notifier, - reset_vdo_admin_sub_task(completion)); - return; - - case LOAD_PHASE_STATS: - finish_vdo_completion(reset_vdo_admin_sub_task(completion), - initialize_vdo_kobjects(vdo)); - return; - - case LOAD_PHASE_LOAD_DEPOT: - if (vdo_is_read_only(vdo->read_only_notifier)) { - /* - * In read-only mode we don't use the allocator and it - * may not even be readable, so don't bother trying to - * load it. - */ - set_vdo_operation_result(&vdo->admin_state, - VDO_READ_ONLY); - break; - } - - reset_vdo_admin_sub_task(completion); - if (requires_read_only_rebuild(vdo)) { - launch_vdo_rebuild(vdo, completion); - return; - } - - if (requires_rebuild(vdo)) { - vdo_launch_recovery(vdo, completion); - return; - } - - load_vdo_slab_depot(vdo->depot, - (vdo_was_new(vdo) - ? VDO_ADMIN_STATE_FORMATTING - : VDO_ADMIN_STATE_LOADING), - completion, - NULL); - return; - - case LOAD_PHASE_MAKE_DIRTY: - set_vdo_state(vdo, VDO_DIRTY); - save_vdo_components(vdo, reset_vdo_admin_sub_task(completion)); - return; - - case LOAD_PHASE_PREPARE_TO_ALLOCATE: - initialize_vdo_block_map_from_journal(vdo->block_map, - vdo->recovery_journal); - prepare_vdo_slab_depot_to_allocate(vdo->depot, - get_load_type(vdo), - reset_vdo_admin_sub_task(completion)); - return; - - case LOAD_PHASE_SCRUB_SLABS: - if (requires_recovery(vdo)) { - enter_recovery_mode(vdo); - } - - vdo_scrub_all_unrecovered_slabs(vdo->depot, - reset_vdo_admin_sub_task(completion)); - return; - - case LOAD_PHASE_DATA_REDUCTION: - WRITE_ONCE(vdo->compressing, vdo->device_config->compression); - fallthrough; - - case LOAD_PHASE_FINISHED: - break; - - case LOAD_PHASE_DRAIN_JOURNAL: - drain_vdo_recovery_journal(vdo->recovery_journal, - VDO_ADMIN_STATE_SAVING, - reset_vdo_admin_sub_task(completion)); - return; - - case LOAD_PHASE_WAIT_FOR_READ_ONLY: - admin_completion->phase = LOAD_PHASE_FINISHED; - reset_vdo_admin_sub_task(completion); - vdo_wait_until_not_entering_read_only_mode(vdo->read_only_notifier, - completion); - return; - - default: - set_vdo_completion_result(reset_vdo_admin_sub_task(completion), - UDS_BAD_STATE); - } - - finish_vdo_operation(&vdo->admin_state, completion->result); -} - -/** - * Handle an error during the load operation. If at all possible, bring the vdo - * online in read-only mode. This handler is registered in load_vdo(). 
- * - * @param completion The sub-task completion - **/ -static void handle_load_error(struct vdo_completion *completion) -{ - struct admin_completion *admin_completion = - vdo_admin_completion_from_sub_task(completion); - struct vdo *vdo = vdo_from_load_sub_task(completion); - assert_vdo_admin_operation_type(admin_completion, - VDO_ADMIN_OPERATION_LOAD); - - if (requires_read_only_rebuild(vdo) - && (admin_completion->phase == LOAD_PHASE_MAKE_DIRTY)) { - uds_log_error_strerror(completion->result, "aborting load"); - - // Preserve the error. - set_vdo_operation_result(&vdo->admin_state, - completion->result); - admin_completion->phase = LOAD_PHASE_DRAIN_JOURNAL; - load_callback(UDS_FORGET(completion)); - return; - } - - uds_log_error_strerror(completion->result, - "Entering read-only mode due to load error"); - admin_completion->phase = LOAD_PHASE_WAIT_FOR_READ_ONLY; - vdo_enter_read_only_mode(vdo->read_only_notifier, completion->result); - set_vdo_operation_result(&vdo->admin_state, VDO_READ_ONLY); - load_callback(completion); -} - -/**********************************************************************/ -int load_vdo(struct vdo *vdo) -{ - return perform_vdo_admin_operation(vdo, - VDO_ADMIN_OPERATION_LOAD, - get_thread_id_for_phase, - load_callback, - handle_load_error); -} - -/** - * Decode the VDO state from the super block and validate that it is correct. - * On error from this method, the component states must be destroyed - * explicitly. If this method returns successfully, the component states must - * not be destroyed. - * - * @param vdo The vdo being loaded - * - * @return VDO_SUCCESS or an error - **/ -static int __must_check decode_from_super_block(struct vdo *vdo) -{ - block_count_t block_count; - struct super_block_codec *codec - = get_vdo_super_block_codec(vdo->super_block); - int result = decode_vdo_component_states(codec->component_buffer, - vdo->geometry.release_version, - &vdo->states); - if (result != VDO_SUCCESS) { - return result; - } - - set_vdo_state(vdo, vdo->states.vdo.state); - vdo->load_state = vdo->states.vdo.state; - - block_count = vdo->device_config->physical_blocks; - result = validate_vdo_component_states(&vdo->states, - vdo->geometry.nonce, - block_count); - if (result != VDO_SUCCESS) { - return result; - } - - return decode_vdo_layout(vdo->states.layout, &vdo->layout); -} - -/** - * Decode the component data portion of a super block and fill in the - * corresponding portions of the vdo being loaded. This will also allocate the - * recovery journal and slab depot. If this method is called with an - * asynchronous layer (i.e. a thread config which specifies at least one base - * thread), the block map and packer will be constructed as well. 
- * - * @param vdo The vdo being loaded - * - * @return VDO_SUCCESS or an error - **/ -static int __must_check decode_vdo(struct vdo *vdo) -{ - block_count_t maximum_age, journal_length; - const struct thread_config *thread_config = get_vdo_thread_config(vdo); - zone_count_t zone; - int result = decode_from_super_block(vdo); - if (result != VDO_SUCCESS) { - destroy_vdo_component_states(&vdo->states); - return result; - } - - maximum_age = get_vdo_configured_block_map_maximum_age(vdo); - journal_length = - get_vdo_recovery_journal_length(vdo->states.vdo.config.recovery_journal_size); - if ((maximum_age > (journal_length / 2)) || (maximum_age < 1)) { - return VDO_BAD_CONFIGURATION; - } - - result = make_vdo_read_only_notifier(in_read_only_mode(vdo), - thread_config, - vdo, - &vdo->read_only_notifier); - if (result != VDO_SUCCESS) { - return result; - } - - result = enable_read_only_entry(vdo); - if (result != VDO_SUCCESS) { - return result; - } - - result = decode_vdo_recovery_journal(vdo->states.recovery_journal, - vdo->states.vdo.nonce, - vdo, - get_vdo_partition(vdo->layout, - RECOVERY_JOURNAL_PARTITION), - vdo->states.vdo.complete_recoveries, - vdo->states.vdo.config.recovery_journal_size, - VDO_RECOVERY_JOURNAL_TAIL_BUFFER_SIZE, - vdo->read_only_notifier, - thread_config, - &vdo->recovery_journal); - if (result != VDO_SUCCESS) { - return result; - } - - result = decode_vdo_slab_depot(vdo->states.slab_depot, - vdo, - get_vdo_partition(vdo->layout, - SLAB_SUMMARY_PARTITION), - &vdo->depot); - if (result != VDO_SUCCESS) { - return result; - } - - result = decode_vdo_block_map(vdo->states.block_map, - vdo->states.vdo.config.logical_blocks, - thread_config, - vdo, - vdo->read_only_notifier, - vdo->recovery_journal, - vdo->states.vdo.nonce, - get_vdo_configured_cache_size(vdo), - maximum_age, - &vdo->block_map); - if (result != VDO_SUCCESS) { - return result; - } - - result = make_vdo_flusher(vdo); - if (result != VDO_SUCCESS) { - return result; - } - - result = UDS_ALLOCATE(thread_config->hash_zone_count, - struct hash_zone *, - __func__, - &vdo->hash_zones); - if (result != VDO_SUCCESS) { - return result; - } - - for (zone = 0; zone < thread_config->hash_zone_count; zone++) { - result = make_vdo_hash_zone(vdo, zone, &vdo->hash_zones[zone]); - if (result != VDO_SUCCESS) { - return result; - } - } - - result = make_vdo_logical_zones(vdo, &vdo->logical_zones); - if (result != VDO_SUCCESS) { - return result; - } - - result = UDS_ALLOCATE(thread_config->physical_zone_count, - struct physical_zone *, - __func__, - &vdo->physical_zones); - if (result != VDO_SUCCESS) { - return result; - } - - for (zone = 0; zone < thread_config->physical_zone_count; zone++) { - result = make_vdo_physical_zone(vdo, zone, - &vdo->physical_zones[zone]); - if (result != VDO_SUCCESS) { - return result; - } - } - - return make_vdo_packer(vdo, - DEFAULT_PACKER_INPUT_BINS, - DEFAULT_PACKER_OUTPUT_BINS, - &vdo->packer); -} - -/** - * Callback to finish the load operation. - * - * @param completion The admin_completion's sub-task completion - **/ -static void finish_operation_callback(struct vdo_completion *completion) -{ - struct vdo *vdo = vdo_from_load_sub_task(completion); - finish_vdo_operation(&vdo->admin_state, completion->result); -} - -/** - * Load the components of a VDO. This is the super block load callback - * set by load_callback(). 
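
decode_vdo() above rejects a configuration whose block map maximum age is less than 1 or more than half the recovery journal length, returning a bad-configuration error before any further components are built. A standalone sketch of that guard; the numbers are arbitrary examples and the error value is a stand-in for VDO_BAD_CONFIGURATION:

::

    #include <stdio.h>

    static int check_maximum_age(unsigned long maximum_age,
                                 unsigned long journal_length)
    {
        if (maximum_age < 1 || maximum_age > journal_length / 2)
            return -1;    /* stand-in for VDO_BAD_CONFIGURATION */
        return 0;
    }

    int main(void)
    {
        printf("age 16380, journal 32768: %s\n",
               check_maximum_age(16380, 32768) == 0 ? "ok" : "rejected");
        printf("age 16385, journal 32768: %s\n",
               check_maximum_age(16385, 32768) == 0 ? "ok" : "rejected");
        return 0;
    }
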
- * - * @param completion The sub-task completion - **/ -static void load_vdo_components(struct vdo_completion *completion) -{ - struct vdo *vdo = vdo_from_load_sub_task(completion); - prepare_vdo_admin_sub_task(vdo, - finish_operation_callback, - finish_operation_callback); - finish_vdo_completion(completion, decode_vdo(vdo)); -} - -/** - * Callback to initiate a pre-load, registered in prepare_to_load_vdo(). - * - * @param completion The sub-task completion - **/ -static void pre_load_callback(struct vdo_completion *completion) -{ - struct admin_completion *admin_completion = - vdo_admin_completion_from_sub_task(completion); - struct vdo *vdo = vdo_from_load_sub_task(completion); - - ASSERT_LOG_ONLY((admin_completion->type == VDO_ADMIN_OPERATION_LOAD), - "unexpected admin operation type %u when preloading", - admin_completion->type); - assert_on_admin_thread(vdo, __func__); - if (!start_vdo_operation_with_waiter(&vdo->admin_state, - VDO_ADMIN_STATE_PRE_LOADING, - &admin_completion->completion, - NULL)) { - return; - } - - prepare_vdo_admin_sub_task(vdo, - load_vdo_components, - finish_operation_callback); - load_vdo_super_block(vdo, completion, get_vdo_first_block_offset(vdo), - &vdo->super_block); -} - -/**********************************************************************/ -int prepare_to_load_vdo(struct vdo *vdo) -{ - return perform_vdo_admin_operation(vdo, - VDO_ADMIN_OPERATION_LOAD, - NULL, - pre_load_callback, - pre_load_callback); -} diff --git a/vdo/vdoLoad.h b/vdo/vdoLoad.h deleted file mode 100644 index a277b133..00000000 --- a/vdo/vdoLoad.h +++ /dev/null @@ -1,46 +0,0 @@ -/* - * Copyright Red Hat - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/sulfur/src/c++/vdo/base/vdoLoad.h#8 $ - */ - -#ifndef VDO_LOAD_H -#define VDO_LOAD_H - -#include "types.h" - -/** - * Load a vdo for normal operation. This method must not be called from a base - * thread. - * - * @param vdo The vdo to load - * - * @return VDO_SUCCESS or an error - **/ -int __must_check load_vdo(struct vdo *vdo); - -/** - * Perpare a vdo for loading by reading structures off disk. This method does - * not alter the on-disk state. It should be called from the vdo constructor, - * whereas perform_vdo_load() will be called during pre-resume if the vdo has - * not been resumed before. - **/ -int __must_check -prepare_to_load_vdo(struct vdo *vdo); - -#endif /* VDO_LOAD_H */ diff --git a/vdo/vdoPageCache.h b/vdo/vdoPageCache.h deleted file mode 100644 index c4366d73..00000000 --- a/vdo/vdoPageCache.h +++ /dev/null @@ -1,334 +0,0 @@ -/* - * Copyright Red Hat - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. 
- * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/sulfur/src/c++/vdo/base/vdoPageCache.h#9 $ - */ - -#ifndef VDO_PAGE_CACHE_H -#define VDO_PAGE_CACHE_H - -#include "adminState.h" -#include "completion.h" -#include "statistics.h" -#include "types.h" -#include "waitQueue.h" - -/** - * Structure describing page meta data (defined internally). - **/ -struct page_info; - -/** - * Structure describing entire page cache. - **/ -struct vdo_page_cache; - -/** - * Generation counter for page references. - **/ -typedef uint32_t vdo_page_generation; - -/** - * Signature for a function to call when a page is read into the cache. - * - *
If specified, this function is called when a page is fetched from disk. - * - * @param raw_page The raw memory of the freshly-fetched page - * @param pbn The absolute physical block number of the page - * @param zone The block map zone to which the cache belongs - * @param page_context A pointer to client-specific data for the new page - * - * @return VDO_SUCCESS on success or VDO_BAD_PAGE if the page is incorrectly - * formatted - **/ -typedef int vdo_page_read_function(void *raw_page, - physical_block_number_t pbn, - struct block_map_zone *zone, - void *page_context); - -/** - * Signature for a function to call when a page is written from the cache. - * - *
    If specified, this function is called when a page is written to disk. - * - * @param raw_page The raw memory of the freshly-written page - * @param zone The block map zone to which the cache belongs - * @param page_context A pointer to client-specific data for the new page - * - * @return whether the page needs to be rewritten - **/ -typedef bool vdo_page_write_function(void *raw_page, - struct block_map_zone *zone, - void *page_context); - -/** - * Construct a page cache. - * - * @param [in] vdo The vdo - * @param [in] page_count The number of cache pages to hold - * @param [in] read_hook The function to be called when a page is - * read into the cache - * @param [in] write_hook The function to be called after a page is - * written from the cache - * @param [in] page_context_size The size of the per-page context that will - * be passed to the read and write hooks - * @param [in] maximum_age The number of journal blocks before a - * dirtied page is considered old and must be - * written out - * @param [in] zone The block map zone which owns this cache - * @param [out] cache_ptr A pointer to hold the cache - * - * @return a success or error code - **/ -int __must_check make_vdo_page_cache(struct vdo *vdo, - page_count_t page_count, - vdo_page_read_function *read_hook, - vdo_page_write_function *write_hook, - size_t page_context_size, - block_count_t maximum_age, - struct block_map_zone *zone, - struct vdo_page_cache **cache_ptr); - -/** - * Free the page cache structure. - * - * @param cache The cache to free - **/ -void free_vdo_page_cache(struct vdo_page_cache *cache); - -/** - * Set the initial dirty period for a page cache. - * - * @param cache The cache - * @param period The initial dirty period to set - **/ -void set_vdo_page_cache_initial_period(struct vdo_page_cache *cache, - sequence_number_t period); - -/** - * Switch the page cache into or out of read-only rebuild mode. - * - * @param cache The cache - * @param rebuilding true if the cache should be put into - * read-only rebuild mode, false otherwise - **/ -void set_vdo_page_cache_rebuild_mode(struct vdo_page_cache *cache, - bool rebuilding); - -/** - * Check whether a page cache is active (i.e. has any active lookups, - * outstanding I/O, or pending I/O). - * - * @param cache The cache to check - * - * @return true if the cache is active - **/ -bool __must_check is_vdo_page_cache_active(struct vdo_page_cache *cache); - -/** - * Advance the dirty period for a page cache. - * - * @param cache The cache to advance - * @param period The new dirty period - **/ -void advance_vdo_page_cache_period(struct vdo_page_cache *cache, - sequence_number_t period); - -/** - * Write one or more batches of dirty pages. - * - * All writable pages in the ancient era and some number in the old era - * are scheduled for writing. - * - * @param cache the VDO page cache - * @param batches how many batches to write now - * @param total how many batches (including those being written now) remain - * in this era - **/ -void write_vdo_page_cache_pages(struct vdo_page_cache *cache, - size_t batches, - size_t total); - -/** - * Rotate the dirty page eras. - * - * Move all pages in the old era to the ancient era and then move - * the current era bin into the old era. - * - * @param cache the VDO page cache - **/ -void rotate_vdo_page_cache_eras(struct vdo_page_cache *cache); - -// ASYNC - -/** - * A completion awaiting a specific page. Also a live reference into the - * page once completed, until freed. 
- **/ -struct vdo_page_completion { - /** The generic completion */ - struct vdo_completion completion; - /** The cache involved */ - struct vdo_page_cache *cache; - /** The waiter for the pending list */ - struct waiter waiter; - /** The absolute physical block number of the page on disk */ - physical_block_number_t pbn; - /** Whether the page may be modified */ - bool writable; - /** Whether the page is available */ - bool ready; - /** The info structure for the page, only valid when ready */ - struct page_info *info; -}; - -/** - * Initialize a VDO Page Completion, requesting a particular page from the - * cache. - * - * @param page_completion The vdo_page_completion to initialize - * @param cache The VDO page cache - * @param pbn The absolute physical block of the desired page - * @param writable Whether the page can be modified - * @param parent The parent object - * @param callback The completion callback - * @param error_handler The handler for page errors - * - * @note Once a completion has occurred for the get_vdo_page() operation, - * the underlying page shall be busy (stuck in memory) until the - * vdo_completion returned by this operation has been released. - **/ -void init_vdo_page_completion(struct vdo_page_completion *page_completion, - struct vdo_page_cache *cache, - physical_block_number_t pbn, - bool writable, - void *parent, - vdo_action *callback, - vdo_action *error_handler); - -/** - * Release a VDO Page Completion. - * - * The page referenced by this completion (if any) will no longer be - * held busy by this completion. If a page becomes discardable and - * there are completions awaiting free pages then a new round of - * page discarding is started. - * - * @param completion The completion to release - **/ -void release_vdo_page_completion(struct vdo_completion *completion); - -/** - * Asynchronous operation to get a VDO page. - * - * May cause another page to be discarded (potentially writing a dirty page) - * and the one nominated by the completion to be loaded from disk. - * - * When the page becomes available the callback registered in the completion - * provided is triggered. Once triggered the page is marked busy until - * the completion is destroyed. - * - * @param completion the completion initialized by - * init_vdo_page_completion(). - **/ -void get_vdo_page(struct vdo_completion *completion); - -/** - * Mark a VDO page referenced by a completed vdo_page_completion as dirty. - * - * @param completion a VDO Page Completion whose callback has been - * called - * @param old_dirty_period the period in which the page was already dirty (0 - * if it wasn't) - * @param new_dirty_period the period in which the page is now dirty - **/ -void mark_completed_vdo_page_dirty(struct vdo_completion *completion, - sequence_number_t old_dirty_period, - sequence_number_t new_dirty_period); - -/** - * Request that a VDO page be written out as soon as it is not busy. - * - * @param completion the vdo_page_completion containing the page - **/ -void request_vdo_page_write(struct vdo_completion *completion); - -/** - * Access the raw memory for a read-only page of a completed - * vdo_page_completion. - * - * @param completion a vdo page completion whose callback has been called - * - * @return a pointer to the raw memory at the beginning of the page, or - * NULL if the page is not available. - **/ -const void *dereference_readable_vdo_page(struct vdo_completion *completion); - -/** - * Access the raw memory for a writable page of a completed - * vdo_page_completion. 
- * - * @param completion a vdo page completion whose callback has been called - * - * @return a pointer to the raw memory at the beginning of the page, or - * NULL if the page is not available, or if the page is read-only - **/ -void *dereference_writable_vdo_page(struct vdo_completion *completion); - -/** - * Get the per-page client context for the page in a page completion whose - * callback has been invoked. Should only be called after dereferencing the - * page completion to validate the page. - * - * @param completion a vdo page completion whose callback has been invoked - * - * @return a pointer to the per-page client context, or NULL if - * the page is not available - **/ -void *get_vdo_page_completion_context(struct vdo_completion *completion); - -/** - * Drain I/O for a page cache. - * - * @param cache The cache to drain - **/ -void drain_vdo_page_cache(struct vdo_page_cache *cache); - -/** - * Invalidate all entries in the VDO page cache. There must not be any - * dirty pages in the cache. - * - * @param cache the cache to invalidate - * - * @return a success or error code - **/ -int __must_check invalidate_vdo_page_cache(struct vdo_page_cache *cache); - -// STATISTICS & TESTING - -/** - * Get current cache statistics. - * - * @param cache the page cache - * - * @return the statistics - **/ -struct block_map_statistics __must_check -get_vdo_page_cache_statistics(const struct vdo_page_cache *cache); - -#endif // VDO_PAGE_CACHE_H diff --git a/vdo/vdoPageCacheInternals.h b/vdo/vdoPageCacheInternals.h deleted file mode 100644 index 27442c03..00000000 --- a/vdo/vdoPageCacheInternals.h +++ /dev/null @@ -1,186 +0,0 @@ -/* - * Copyright Red Hat - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/sulfur/src/c++/vdo/base/vdoPageCacheInternals.h#4 $ - */ - -#ifndef VDO_PAGE_CACHE_INTERNALS_H -#define VDO_PAGE_CACHE_INTERNALS_H - -#include "vdoPageCache.h" - -#include - - -#include "blockMapInternals.h" -#include "completion.h" -#include "dirtyLists.h" -#include "intMap.h" - -enum { - MAX_PAGE_CONTEXT_SIZE = 8, -}; - -static const physical_block_number_t NO_PAGE = 0xFFFFFFFFFFFFFFFF; - -/** - * The VDO Page Cache abstraction. 
- **/ -struct vdo_page_cache { - /** the VDO which owns this cache */ - struct vdo *vdo; - /** number of pages in cache */ - page_count_t page_count; - /** function to call on page read */ - vdo_page_read_function *read_hook; - /** function to call on page write */ - vdo_page_write_function *write_hook; - /** number of pages to write in the current batch */ - page_count_t pages_in_batch; - /** Whether the VDO is doing a read-only rebuild */ - bool rebuilding; - - /** array of page information entries */ - struct page_info *infos; - /** raw memory for pages */ - char *pages; - /** cache last found page info */ - struct page_info *last_found; - /** map of page number to info */ - struct int_map *page_map; - /** main LRU list (all infos) */ - struct list_head lru_list; - /** dirty pages by period */ - struct dirty_lists *dirty_lists; - /** free page list (oldest first) */ - struct list_head free_list; - /** outgoing page list */ - struct list_head outgoing_list; - /** number of read I/O operations pending */ - page_count_t outstanding_reads; - /** number of write I/O operations pending */ - page_count_t outstanding_writes; - /** number of pages covered by the current flush */ - page_count_t pages_in_flush; - /** number of pages waiting to be included in the next flush */ - page_count_t pages_to_flush; - /** number of discards in progress */ - unsigned int discard_count; - /** how many VPCs waiting for free page */ - unsigned int waiter_count; - /** queue of waiters who want a free page */ - struct wait_queue free_waiters; - /** - * Statistics are only updated on the logical zone thread, but are - * accessed from other threads. - **/ - struct block_map_statistics stats; - /** counter for pressure reports */ - uint32_t pressure_report; - /** the block map zone to which this cache belongs */ - struct block_map_zone *zone; -}; - -/** - * The state of a page buffer. If the page buffer is free no particular page is - * bound to it, otherwise the page buffer is bound to particular page whose - * absolute pbn is in the pbn field. If the page is resident or dirty the page - * data is stable and may be accessed. Otherwise the page is in flight - * (incoming or outgoing) and its data should not be accessed. - * - * @note Update the static data in get_page_state_name() if you change this - * enumeration. - **/ -enum vdo_page_buffer_state { - /* this page buffer is not being used */ - PS_FREE, - /* this page is being read from store */ - PS_INCOMING, - /* attempt to load this page failed */ - PS_FAILED, - /* this page is valid and un-modified */ - PS_RESIDENT, - /* this page is valid and modified */ - PS_DIRTY, - /* this page is being written and should not be used */ - PS_OUTGOING, - /* not a state */ - PAGE_STATE_COUNT, -} __packed; - -/** - * The write status of page - **/ -enum vdo_page_write_status { - WRITE_STATUS_NORMAL, - WRITE_STATUS_DISCARD, - WRITE_STATUS_DEFERRED, -} __packed; - -/** - * Per-page-slot information. 
- **/ -struct page_info { - /** Preallocated page struct vio */ - struct vio *vio; - /** back-link for references */ - struct vdo_page_cache *cache; - /** the pbn of the page */ - physical_block_number_t pbn; - /** page is busy (temporarily locked) */ - uint16_t busy; - /** the write status the page */ - enum vdo_page_write_status write_status; - /** page state */ - enum vdo_page_buffer_state state; - /** queue of completions awaiting this item */ - struct wait_queue waiting; - /** state linked list entry */ - struct list_head state_entry; - /** LRU entry */ - struct list_head lru_entry; - /** Space for per-page client data */ - byte context[MAX_PAGE_CONTEXT_SIZE]; -}; - -/**********************************************************************/ -static inline bool is_vdo_page_dirty(const struct page_info *info) -{ - return info->state == PS_DIRTY; -} - -/**********************************************************************/ -static inline struct vdo_page_completion * -as_vdo_page_completion(struct vdo_completion *completion) -{ - assert_vdo_completion_type(completion->type, VDO_PAGE_COMPLETION); - return container_of(completion, struct vdo_page_completion, completion); -} - -/** - * Find the page info (if any) associated with a given pbn. - * - * @param cache the page cache - * @param pbn the absolute physical block number of the page - * - * @return the page info for the page if available, or NULL if not - **/ -struct page_info * __must_check -vdo_page_cache_find_page(struct vdo_page_cache *cache, physical_block_number_t pbn); - -#endif // VDO_PAGE_CACHE_INTERNALS_H diff --git a/vdo/vdoRecovery.h b/vdo/vdoRecovery.h deleted file mode 100644 index 6cf80454..00000000 --- a/vdo/vdoRecovery.h +++ /dev/null @@ -1,52 +0,0 @@ -/* - * Copyright Red Hat - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/sulfur/src/c++/vdo/base/vdoRecovery.h#7 $ - */ - -#ifndef VDO_RECOVERY_H -#define VDO_RECOVERY_H - -#include "completion.h" -#include "vdo.h" - -/** - * Replay recovery journal entries in the the slab journals of slabs owned by a - * given block_allocator. - * - * @param allocator The allocator whose slab journals are to be recovered - * @param completion The completion to use for waiting on slab journal space - * @param context The slab depot load context supplied by a recovery when - * it loads the depot - **/ -void vdo_replay_into_slab_journals(struct block_allocator *allocator, - struct vdo_completion *completion, - void *context); - -/** - * Construct a recovery completion and launch it. Apply all valid journal block - * entries to all vdo structures. This function performs the offline portion of - * recovering a vdo from a crash. 
- * - * @param vdo The vdo to recover - * @param parent The completion to notify when the offline portion of the - * recovery is complete - **/ -void vdo_launch_recovery(struct vdo *vdo, struct vdo_completion *parent); - -#endif // VDO_RECOVERY_H diff --git a/vdo/vdoRecoveryInternals.h b/vdo/vdoRecoveryInternals.h deleted file mode 100644 index 5df771ad..00000000 --- a/vdo/vdoRecoveryInternals.h +++ /dev/null @@ -1,140 +0,0 @@ -/* - * Copyright Red Hat - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/sulfur/src/c++/vdo/base/vdoRecoveryInternals.h#5 $ - */ - -#ifndef VDO_RECOVERY_INTERNALS_H -#define VDO_RECOVERY_INTERNALS_H - -#include "vdoRecovery.h" - -#include "blockMapRecovery.h" -#include "intMap.h" -#include "journalPoint.h" -#include "types.h" -#include "waitQueue.h" - -/** - * The absolute position of an entry in the recovery journal, including - * the sector number and the entry number within the sector. - **/ -struct recovery_point { - sequence_number_t sequence_number; // Block sequence number - uint8_t sector_count; // Sector number - journal_entry_count_t entry_count; // Entry number -}; - -struct recovery_completion { - /** The completion header */ - struct vdo_completion completion; - /** The sub-task completion */ - struct vdo_completion sub_task_completion; - /** The struct vdo in question */ - struct vdo *vdo; - /** The struct block_allocator whose journals are being recovered */ - struct block_allocator *allocator; - /** A buffer to hold the data read off disk */ - char *journal_data; - /** The number of increfs */ - size_t incref_count; - - /** The entry data for the block map recovery */ - struct numbered_block_mapping *entries; - /** The number of entries in the entry array */ - size_t entry_count; - /** - * The sequence number of the first valid block for block map recovery - */ - sequence_number_t block_map_head; - /** - * The sequence number of the first valid block for slab journal replay - */ - sequence_number_t slab_journal_head; - /** - * The sequence number of the last valid block of the journal (if - * known) - */ - sequence_number_t tail; - /** - * The highest sequence number of the journal, not the same as the tail, - * since the tail ignores blocks after the first hole. 
- */ - sequence_number_t highest_tail; - - /** A location just beyond the last valid entry of the journal */ - struct recovery_point tail_recovery_point; - /** The location of the next recovery journal entry to apply */ - struct recovery_point next_recovery_point; - /** The number of logical blocks currently known to be in use */ - block_count_t logical_blocks_used; - /** The number of block map data blocks known to be allocated */ - block_count_t block_map_data_blocks; - /** The journal point to give to the next synthesized decref */ - struct journal_point next_journal_point; - /** The number of entries played into slab journals */ - size_t entries_added_to_slab_journals; - - // Decref synthesis fields - - /** An int_map for use in finding which slots are missing decrefs */ - struct int_map *slot_entry_map; - /** The number of synthesized decrefs */ - size_t missing_decref_count; - /** The number of incomplete decrefs */ - size_t incomplete_decref_count; - /** The fake journal point of the next missing decref */ - struct journal_point next_synthesized_journal_point; - /** The queue of missing decrefs */ - struct wait_queue missing_decrefs[]; -}; - -/** - * Convert a generic completion to a recovery_completion. - * - * @param completion The completion to convert - * - * @return The recovery_completion - **/ -static inline struct recovery_completion * __must_check -as_vdo_recovery_completion(struct vdo_completion *completion) -{ - assert_vdo_completion_type(completion->type, VDO_RECOVERY_COMPLETION); - return container_of(completion, struct recovery_completion, completion); -} - -/** - * Allocate and initialize a recovery_completion. - * - * @param vdo The vdo in question - * @param recovery_ptr A pointer to hold the new recovery_completion - * - * @return VDO_SUCCESS or a status code - **/ -int __must_check -make_vdo_recovery_completion(struct vdo *vdo, - struct recovery_completion **recovery_ptr); - -/** - * Free a recovery_completion and all underlying structures. - * - * @param recovery The recovery completion to free - **/ -void free_vdo_recovery_completion(struct recovery_completion *recovery); - -#endif // VDO_RECOVERY_INTERNALS_H diff --git a/vdo/vdoResize.c b/vdo/vdoResize.c deleted file mode 100644 index dc6f3958..00000000 --- a/vdo/vdoResize.c +++ /dev/null @@ -1,282 +0,0 @@ -/* - * Copyright Red Hat - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. 
- * - * $Id: //eng/vdo-releases/sulfur/src/c++/vdo/base/vdoResize.c#32 $ - */ - -#include "vdoResize.h" - -#include "logger.h" - -#include "adminCompletion.h" -#include "completion.h" -#include "recoveryJournal.h" -#include "slabDepot.h" -#include "slabSummary.h" -#include "vdoInternal.h" -#include "vdoLayout.h" - -enum { - GROW_PHYSICAL_PHASE_START = 0, - GROW_PHYSICAL_PHASE_COPY_SUMMARY, - GROW_PHYSICAL_PHASE_UPDATE_COMPONENTS, - GROW_PHYSICAL_PHASE_USE_NEW_SLABS, - GROW_PHYSICAL_PHASE_END, - GROW_PHYSICAL_PHASE_ERROR, -}; - -static const char *GROW_PHYSICAL_PHASE_NAMES[] = { - "GROW_PHYSICAL_PHASE_START", - "GROW_PHYSICAL_PHASE_COPY_SUMMARY", - "GROW_PHYSICAL_PHASE_UPDATE_COMPONENTS", - "GROW_PHYSICAL_PHASE_USE_NEW_SLABS", - "GROW_PHYSICAL_PHASE_END", - "GROW_PHYSICAL_PHASE_ERROR", -}; - -/** - * Implements vdo_thread_id_getter_for_phase. - **/ -static thread_id_t __must_check -get_thread_id_for_phase(struct admin_completion *admin_completion) -{ - return admin_completion->vdo->thread_config->admin_thread; -} - -/** - * Callback to initiate a grow physical, registered in - * perform_vdo_grow_physical(). - * - * @param completion The sub-task completion - **/ -static void grow_physical_callback(struct vdo_completion *completion) -{ - struct admin_completion *admin_completion = - vdo_admin_completion_from_sub_task(completion); - struct vdo *vdo = admin_completion->vdo; - - assert_vdo_admin_operation_type(admin_completion, - VDO_ADMIN_OPERATION_GROW_PHYSICAL); - assert_vdo_admin_phase_thread(admin_completion, __func__, - GROW_PHYSICAL_PHASE_NAMES); - - switch (admin_completion->phase++) { - case GROW_PHYSICAL_PHASE_START: - if (vdo_is_read_only(vdo->read_only_notifier)) { - uds_log_error_strerror(VDO_READ_ONLY, - "Can't grow physical size of a read-only VDO"); - set_vdo_completion_result(reset_vdo_admin_sub_task(completion), - VDO_READ_ONLY); - break; - } - - if (start_vdo_operation_with_waiter(&vdo->admin_state, - VDO_ADMIN_STATE_SUSPENDED_OPERATION, - &admin_completion->completion, - NULL)) { - // Copy the journal into the new layout. - copy_vdo_layout_partition(vdo->layout, - RECOVERY_JOURNAL_PARTITION, - reset_vdo_admin_sub_task(completion)); - } - return; - - case GROW_PHYSICAL_PHASE_COPY_SUMMARY: - copy_vdo_layout_partition(vdo->layout, - SLAB_SUMMARY_PARTITION, - reset_vdo_admin_sub_task(completion)); - return; - - case GROW_PHYSICAL_PHASE_UPDATE_COMPONENTS: - vdo->states.vdo.config.physical_blocks = - grow_vdo_layout(vdo->layout); - update_vdo_slab_depot_size(vdo->depot); - save_vdo_components(vdo, reset_vdo_admin_sub_task(completion)); - return; - - case GROW_PHYSICAL_PHASE_USE_NEW_SLABS: - vdo_use_new_slabs(vdo->depot, reset_vdo_admin_sub_task(completion)); - return; - - case GROW_PHYSICAL_PHASE_END: - set_vdo_slab_summary_origin(get_vdo_slab_summary(vdo->depot), - get_vdo_partition(vdo->layout, - SLAB_SUMMARY_PARTITION)); - set_vdo_recovery_journal_partition(vdo->recovery_journal, - get_vdo_partition(vdo->layout, - RECOVERY_JOURNAL_PARTITION)); - break; - - case GROW_PHYSICAL_PHASE_ERROR: - vdo_enter_read_only_mode(vdo->read_only_notifier, - completion->result); - break; - - default: - set_vdo_completion_result(reset_vdo_admin_sub_task(completion), - UDS_BAD_STATE); - } - - finish_vdo_layout_growth(vdo->layout); - finish_vdo_operation(&vdo->admin_state, completion->result); -} - -/** - * Handle an error during the grow physical process. 
- * - * @param completion The sub-task completion - **/ -static void handle_growth_error(struct vdo_completion *completion) -{ - vdo_admin_completion_from_sub_task(completion)->phase = - GROW_PHYSICAL_PHASE_ERROR; - grow_physical_callback(completion); -} - -/**********************************************************************/ -int perform_vdo_grow_physical(struct vdo *vdo, block_count_t new_physical_blocks) -{ - int result; - block_count_t new_depot_size, prepared_depot_size; - - block_count_t old_physical_blocks = - vdo->states.vdo.config.physical_blocks; - - // Skip any noop grows. - if (old_physical_blocks == new_physical_blocks) { - return VDO_SUCCESS; - } - - if (new_physical_blocks != get_next_vdo_layout_size(vdo->layout)) { - /* - * Either the VDO isn't prepared to grow, or it was prepared to - * grow to a different size. Doing this check here relies on - * the fact that the call to this method is done under the - * dmsetup message lock. - */ - finish_vdo_layout_growth(vdo->layout); - vdo_abandon_new_slabs(vdo->depot); - return VDO_PARAMETER_MISMATCH; - } - - // Validate that we are prepared to grow appropriately. - new_depot_size = - vdo_get_next_block_allocator_partition_size(vdo->layout); - prepared_depot_size = get_vdo_slab_depot_new_size(vdo->depot); - if (prepared_depot_size != new_depot_size) { - return VDO_PARAMETER_MISMATCH; - } - - result = perform_vdo_admin_operation(vdo, - VDO_ADMIN_OPERATION_GROW_PHYSICAL, - get_thread_id_for_phase, - grow_physical_callback, - handle_growth_error); - if (result != VDO_SUCCESS) { - return result; - } - - uds_log_info("Physical block count was %llu, now %llu", - (unsigned long long) old_physical_blocks, - (unsigned long long) new_physical_blocks); - return VDO_SUCCESS; -} - -/** - * Callback to check that we're not in recovery mode, used in - * prepare_vdo_to_grow_physical(). - * - * @param completion The sub-task completion - **/ -static void check_may_grow_physical(struct vdo_completion *completion) -{ - struct admin_completion *admin_completion = - vdo_admin_completion_from_sub_task(completion); - struct vdo *vdo = admin_completion->vdo; - - assert_vdo_admin_operation_type(admin_completion, - VDO_ADMIN_OPERATION_PREPARE_GROW_PHYSICAL); - assert_on_admin_thread(vdo, __func__); - - reset_vdo_admin_sub_task(completion); - - // This check can only be done from a base code thread. - if (vdo_is_read_only(vdo->read_only_notifier)) { - finish_vdo_completion(completion->parent, VDO_READ_ONLY); - return; - } - - // This check should only be done from a base code thread. 
- if (in_recovery_mode(vdo)) { - finish_vdo_completion(completion->parent, VDO_RETRY_AFTER_REBUILD); - return; - } - - complete_vdo_completion(completion->parent); -} - -/**********************************************************************/ -int prepare_vdo_to_grow_physical(struct vdo *vdo, - block_count_t new_physical_blocks) -{ - int result; - block_count_t new_depot_size; - - block_count_t current_physical_blocks = - vdo->states.vdo.config.physical_blocks; - if (new_physical_blocks < current_physical_blocks) { - return uds_log_error_strerror(VDO_NOT_IMPLEMENTED, - "Removing physical storage from a VDO is not supported"); - } - - if (new_physical_blocks == current_physical_blocks) { - uds_log_warning("Requested physical block count %llu not greater than %llu", - (unsigned long long) new_physical_blocks, - (unsigned long long) current_physical_blocks); - finish_vdo_layout_growth(vdo->layout); - vdo_abandon_new_slabs(vdo->depot); - return VDO_PARAMETER_MISMATCH; - } - - result = perform_vdo_admin_operation(vdo, - VDO_ADMIN_OPERATION_PREPARE_GROW_PHYSICAL, - get_thread_id_for_phase, - check_may_grow_physical, - finish_vdo_completion_parent_callback); - if (result != VDO_SUCCESS) { - return result; - } - - result = prepare_to_grow_vdo_layout(vdo->layout, - current_physical_blocks, - new_physical_blocks, - vdo); - if (result != VDO_SUCCESS) { - return result; - } - - new_depot_size = - vdo_get_next_block_allocator_partition_size(vdo->layout); - result = vdo_prepare_to_grow_slab_depot(vdo->depot, new_depot_size); - if (result != VDO_SUCCESS) { - finish_vdo_layout_growth(vdo->layout); - return result; - } - - return VDO_SUCCESS; -} diff --git a/vdo/vdoResize.h b/vdo/vdoResize.h deleted file mode 100644 index a8cdadea..00000000 --- a/vdo/vdoResize.h +++ /dev/null @@ -1,63 +0,0 @@ -/* - * Copyright Red Hat - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/sulfur/src/c++/vdo/base/vdoResize.h#8 $ - */ - -#ifndef VDO_RESIZE_H -#define VDO_RESIZE_H - -#include "types.h" - -/** - * Make the completion for an asynchronous resize. - * - * @param vdo The vdo - * @param new_physical_blocks The new physical size in blocks - * @param completion_ptr A pointer to hold the completion - * - * @return VDO_SUCCESS or an error - **/ -int __must_check -make_resize_vdo_completion(struct vdo *vdo, - block_count_t new_physical_blocks, - struct vdo_completion **completion_ptr); - -/** - * Grow the physical size of the vdo. This method may only be called when the - * vdo has been suspended and must not be called from a base thread. 
- * - * @param vdo The vdo to resize - * @param new_physical_blocks The new physical size in blocks - * - * @return VDO_SUCCESS or an error - **/ -int perform_vdo_grow_physical(struct vdo *vdo, - block_count_t new_physical_blocks); - -/** - * Prepare to resize the vdo, allocating memory as needed. - * - * @param vdo The vdo - * @param new_physical_blocks The new physical size in blocks - **/ -int __must_check -prepare_vdo_to_grow_physical(struct vdo *vdo, - block_count_t new_physical_blocks); - -#endif /* VDO_RESIZE_H */ diff --git a/vdo/vdoResizeLogical.c b/vdo/vdoResizeLogical.c deleted file mode 100644 index 72cca342..00000000 --- a/vdo/vdoResizeLogical.c +++ /dev/null @@ -1,168 +0,0 @@ -/* - * Copyright Red Hat - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/sulfur/src/c++/vdo/base/vdoResizeLogical.c#29 $ - */ - -#include "vdoResizeLogical.h" - -#include "logger.h" - -#include "adminCompletion.h" -#include "blockMap.h" -#include "completion.h" -#include "vdoInternal.h" - -enum { - GROW_LOGICAL_PHASE_START = 0, - GROW_LOGICAL_PHASE_GROW_BLOCK_MAP, - GROW_LOGICAL_PHASE_END, - GROW_LOGICAL_PHASE_ERROR, -}; - -static const char *GROW_LOGICAL_PHASE_NAMES[] = { - "GROW_LOGICAL_PHASE_START", - "GROW_LOGICAL_PHASE_GROW_BLOCK_MAP", - "GROW_LOGICAL_PHASE_END", - "GROW_LOGICAL_PHASE_ERROR", -}; - -/** - * Implements vdo_thread_id_getter_for_phase. - **/ -static thread_id_t __must_check -get_thread_id_for_phase(struct admin_completion *admin_completion) -{ - return admin_completion->vdo->thread_config->admin_thread; -} - -/** - * Callback to initiate a grow logical, registered in - * perform_vdo_grow_logical(). 
- * - * @param completion The sub-task completion - **/ -static void grow_logical_callback(struct vdo_completion *completion) -{ - struct admin_completion *admin_completion = - vdo_admin_completion_from_sub_task(completion); - struct vdo *vdo = admin_completion->vdo; - - assert_vdo_admin_operation_type(admin_completion, - VDO_ADMIN_OPERATION_GROW_LOGICAL); - assert_vdo_admin_phase_thread(admin_completion, __func__, - GROW_LOGICAL_PHASE_NAMES); - - switch (admin_completion->phase++) { - case GROW_LOGICAL_PHASE_START: - if (vdo_is_read_only(vdo->read_only_notifier)) { - uds_log_error_strerror(VDO_READ_ONLY, - "Can't grow logical size of a read-only VDO"); - finish_vdo_completion(reset_vdo_admin_sub_task(completion), - VDO_READ_ONLY); - return; - } - - if (start_vdo_operation_with_waiter(&vdo->admin_state, - VDO_ADMIN_STATE_SUSPENDED_OPERATION, - &admin_completion->completion, - NULL)) { - vdo->states.vdo.config.logical_blocks = - vdo_get_new_entry_count(get_block_map(vdo)); - save_vdo_components(vdo, - reset_vdo_admin_sub_task(completion)); - } - - return; - - case GROW_LOGICAL_PHASE_GROW_BLOCK_MAP: - grow_vdo_block_map(get_block_map(vdo), - reset_vdo_admin_sub_task(completion)); - return; - - case GROW_LOGICAL_PHASE_END: - break; - - case GROW_LOGICAL_PHASE_ERROR: - vdo_enter_read_only_mode(vdo->read_only_notifier, - completion->result); - break; - - default: - set_vdo_completion_result(reset_vdo_admin_sub_task(completion), - UDS_BAD_STATE); - } - - finish_vdo_operation(&vdo->admin_state, completion->result); -} - -/** - * Handle an error during the grow physical process. - * - * @param completion The sub-task completion - **/ -static void handle_growth_error(struct vdo_completion *completion) -{ - struct admin_completion *admin_completion = - vdo_admin_completion_from_sub_task(completion); - if (admin_completion->phase == GROW_LOGICAL_PHASE_GROW_BLOCK_MAP) { - // We've failed to write the new size in the super block, so set - // our in memory config back to the old size. - struct vdo *vdo = admin_completion->vdo; - struct block_map *map = get_block_map(vdo); - vdo->states.vdo.config.logical_blocks = - vdo_get_number_of_block_map_entries(map); - vdo_abandon_block_map_growth(map); - } - - admin_completion->phase = GROW_LOGICAL_PHASE_ERROR; - grow_logical_callback(completion); -} - -/**********************************************************************/ -int perform_vdo_grow_logical(struct vdo *vdo, block_count_t new_logical_blocks) -{ - if (vdo_get_new_entry_count(get_block_map(vdo)) != new_logical_blocks) { - return VDO_PARAMETER_MISMATCH; - } - - return perform_vdo_admin_operation(vdo, - VDO_ADMIN_OPERATION_GROW_LOGICAL, - get_thread_id_for_phase, - grow_logical_callback, - handle_growth_error); -} - -/**********************************************************************/ -int prepare_vdo_to_grow_logical(struct vdo *vdo, block_count_t new_logical_blocks) -{ - const char *message; - block_count_t logical_blocks = vdo->states.vdo.config.logical_blocks; - if (new_logical_blocks > logical_blocks) { - return vdo_prepare_to_grow_block_map(get_block_map(vdo), - new_logical_blocks); - } - - message = ((new_logical_blocks < logical_blocks) - ? 
"Can't shrink VDO logical size from its current value of " - : "Can't grow VDO logical size to its current value of "); - return uds_log_error_strerror(VDO_PARAMETER_MISMATCH, - "%s%llu", - message, - (unsigned long long) logical_blocks); -} diff --git a/vdo/vdoResizeLogical.h b/vdo/vdoResizeLogical.h deleted file mode 100644 index 796bd81f..00000000 --- a/vdo/vdoResizeLogical.h +++ /dev/null @@ -1,49 +0,0 @@ -/* - * Copyright Red Hat - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/sulfur/src/c++/vdo/base/vdoResizeLogical.h#7 $ - */ - -#ifndef VDO_RESIZE_LOGICAL_H -#define VDO_RESIZE_LOGICAL_H - -#include "types.h" - -/** - * Grow the logical size of the vdo. This method may only be called when the - * vdo has been suspended and must not be called from a base thread. - * - * @param vdo The vdo to grow - * @param new_logical_blocks The size to which the vdo should be grown - * - * @return VDO_SUCCESS or an error - **/ -int perform_vdo_grow_logical(struct vdo *vdo, block_count_t new_logical_blocks); - -/** - * Prepare to grow the logical size of vdo. This method may only be called - * while the vdo is running. - * - * @param vdo The vdo to prepare for growth - * @param new_logical_blocks The size to which the vdo should be grown - * - * @return VDO_SUCCESS or an error - **/ -int prepare_vdo_to_grow_logical(struct vdo *vdo, block_count_t new_logical_blocks); - -#endif /* VDO_RESIZE_LOGICAL_H */ diff --git a/vdo/vdoResume.c b/vdo/vdoResume.c deleted file mode 100644 index 7b91c7b1..00000000 --- a/vdo/vdoResume.c +++ /dev/null @@ -1,203 +0,0 @@ -/* - * Copyright Red Hat - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. 
- * - * $Id: //eng/vdo-releases/sulfur/src/c++/vdo/base/vdoResume.c#33 $ - */ - -#include "vdoResume.h" - -#include "logger.h" - -#include "adminCompletion.h" -#include "blockMap.h" -#include "completion.h" -#include "logicalZone.h" -#include "recoveryJournal.h" -#include "slabDepot.h" -#include "slabSummary.h" -#include "threadConfig.h" -#include "vdoInternal.h" - -enum { - RESUME_PHASE_START = 0, - RESUME_PHASE_ALLOW_READ_ONLY_MODE, - RESUME_PHASE_DEPOT, - RESUME_PHASE_JOURNAL, - RESUME_PHASE_BLOCK_MAP, - RESUME_PHASE_LOGICAL_ZONES, - RESUME_PHASE_PACKER, - RESUME_PHASE_FLUSHER, - RESUME_PHASE_END, -}; - -static const char *RESUME_PHASE_NAMES[] = { - "RESUME_PHASE_START", - "RESUME_PHASE_ALLOW_READ_ONLY_MODE", - "RESUME_PHASE_DEPOT", - "RESUME_PHASE_JOURNAL", - "RESUME_PHASE_BLOCK_MAP", - "RESUME_PHASE_LOGICAL_ZONES", - "RESUME_PHASE_PACKER", - "RESUME_PHASE_FLUSHER", - "RESUME_PHASE_END", -}; - -/** - * Implements vdo_thread_id_getter_for_phase. - **/ -static thread_id_t __must_check -get_thread_id_for_phase(struct admin_completion *admin_completion) -{ - const struct thread_config *thread_config = - get_vdo_thread_config(admin_completion->vdo); - switch (admin_completion->phase) { - case RESUME_PHASE_JOURNAL: - return thread_config->journal_thread; - - case RESUME_PHASE_PACKER: - case RESUME_PHASE_FLUSHER: - return thread_config->packer_thread; - - default: - return thread_config->admin_thread; - } -} - -/** - * Update the VDO state and save the super block. - * - * @param vdo The vdo being resumed - * @param completion The admin_completion's sub-task completion - **/ -static void write_super_block(struct vdo *vdo, - struct vdo_completion *completion) -{ - switch (get_vdo_state(vdo)) { - case VDO_CLEAN: - case VDO_NEW: - set_vdo_state(vdo, VDO_DIRTY); - save_vdo_components(vdo, completion); - return; - - case VDO_DIRTY: - case VDO_READ_ONLY_MODE: - case VDO_FORCE_REBUILD: - case VDO_RECOVERING: - case VDO_REBUILD_FOR_UPGRADE: - // No need to write the super block in these cases - complete_vdo_completion(completion); - return; - - case VDO_REPLAYING: - default: - finish_vdo_completion(completion, UDS_BAD_STATE); - } -} - -/** - * Callback to resume a VDO. 
- * - * @param completion The sub-task completion - **/ -static void resume_callback(struct vdo_completion *completion) -{ - struct admin_completion *admin_completion = - vdo_admin_completion_from_sub_task(completion); - struct vdo *vdo = admin_completion->vdo; - assert_vdo_admin_operation_type(admin_completion, - VDO_ADMIN_OPERATION_RESUME); - assert_vdo_admin_phase_thread(admin_completion, __func__, - RESUME_PHASE_NAMES); - - switch (admin_completion->phase++) { - case RESUME_PHASE_START: - if (start_vdo_resuming(&vdo->admin_state, - VDO_ADMIN_STATE_RESUMING, - &admin_completion->completion, - NULL)) { - write_super_block(vdo, completion); - } - return; - - case RESUME_PHASE_ALLOW_READ_ONLY_MODE: - vdo_allow_read_only_mode_entry(vdo->read_only_notifier, - reset_vdo_admin_sub_task(completion)); - return; - - case RESUME_PHASE_DEPOT: - resume_vdo_slab_depot(vdo->depot, reset_vdo_admin_sub_task(completion)); - return; - - case RESUME_PHASE_JOURNAL: - resume_vdo_recovery_journal(vdo->recovery_journal, - reset_vdo_admin_sub_task(completion)); - return; - - case RESUME_PHASE_BLOCK_MAP: - resume_vdo_block_map(vdo->block_map, - reset_vdo_admin_sub_task(completion)); - return; - - case RESUME_PHASE_LOGICAL_ZONES: - resume_vdo_logical_zones(vdo->logical_zones, - reset_vdo_admin_sub_task(completion)); - return; - - case RESUME_PHASE_PACKER: - { - bool was_enabled = get_vdo_compressing(vdo); - bool enable = vdo->device_config->compression; - - if (enable != was_enabled) { - WRITE_ONCE(vdo->compressing, enable); - } - uds_log_info("compression is %s", - (enable ? "enabled" : "disabled")); - - resume_vdo_packer(vdo->packer, - reset_vdo_admin_sub_task(completion)); - return; - } - case RESUME_PHASE_FLUSHER: - vdo_resume_flusher(vdo->flusher, - reset_vdo_admin_sub_task(completion)); - return; - - case RESUME_PHASE_END: - break; - - default: - set_vdo_completion_result(reset_vdo_admin_sub_task(completion), - UDS_BAD_STATE); - } - - finish_vdo_resuming_with_result(&vdo->admin_state, completion->result); -} - -/**********************************************************************/ -int perform_vdo_resume(struct vdo *vdo) -{ - int result = perform_vdo_admin_operation(vdo, - VDO_ADMIN_OPERATION_RESUME, - get_thread_id_for_phase, - resume_callback, - preserve_vdo_completion_error_and_continue); - - /* Even if the vdo is read-only, it has still resumed. */ - return ((result == VDO_READ_ONLY) ? VDO_SUCCESS : result); -} diff --git a/vdo/vdoResume.h b/vdo/vdoResume.h deleted file mode 100644 index 41a609cd..00000000 --- a/vdo/vdoResume.h +++ /dev/null @@ -1,36 +0,0 @@ -/* - * Copyright Red Hat - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/sulfur/src/c++/vdo/base/vdoResume.h#7 $ - */ - -#ifndef VDO_RESUME_H -#define VDO_RESUME_H - -#include "types.h" - -/** - * Resume a suspended vdo. 
- * - * @param vdo The vdo to resume - * - * @return VDO_SUCCESS or an error - **/ -int perform_vdo_resume(struct vdo *vdo); - -#endif /* VDO_RESUME_H */ diff --git a/vdo/vdoState.c b/vdo/vdoState.c deleted file mode 100644 index 54b7e9f1..00000000 --- a/vdo/vdoState.c +++ /dev/null @@ -1,69 +0,0 @@ -/* - * Copyright Red Hat - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/sulfur/src/c++/vdo/base/vdoState.c#9 $ - */ - -#include "vdoState.h" - -#include "permassert.h" - -static const char *VDO_STATE_NAMES[] = { - [VDO_CLEAN] = "CLEAN", - [VDO_DIRTY] = "DIRTY", - [VDO_FORCE_REBUILD] = "FORCE_REBUILD", - [VDO_NEW] = "NEW", - [VDO_READ_ONLY_MODE] = "READ_ONLY_MODE", - [VDO_REBUILD_FOR_UPGRADE] = "REBUILD_FOR_UPGRADE", - [VDO_RECOVERING] = "RECOVERING", - [VDO_REPLAYING] = "REPLAYING", -}; - -/**********************************************************************/ -const char *get_vdo_state_name(enum vdo_state state) -{ - int result; - - // Catch if a state has been added without updating the name array. - STATIC_ASSERT(COUNT_OF(VDO_STATE_NAMES) == VDO_STATE_COUNT); - - result = ASSERT(state < COUNT_OF(VDO_STATE_NAMES), - "vdo_state value %u must have a registered name", - state); - if (result != UDS_SUCCESS) { - return "INVALID VDO STATE CODE"; - } - - return VDO_STATE_NAMES[state]; -} - -/**********************************************************************/ -const char *describe_vdo_state(enum vdo_state state) -{ - // These strings should all fit in the 15 chars of VDOStatistics.mode. - switch (state) { - case VDO_RECOVERING: - return "recovering"; - - case VDO_READ_ONLY_MODE: - return "read-only"; - - default: - return "normal"; - } -} diff --git a/vdo/vdoState.h b/vdo/vdoState.h deleted file mode 100644 index 5121d12b..00000000 --- a/vdo/vdoState.h +++ /dev/null @@ -1,63 +0,0 @@ -/* - * Copyright Red Hat - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/sulfur/src/c++/vdo/base/vdoState.h#8 $ - */ - -#ifndef VDO_STATE_H -#define VDO_STATE_H - -#include "compiler.h" - -/** - * The current operating mode of the VDO. These are persistent on disk - * so the values must not change. 
- **/ -enum vdo_state { - VDO_DIRTY = 0, - VDO_NEW = 1, - VDO_CLEAN = 2, - VDO_READ_ONLY_MODE = 3, - VDO_FORCE_REBUILD = 4, - VDO_RECOVERING = 5, - VDO_REPLAYING = 6, - VDO_REBUILD_FOR_UPGRADE = 7, - - // Keep VDO_STATE_COUNT at the bottom. - VDO_STATE_COUNT -}; - -/** - * Get the name of a VDO state code for logging purposes. - * - * @param state The state code - * - * @return The name of the state code - **/ -const char * __must_check get_vdo_state_name(enum vdo_state state); - -/** - * Return a user-visible string describing the current VDO state. - * - * @param state The VDO state to describe - * - * @return A string constant describing the state - **/ -const char * __must_check describe_vdo_state(enum vdo_state state); - -#endif // VDO_STATE_H diff --git a/vdo/vdoStringUtils.c b/vdo/vdoStringUtils.c deleted file mode 100644 index c514a440..00000000 --- a/vdo/vdoStringUtils.c +++ /dev/null @@ -1,166 +0,0 @@ -/* - * Copyright Red Hat - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/sulfur/src/c++/vdo/kernel/vdoStringUtils.c#14 $ - */ - -#include "vdoStringUtils.h" - -#include "errors.h" -#include "logger.h" -#include "memoryAlloc.h" -#include "stringUtils.h" - -#include "statusCodes.h" - -/**********************************************************************/ -void vdo_free_string_array(char **string_array) -{ - unsigned int offset; - - for (offset = 0; string_array[offset] != NULL; offset++) { - UDS_FREE(string_array[offset]); - } - UDS_FREE(string_array); -} - -/**********************************************************************/ -int vdo_split_string(const char *string, - char separator, - char ***substring_array_ptr) -{ - unsigned int current_substring = 0, substring_count = 1; - const char *s; - char **substrings; - int result; - ptrdiff_t length; - - for (s = string; *s != 0; s++) { - if (*s == separator) { - substring_count++; - } - } - - result = UDS_ALLOCATE(substring_count + 1, - char *, - "string-splitting array", - &substrings); - if (result != UDS_SUCCESS) { - return result; - } - - for (s = string; *s != 0; s++) { - if (*s == separator) { - ptrdiff_t length = s - string; - - result = UDS_ALLOCATE(length + 1, - char, - "split string", - &substrings[current_substring]); - if (result != UDS_SUCCESS) { - vdo_free_string_array(substrings); - return result; - } - // Trailing NUL is already in place after allocation; - // deal with the zero or more non-NUL bytes in the - // string. - if (length > 0) { - memcpy(substrings[current_substring], - string, - length); - } - string = s + 1; - current_substring++; - BUG_ON(current_substring >= substring_count); - } - } - // Process final string, with no trailing separator. 
- BUG_ON(current_substring != (substring_count - 1)); - length = strlen(string); - - result = UDS_ALLOCATE(length + 1, - char, - "split string", - &substrings[current_substring]); - if (result != UDS_SUCCESS) { - vdo_free_string_array(substrings); - return result; - } - memcpy(substrings[current_substring], string, length); - current_substring++; - // substrings[current_substring] is NULL already - *substring_array_ptr = substrings; - return UDS_SUCCESS; -} - -/**********************************************************************/ -int vdo_join_strings(char **substring_array, size_t array_length, - char separator, char **string_ptr) -{ - size_t string_length = 0; - size_t i; - int result; - char *output, *current_position; - - for (i = 0; (i < array_length) && (substring_array[i] != NULL); i++) { - string_length += strlen(substring_array[i]) + 1; - } - - result = UDS_ALLOCATE(string_length, char, __func__, &output); - - if (result != VDO_SUCCESS) { - return result; - } - - current_position = &output[0]; - - for (i = 0; (i < array_length) && (substring_array[i] != NULL); i++) { - current_position = uds_append_to_buffer(current_position, - output + string_length, - "%s", - substring_array[i]); - *current_position = separator; - current_position++; - } - - // We output one too many separators; replace the last with a zero byte. - if (current_position != output) { - *(current_position - 1) = '\0'; - } - - *string_ptr = output; - return UDS_SUCCESS; -} - -/**********************************************************************/ -int vdo_string_to_uint(const char *input, unsigned int *value_ptr) -{ - unsigned long long_value; - int result = kstrtoul(input, 10, &long_value); - - if (result != 0) { - return result; - } - - if (long_value > UINT_MAX) { - return -ERANGE; - } - - *value_ptr = long_value; - return UDS_SUCCESS; -} diff --git a/vdo/vdoStringUtils.h b/vdo/vdoStringUtils.h deleted file mode 100644 index 673a6d68..00000000 --- a/vdo/vdoStringUtils.h +++ /dev/null @@ -1,88 +0,0 @@ -/* - * Copyright Red Hat - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/sulfur/src/c++/vdo/kernel/vdoStringUtils.h#12 $ - */ - -#ifndef VDO_STRING_UTILS_H -#define VDO_STRING_UTILS_H - -#include - -/** - * Split the input string into substrings, separated at occurrences of - * the indicated character, returning a null-terminated list of string - * pointers. - * - * The string pointers and the pointer array itself should both be - * freed with UDS_FREE() when no longer needed. This can be done with - * vdo_free_string_array (below) if the pointers in the array are not - * changed. Since the array and copied strings are allocated by this - * function, it may only be used in contexts where allocation is - * permitted. 
- * - * Empty substrings are not ignored; that is, returned substrings may - * be empty strings if the separator occurs twice in a row. - * - * @param [in] string The input string to be broken apart - * @param [in] separator The separator character - * @param [out] substring_array_ptr The NULL-terminated substring array - * - * @return UDS_SUCCESS or -ENOMEM - **/ -int __must_check -vdo_split_string(const char *string, char separator, char ***substring_array_ptr); - -/** - * Join the input substrings into one string, joined with the indicated - * character, returning a string. - * - * @param [in] substring_array The NULL-terminated substring array - * @param [in] array_length A bound on the number of valid elements - * in substring_array, in case it is not - * NULL-terminated. - * @param [in] separator The separator character - * @param [out] string_ptr A pointer to hold the joined string - * - * @return VDO_SUCCESS or an error - **/ -int __must_check vdo_join_strings(char **substring_array, - size_t array_length, - char separator, - char **string_ptr); - -/** - * Free a list of non-NULL string pointers, and then the list itself. - * - * @param string_array The string list - **/ -void vdo_free_string_array(char **string_array); - -/** - * Parse a string as an "unsigned int" value, yielding the value. - * On overflow, -ERANGE is returned. On invalid number, -EINVAL is - * returned. - * - * @param [in] input The string to be processed - * @param [out] value_ptr The value of the number read - * - * @return UDS_SUCCESS or -EINVAL or -ERANGE. - **/ -int __must_check vdo_string_to_uint(const char *input, unsigned int *value_ptr); - -#endif /* VDO_STRING_UTILS_H */ diff --git a/vdo/vdoSuspend.c b/vdo/vdoSuspend.c deleted file mode 100644 index 8597a512..00000000 --- a/vdo/vdoSuspend.c +++ /dev/null @@ -1,259 +0,0 @@ -/* - * Copyright Red Hat - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. 
- * - * $Id: //eng/vdo-releases/sulfur/src/c++/vdo/base/vdoSuspend.c#33 $ - */ - -#include "vdoSuspend.h" - -#include "logger.h" -#include "permassert.h" - -#include "adminCompletion.h" -#include "blockMap.h" -#include "completion.h" -#include "dedupeIndex.h" -#include "logicalZone.h" -#include "recoveryJournal.h" -#include "slabDepot.h" -#include "slabSummary.h" -#include "threadConfig.h" -#include "vdoInternal.h" - -enum { - SUSPEND_PHASE_START = 0, - SUSPEND_PHASE_PACKER, - SUSPEND_PHASE_DATA_VIOS, - SUSPEND_PHASE_FLUSHES, - SUSPEND_PHASE_LOGICAL_ZONES, - SUSPEND_PHASE_BLOCK_MAP, - SUSPEND_PHASE_JOURNAL, - SUSPEND_PHASE_DEPOT, - SUSPEND_PHASE_READ_ONLY_WAIT, - SUSPEND_PHASE_WRITE_SUPER_BLOCK, - SUSPEND_PHASE_END, -}; - -static const char *SUSPEND_PHASE_NAMES[] = { - "SUSPEND_PHASE_START", - "SUSPEND_PHASE_PACKER", - "SUSPEND_PHASE_DATA_VIOS", - "SUSPEND_PHASE_FLUSHES", - "SUSPEND_PHASE_LOGICAL_ZONES", - "SUSPEND_PHASE_BLOCK_MAP", - "SUSPEND_PHASE_JOURNAL", - "SUSPEND_PHASE_DEPOT", - "SUSPEND_PHASE_READ_ONLY_WAIT", - "SUSPEND_PHASE_WRITE_SUPER_BLOCK", - "SUSPEND_PHASE_END", -}; - -/** - * Implements vdo_thread_id_getter_for_phase. - **/ -static thread_id_t __must_check -get_thread_id_for_phase(struct admin_completion *admin_completion) -{ - const struct thread_config *thread_config = - get_vdo_thread_config(admin_completion->vdo); - switch (admin_completion->phase) { - case SUSPEND_PHASE_PACKER: - case SUSPEND_PHASE_FLUSHES: - return thread_config->packer_thread; - - case SUSPEND_PHASE_JOURNAL: - return thread_config->journal_thread; - - default: - return thread_config->admin_thread; - } -} - -/** - * Update the VDO state and save the super block. - * - * @param vdo The vdo being suspended - * @param completion The admin_completion's sub-task completion - **/ -static void write_super_block(struct vdo *vdo, - struct vdo_completion *completion) -{ - switch (get_vdo_state(vdo)) { - case VDO_DIRTY: - case VDO_NEW: - set_vdo_state(vdo, VDO_CLEAN); - break; - - case VDO_CLEAN: - case VDO_READ_ONLY_MODE: - case VDO_FORCE_REBUILD: - case VDO_RECOVERING: - case VDO_REBUILD_FOR_UPGRADE: - break; - - case VDO_REPLAYING: - default: - finish_vdo_completion(completion, UDS_BAD_STATE); - return; - } - - save_vdo_components(vdo, completion); -} - -/** - * Callback to initiate a suspend, registered in suspend_vdo(). - * - * @param completion The sub-task completion - **/ -static void suspend_callback(struct vdo_completion *completion) -{ - struct admin_completion *admin_completion = - vdo_admin_completion_from_sub_task(completion); - struct vdo *vdo = admin_completion->vdo; - struct admin_state *admin_state = &vdo->admin_state; - int result; - - assert_vdo_admin_operation_type(admin_completion, - VDO_ADMIN_OPERATION_SUSPEND); - assert_vdo_admin_phase_thread(admin_completion, __func__, - SUSPEND_PHASE_NAMES); - - switch (admin_completion->phase++) { - case SUSPEND_PHASE_START: - if (start_vdo_draining(admin_state, - (vdo->no_flush_suspend - ? VDO_ADMIN_STATE_SUSPENDING - : VDO_ADMIN_STATE_SAVING), - &admin_completion->completion, - NULL)) { - complete_vdo_completion(reset_vdo_admin_sub_task(completion)); - } - return; - - case SUSPEND_PHASE_PACKER: - /* - * If the VDO was already resumed from a prior suspend while - * read-only, some of the components may not have been resumed. - * By setting a read-only error here, we guarantee that the - * result of this suspend will be VDO_READ_ONLY and not - * VDO_INVALID_ADMIN_STATE in that case. 
- */ - if (in_read_only_mode(vdo)) { - set_vdo_completion_result(&admin_completion->completion, - VDO_READ_ONLY); - } - - drain_vdo_packer(vdo->packer, - reset_vdo_admin_sub_task(completion)); - return; - - case SUSPEND_PHASE_DATA_VIOS: - drain_vdo_limiter(&vdo->request_limiter, - reset_vdo_admin_sub_task(completion)); - return; - - case SUSPEND_PHASE_FLUSHES: - vdo_drain_flusher(vdo->flusher, - reset_vdo_admin_sub_task(completion)); - return; - - case SUSPEND_PHASE_LOGICAL_ZONES: - /* - * Attempt to flush all I/O before completing post suspend - * work. We believe a suspended device is expected to have - * persisted all data written before the suspend, even if it - * hasn't been flushed yet. - */ - result = vdo_synchronous_flush(vdo); - if (result != VDO_SUCCESS) { - vdo_enter_read_only_mode(vdo->read_only_notifier, - result); - } - - drain_vdo_logical_zones(vdo->logical_zones, - get_vdo_admin_state_code(admin_state), - reset_vdo_admin_sub_task(completion)); - return; - - case SUSPEND_PHASE_BLOCK_MAP: - drain_vdo_block_map(vdo->block_map, - get_vdo_admin_state_code(admin_state), - reset_vdo_admin_sub_task(completion)); - return; - - case SUSPEND_PHASE_JOURNAL: - drain_vdo_recovery_journal(vdo->recovery_journal, - get_vdo_admin_state_code(admin_state), - reset_vdo_admin_sub_task(completion)); - return; - - case SUSPEND_PHASE_DEPOT: - drain_vdo_slab_depot(vdo->depot, - get_vdo_admin_state_code(admin_state), - reset_vdo_admin_sub_task(completion)); - return; - - case SUSPEND_PHASE_READ_ONLY_WAIT: - vdo_wait_until_not_entering_read_only_mode(vdo->read_only_notifier, - reset_vdo_admin_sub_task(completion)); - return; - - case SUSPEND_PHASE_WRITE_SUPER_BLOCK: - if (is_vdo_state_suspending(admin_state) || - (admin_completion->completion.result != VDO_SUCCESS)) { - // If we didn't save the VDO or there was an error, - // we're done. - break; - } - - write_super_block(vdo, reset_vdo_admin_sub_task(completion)); - return; - - case SUSPEND_PHASE_END: - suspend_vdo_dedupe_index(vdo->dedupe_index, - !vdo->no_flush_suspend); - break; - - default: - set_vdo_completion_result(completion, UDS_BAD_STATE); - } - - finish_vdo_draining_with_result(admin_state, completion->result); -} - -/**********************************************************************/ -int suspend_vdo(struct vdo *vdo) -{ - /* - * It's important to note any error here does not actually stop - * device-mapper from suspending the device. All this work is done - * post suspend. - */ - int result = perform_vdo_admin_operation(vdo, - VDO_ADMIN_OPERATION_SUSPEND, - get_thread_id_for_phase, - suspend_callback, - preserve_vdo_completion_error_and_continue); - - if ((result != VDO_SUCCESS) && (result != VDO_READ_ONLY)) { - uds_log_error_strerror(result, "%s: Suspend device failed", - __func__); - } - - return result; -} diff --git a/vdo/vdoSuspend.h b/vdo/vdoSuspend.h deleted file mode 100644 index 7a0254c9..00000000 --- a/vdo/vdoSuspend.h +++ /dev/null @@ -1,37 +0,0 @@ -/* - * Copyright Red Hat - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. 
- * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/sulfur/src/c++/vdo/base/vdoSuspend.h#7 $ - */ - -#ifndef VDO_SUSPEND_H -#define VDO_SUSPEND_H - -#include "types.h" - -/** - * Ensure that the vdo has no outstanding I/O and will issue none until it is - * resumed. - * - * @param vdo The vdo to suspend - * - * @return VDO_SUCCESS or an error - **/ -int suspend_vdo(struct vdo *vdo); - -#endif /* VDO_SUSPEND_H */ diff --git a/vdo/verify.c b/vdo/verify.c deleted file mode 100644 index 22ecd45b..00000000 --- a/vdo/verify.c +++ /dev/null @@ -1,152 +0,0 @@ -/* - * Copyright Red Hat - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/sulfur/src/c++/vdo/kernel/verify.c#13 $ - */ - -#include "logger.h" -#include "permassert.h" - -#include "dataKVIO.h" -#include - -/** - * Compare blocks of memory for equality. - * - * This assumes the blocks are likely to be large; it's not well - * optimized for comparing just a few bytes. This is desirable - * because the Linux kernel memcmp() routine on x86 is not well - * optimized for large blocks, and the performance penalty turns out - * to be significant if you're doing lots of 4KB comparisons. - * - * @param pointer_argument1 first data block - * @param pointer_argument2 second data block - * @param length length of the data block - * - * @return true iff the two blocks are equal - **/ -static bool __must_check -memory_equal(void *pointer_argument1, void *pointer_argument2, size_t length) -{ - byte *pointer1 = pointer_argument1; - byte *pointer2 = pointer_argument2; - - while (length >= sizeof(uint64_t)) { - /* - * get_unaligned is just for paranoia. (1) On x86_64 it is - * treated the same as an aligned access. (2) In this use case, - * one or both of the inputs will almost(?) always be aligned. - */ - if (get_unaligned((u64 *) pointer1) != - get_unaligned((u64 *) pointer2)) { - return false; - } - pointer1 += sizeof(uint64_t); - pointer2 += sizeof(uint64_t); - length -= sizeof(uint64_t); - } - while (length > 0) { - if (*pointer1 != *pointer2) { - return false; - } - pointer1++; - pointer2++; - length--; - } - return true; -} - -/** - * Verify the deduplication advice from the UDS index, and invoke a - * callback once the answer is available. - * - * After we've compared the stored data with the data to be written, - * or after we've failed to be able to do so, the stored VIO callback - * is queued to be run in the main (kvdoReqQ) thread. - * - * If the advice turns out to be stale and the deduplication session - * is still active, submit a correction. 
(Currently the correction - * must be sent before the callback can be invoked, if the dedupe - * session is still live.) - * - * @param item The workitem from the queue - **/ -static void verify_duplication_work(struct vdo_work_item *item) -{ - struct data_vio *data_vio = work_item_as_data_vio(item); - - if (likely(memory_equal(data_vio->data_block, - data_vio->read_block.data, - VDO_BLOCK_SIZE))) { - // Leave data_vio->is_duplicate set to true. - } else { - data_vio->is_duplicate = false; - } - - enqueue_data_vio_callback(data_vio); -} - -/** - * Verify the deduplication advice from the UDS index, and invoke a - * callback once the answer is available. - * - * @param completion The data_vio that we are looking to dedupe. - **/ -static void verify_read_block_callback(struct vdo_completion *completion) -{ - struct data_vio *data_vio = as_data_vio(completion); - int err = data_vio->read_block.status; - - if (unlikely(err != 0)) { - uds_log_debug("%s: err %d", __func__, err); - data_vio->is_duplicate = false; - enqueue_data_vio_callback(data_vio); - return; - } - - launch_data_vio_on_cpu_queue(data_vio, - verify_duplication_work, - NULL, - CPU_Q_ACTION_COMPRESS_BLOCK); -} - -/**********************************************************************/ -void verify_data_vio_duplication(struct data_vio *data_vio) -{ - ASSERT_LOG_ONLY(data_vio->is_duplicate, - "advice to verify must be valid"); - ASSERT_LOG_ONLY(data_vio->duplicate.state != VDO_MAPPING_STATE_UNMAPPED, - "advice to verify must not be a discard"); - ASSERT_LOG_ONLY(data_vio->duplicate.pbn != VDO_ZERO_BLOCK, - "advice to verify must not point to the zero block"); - ASSERT_LOG_ONLY(!data_vio->is_zero_block, - "zeroed block should not have advice to verify"); - - vdo_read_block(data_vio, - data_vio->duplicate.pbn, - data_vio->duplicate.state, - BIO_Q_ACTION_VERIFY, - verify_read_block_callback); -} - -/**********************************************************************/ -bool compare_data_vios(struct data_vio *first, struct data_vio *second) -{ - return memory_equal(first->data_block, second->data_block, - VDO_BLOCK_SIZE); -} diff --git a/vdo/vioPool.c b/vdo/vio-pool.c similarity index 74% rename from vdo/vioPool.c rename to vdo/vio-pool.c index 17a15736..8139b7b6 100644 --- a/vdo/vioPool.c +++ b/vdo/vio-pool.c @@ -1,28 +1,12 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * Copyright Red Hat - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/sulfur/src/c++/vdo/base/vioPool.c#14 $ */ -#include "vioPool.h" +#include "vio-pool.h" #include "logger.h" -#include "memoryAlloc.h" +#include "memory-alloc.h" #include "permassert.h" #include "constants.h" @@ -30,9 +14,9 @@ #include "vio.h" #include "types.h" -/** +/* * An vio_pool is a collection of preallocated vios. 
- **/ + */ struct vio_pool { /** The number of objects managed by the pool */ size_t size; @@ -52,7 +36,17 @@ struct vio_pool { struct vio_pool_entry entries[]; }; -/**********************************************************************/ +/** + * make_vio_pool() - Create a new vio pool. + * @vdo: The vdo. + * @pool_size: The number of vios in the pool. + * @thread_id: The ID of the thread using this pool. + * @constructor: The constructor for vios in the pool. + * @context: The context that each entry will have. + * @pool_ptr: The resulting pool. + * + * Return: A success or error code. + */ int make_vio_pool(struct vdo *vdo, size_t pool_size, thread_id_t thread_id, @@ -85,6 +79,7 @@ int make_vio_pool(struct vdo *vdo, ptr = pool->buffer; for (i = 0; i < pool_size; i++) { struct vio_pool_entry *entry = &pool->entries[i]; + entry->buffer = ptr; entry->context = context; result = constructor(vdo, entry, ptr, &entry->vio); @@ -103,7 +98,10 @@ int make_vio_pool(struct vdo *vdo, return VDO_SUCCESS; } -/**********************************************************************/ +/** + * free_vio_pool() - Destroy a vio pool. + * @pool: The pool to free. + */ void free_vio_pool(struct vio_pool *pool) { struct vio_pool_entry *entry; @@ -113,7 +111,7 @@ void free_vio_pool(struct vio_pool *pool) return; } - // Remove all available entries from the object pool. + /* Remove all available entries from the object pool. */ ASSERT_LOG_ONLY(!has_waiters(&pool->waiting), "VIO pool must not have any waiters when being freed"); ASSERT_LOG_ONLY((pool->busy_count == 0), @@ -128,26 +126,37 @@ void free_vio_pool(struct vio_pool *pool) free_vio(UDS_FORGET(entry->vio)); } - // Make sure every vio_pool_entry has been removed. + /* Make sure every vio_pool_entry has been removed. */ for (i = 0; i < pool->size; i++) { entry = &pool->entries[i]; ASSERT_LOG_ONLY(list_empty(&entry->available_entry), "VIO Pool entry still in use: VIO is in use for physical block %llu for operation %u", (unsigned long long) entry->vio->physical, - entry->vio->operation); + entry->vio->bio->bi_opf); } UDS_FREE(UDS_FORGET(pool->buffer)); UDS_FREE(pool); } -/**********************************************************************/ +/** + * is_vio_pool_busy() - Check whether an vio pool has outstanding entries. + * + * Return: true if the pool is busy. + */ bool is_vio_pool_busy(struct vio_pool *pool) { return (pool->busy_count != 0); } -/**********************************************************************/ +/** + * acquire_vio_from_pool() - Acquire a vio and buffer from the pool + * (asynchronous). + * @pool: The vio pool. + * @waiter: Object that is requesting a vio. + * + * Return: VDO_SUCCESS or an error. + */ int acquire_vio_from_pool(struct vio_pool *pool, struct waiter *waiter) { struct list_head *entry; @@ -166,7 +175,11 @@ int acquire_vio_from_pool(struct vio_pool *pool, struct waiter *waiter) return VDO_SUCCESS; } -/**********************************************************************/ +/** + * return_vio_to_pool() - Return a vio and its buffer to the pool. + * @pool: The vio pool. + * @entry: A vio pool entry. 
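+ *
+ * Context: must be called on the thread that owns the pool; the function
+ * asserts that it is running on pool->thread_id.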
+ */ void return_vio_to_pool(struct vio_pool *pool, struct vio_pool_entry *entry) { ASSERT_LOG_ONLY((pool->thread_id == vdo_get_callback_thread_id()), @@ -179,4 +192,4 @@ void return_vio_to_pool(struct vio_pool *pool, struct vio_pool_entry *entry) list_move_tail(&entry->available_entry, &pool->available); --pool->busy_count; - } +} diff --git a/vdo/vio-pool.h b/vdo/vio-pool.h new file mode 100644 index 00000000..a2c05b27 --- /dev/null +++ b/vdo/vio-pool.h @@ -0,0 +1,72 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright Red Hat + */ + +#ifndef VIO_POOL_H +#define VIO_POOL_H + +#include + +#include "permassert.h" + +#include "completion.h" +#include "types.h" +#include "wait-queue.h" + +/* + * A vio_pool is a collection of preallocated vios used to write arbitrary + * metadata blocks. + */ + +/* + * A vio_pool_entry is the pair of vio and buffer whether in use or not. + */ +struct vio_pool_entry { + struct list_head available_entry; + struct vio *vio; + void *buffer; + void *parent; + void *context; +}; + +/** + * typedef vio_constructor - A function which constructs a vio for a pool. + * @vdo: The vdo in which the vio will operate. + * @parent: The parent of the vio. + * @buffer: The data buffer for the vio. + * @vio_ptr: A pointer to hold the new vio. + */ +typedef int vio_constructor(struct vdo *vdo, + void *parent, + void *buffer, + struct vio **vio_ptr); + +int __must_check make_vio_pool(struct vdo *vdo, + size_t pool_size, + thread_id_t thread_id, + vio_constructor *constructor, + void *context, + struct vio_pool **pool_ptr); + +void free_vio_pool(struct vio_pool *pool); + +bool __must_check is_vio_pool_busy(struct vio_pool *pool); + +int acquire_vio_from_pool(struct vio_pool *pool, struct waiter *waiter); + +void return_vio_to_pool(struct vio_pool *pool, struct vio_pool_entry *entry); + +/** + * as_vio_pool_entry() - Convert a list entry to the vio_pool_entry + * that contains it. + * @entry: The list entry to convert. + * + * Return: The vio_pool_entry wrapping the list entry. + */ +static inline struct vio_pool_entry *as_vio_pool_entry(struct list_head *entry) +{ + return list_entry(entry, struct vio_pool_entry, available_entry); +} + +#endif /* VIO_POOL_H */ diff --git a/vdo/vio-read.c b/vdo/vio-read.c new file mode 100644 index 00000000..9f275eb6 --- /dev/null +++ b/vdo/vio-read.c @@ -0,0 +1,308 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright Red Hat + */ + +#include "vio-read.h" + +#include +#include + +#include "logger.h" + +#include "bio.h" +#include "block-map.h" +#include "data-vio.h" +#include "io-submitter.h" +#include "kernel-types.h" +#include "vdo.h" +#include "vio-write.h" + +/** + * DOC: Bio flags. + * + * For certain flags set on user bios, if the user bio has not yet been + * acknowledged, setting those flags on our own bio(s) for that request may + * help underlying layers better fulfill the user bio's needs. This constant + * contains the aggregate of those flags; VDO strips all the other flags, as + * they convey incorrect information. + * + * These flags are always irrelevant if we have already finished the user bio + * as they are only hints on IO importance. If VDO has finished the user bio, + * any remaining IO done doesn't care how important finishing the finished bio + * was. 
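+ *
+ * For example, a not-yet-acknowledged user read carrying REQ_SYNC and
+ * REQ_BACKGROUND results in VDO reads that keep REQ_SYNC (it is part of
+ * PASSTHROUGH_FLAGS below) but drop REQ_BACKGROUND.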
+ * + * Note that bio.c contains the complete list of flags we believe may be set; + * the following list explains the action taken with each of those flags VDO + * could receive: + * + * * REQ_SYNC: Passed down if the user bio is not yet completed, since it + * indicates the user bio completion is required for further work to be + * done by the issuer. + * * REQ_META: Passed down if the user bio is not yet completed, since it may + * mean the lower layer treats it as more urgent, similar to REQ_SYNC. + * * REQ_PRIO: Passed down if the user bio is not yet completed, since it + * indicates the user bio is important. + * * REQ_NOMERGE: Set only if the incoming bio was split; irrelevant to VDO IO. + * * REQ_IDLE: Set if the incoming bio had more IO quickly following; VDO's IO + * pattern doesn't match incoming IO, so this flag is incorrect for it. + * * REQ_FUA: Handled separately, and irrelevant to VDO IO otherwise. + * * REQ_RAHEAD: Passed down, as, for reads, it indicates trivial importance. + * * REQ_BACKGROUND: Not passed down, as VIOs are a limited resource and VDO + * needs them recycled ASAP to service heavy load, which is the only place + * where REQ_BACKGROUND might aid in load prioritization. + */ +static unsigned int PASSTHROUGH_FLAGS = + (REQ_PRIO | REQ_META | REQ_SYNC | REQ_RAHEAD); + +static void continue_partial_write(struct vdo_completion *completion) +{ + struct data_vio *data_vio = as_data_vio(completion); + + assert_data_vio_in_logical_zone(data_vio); + + launch_write_data_vio(data_vio); +} + +/** + * modify_for_partial_write() - Do the modify-write part of a + * read-modify-write cycle. + * @completion: The data_vio which has just finished its read. + * + * This callback is registered in read_block(). + */ +static void modify_for_partial_write(struct vdo_completion *completion) +{ + struct data_vio *data_vio = as_data_vio(completion); + struct bio *bio = data_vio->user_bio; + + assert_data_vio_on_cpu_thread(data_vio); + + if (bio_op(bio) == REQ_OP_DISCARD) { + memset(data_vio->data_block + data_vio->offset, + '\0', + min_t(uint32_t, + data_vio->remaining_discard, + VDO_BLOCK_SIZE - data_vio->offset)); + } else { + vdo_bio_copy_data_in(bio, + data_vio->data_block + data_vio->offset); + } + + data_vio->is_zero_block = is_zero_block(data_vio->data_block); + data_vio->io_operation = + (DATA_VIO_WRITE | + (data_vio->io_operation & ~DATA_VIO_READ_WRITE_MASK)); + completion->error_handler = NULL; + launch_data_vio_logical_callback(data_vio, continue_partial_write); +} + +static void complete_read(struct vdo_completion *completion) +{ + struct data_vio *data_vio = as_data_vio(completion); + bool compressed = vdo_is_state_compressed(data_vio->mapped.state); + + assert_data_vio_on_cpu_thread(data_vio); + + if (compressed) { + int result = uncompress_data_vio(data_vio, + data_vio->mapped.state, + data_vio->data_block); + + if (result != VDO_SUCCESS) { + finish_data_vio(data_vio, result); + return; + } + } + + if (is_read_modify_write_data_vio(data_vio)) { + modify_for_partial_write(completion); + return; + } + + if (compressed || data_vio->is_partial) { + vdo_bio_copy_data_out(data_vio->user_bio, + data_vio->data_block + data_vio->offset); + } + + acknowledge_data_vio(data_vio); + complete_data_vio(completion); +} + +static void read_endio(struct bio *bio) +{ + struct data_vio *data_vio = vio_as_data_vio(bio->bi_private); + + vdo_count_completed_bios(bio); + launch_data_vio_cpu_callback(data_vio, + complete_read, + CPU_Q_COMPLETE_READ_PRIORITY); +} + +static void 
complete_zero_read(struct vdo_completion *completion) +{ + struct data_vio *data_vio = as_data_vio(completion); + + assert_data_vio_on_cpu_thread(data_vio); + + if (data_vio->is_partial) { + memset(data_vio->data_block, 0, VDO_BLOCK_SIZE); + if (!is_read_data_vio(data_vio)) { + modify_for_partial_write(completion); + return; + } + } else { + zero_fill_bio(data_vio->user_bio); + } + + complete_read(completion); +} + +/** + * read_block() - Read a block asynchronously. + * @completion: The data_vio to read. + * + * This is the callback registered in read_block_mapping(). + */ +static void read_block(struct vdo_completion *completion) +{ + struct data_vio *data_vio = as_data_vio(completion); + struct vio *vio = as_vio(completion); + int result = VDO_SUCCESS; + + if (completion->result != VDO_SUCCESS) { + complete_data_vio(completion); + return; + } + + completion->error_handler = complete_data_vio; + + if (data_vio->mapped.pbn == VDO_ZERO_BLOCK) { + launch_data_vio_cpu_callback(data_vio, + complete_zero_read, + CPU_Q_COMPLETE_VIO_PRIORITY); + return; + } + + data_vio->last_async_operation = VIO_ASYNC_OP_READ_DATA_VIO; + completion->error_handler = complete_data_vio; + if (vdo_is_state_compressed(data_vio->mapped.state)) { + result = prepare_data_vio_for_io(data_vio, + (char *) data_vio->compression.block, + read_endio, + REQ_OP_READ, + data_vio->mapped.pbn); + } else { + int opf = ((data_vio->user_bio->bi_opf & PASSTHROUGH_FLAGS) | + REQ_OP_READ); + + if (is_read_modify_write_data_vio(data_vio) || + (data_vio->is_partial)) { + result = prepare_data_vio_for_io(data_vio, + data_vio->data_block, + read_endio, + opf, + data_vio->mapped.pbn); + } else { + /* + * A full 4k read. Use the incoming bio to avoid having + * to copy the data + */ + set_vio_physical(vio, data_vio->mapped.pbn); + +#ifdef RHEL_RELEASE_CODE +#define USE_ALTERNATE (RHEL_RELEASE_CODE < RHEL_RELEASE_VERSION(9,1)) +#else +#define USE_ALTERNATE (LINUX_VERSION_CODE < KERNEL_VERSION(5,18,0)) +#endif + +#if USE_ALTERNATE + bio_reset(vio->bio); + __bio_clone_fast(vio->bio, data_vio->user_bio); +#else + bio_reset(vio->bio, vio->bio->bi_bdev, opf); + bio_init_clone(data_vio->user_bio->bi_bdev, + vio->bio, + data_vio->user_bio, + GFP_KERNEL); +#endif + + /* Copy over the original bio iovec and opflags. */ + vdo_set_bio_properties(vio->bio, + vio, + read_endio, + opf, + data_vio->mapped.pbn); + } + } + + if (result != VDO_SUCCESS) { + continue_data_vio(data_vio, result); + return; + } + + submit_data_vio_io(data_vio); +} + +/** + * read_block_mapping() - Read the data_vio's mapping from the block map. + * @completion: The data_vio to be read. + * + * This callback is registered in launch_read_data_vio(). + */ +static void read_block_mapping(struct vdo_completion *completion) +{ + struct data_vio *data_vio = as_data_vio(completion); + + if (completion->result != VDO_SUCCESS) { + complete_data_vio(completion); + return; + } + + assert_data_vio_in_logical_zone(data_vio); + set_data_vio_logical_callback(data_vio, read_block); + data_vio->last_async_operation = VIO_ASYNC_OP_GET_MAPPED_BLOCK_FOR_READ; + vdo_get_mapped_block(data_vio); +} + +/** + * launch_read_data_vio() - Start the asynchronous processing of a read vio. + * @data_vio: The data_vio doing the read. + * + * Starts the asynchronous processing of the data_vio for a read or + * read-modify-write request which has acquired a lock on its logical block. + * The first step is to perform a block map lookup. 
+ */ +void launch_read_data_vio(struct data_vio *data_vio) +{ + assert_data_vio_in_logical_zone(data_vio); + + /* Go find the block map slot for the LBN mapping. */ + vdo_find_block_map_slot(data_vio, + read_block_mapping, + data_vio->logical.zone->thread_id); +} + +/** + * release_logical_lock() - Release the logical block lock which a read + * data_vio obtained now that it is done. + * @completion: The data_vio. + */ +static void release_logical_lock(struct vdo_completion *completion) +{ + struct data_vio *data_vio = as_data_vio(completion); + + assert_data_vio_in_logical_zone(data_vio); + vdo_release_logical_block_lock(data_vio); + release_data_vio(data_vio); +} + +/** + * cleanup_read_data_vio() - Clean up a data_vio which has finished processing + * a read. + * @data_vio: The data_vio to clean up. + */ +void cleanup_read_data_vio(struct data_vio *data_vio) +{ + launch_data_vio_logical_callback(data_vio, release_logical_lock); +} diff --git a/vdo/vio-read.h b/vdo/vio-read.h new file mode 100644 index 00000000..04337290 --- /dev/null +++ b/vdo/vio-read.h @@ -0,0 +1,15 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright Red Hat + */ + +#ifndef VIO_READ_H +#define VIO_READ_H + +#include "kernel-types.h" + +void launch_read_data_vio(struct data_vio *data_vio); + +void cleanup_read_data_vio(struct data_vio *data_vio); + +#endif /* VIO_READ_H */ diff --git a/vdo/vioWrite.c b/vdo/vio-write.c similarity index 50% rename from vdo/vioWrite.c rename to vdo/vio-write.c index 17b7c653..20c8e4f4 100644 --- a/vdo/vioWrite.c +++ b/vdo/vio-write.c @@ -1,22 +1,6 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * Copyright Red Hat - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. 
- * - * $Id: //eng/vdo-releases/sulfur/src/c++/vdo/base/vioWrite.c#49 $ */ /* @@ -46,12 +30,12 @@ * prepare_for_dedupe() * hashData() * resolve_hash_zone() - * acquire_vdo_hash_lock() + * vdo_acquire_hash_lock() * attemptDedupe() (query UDS) * if (is_duplicate) { * verifyAdvice() (read verify) * if (is_duplicate and canAddReference) { - * share_block() + * launch_deduplicate_data_vio() * addJournalEntryForDedupe() * increment_for_dedupe() * read_old_block_mapping_for_dedupe() @@ -68,7 +52,7 @@ * if (not canAddReference) { * vdo_update_dedupe_index() * } - * # vio_compress_data() + * # launch_compress_data_vio() * if (compressing and not mooted and has no waiters) { * compress_data_vio() * pack_compressed_data() @@ -105,29 +89,35 @@ * } */ -#include "vioWrite.h" +#include "vio-write.h" + +#include +#include #include "logger.h" #include "permassert.h" -#include "allocatingVIO.h" -#include "blockMap.h" -#include "compressionState.h" -#include "dataVIO.h" -#include "hashLock.h" -#include "recoveryJournal.h" -#include "referenceOperation.h" +#include "bio.h" +#include "block-map.h" +#include "compression-state.h" +#include "data-vio.h" +#include "hash-lock.h" +#include "io-submitter.h" +#include "kernel-types.h" +#include "recovery-journal.h" +#include "reference-operation.h" #include "slab.h" -#include "slabDepot.h" -#include "slabJournal.h" -#include "vdoInternal.h" -#include "vioRead.h" +#include "slab-depot.h" +#include "slab-journal.h" +#include "types.h" +#include "vdo.h" +#include "vio-read.h" -/** +/* * The steps taken cleaning up a VIO, in the order they are performed. - **/ + */ enum data_vio_cleanup_stage { - VIO_CLEANUP_START = 0, + VIO_CLEANUP_START, VIO_RELEASE_ALLOCATED = VIO_CLEANUP_START, VIO_RELEASE_RECOVERY_LOCKS, VIO_RELEASE_HASH_LOCK, @@ -135,83 +125,113 @@ enum data_vio_cleanup_stage { VIO_CLEANUP_DONE }; -/** +/* * Actions to take on error used by abort_on_error(). - **/ + */ enum read_only_action { NOT_READ_ONLY, READ_ONLY, }; -// Forward declarations required because of circular function references. +/* Forward declarations required because of circular function references. */ static void perform_cleanup_stage(struct data_vio *data_vio, enum data_vio_cleanup_stage stage); static void write_block(struct data_vio *data_vio); /** - * Release the PBN lock and/or the reference on the allocated block at the - * end of processing a data_vio. - * - * @param completion The data_vio - **/ + * release_allocated_lock() - Release the PBN lock and/or the reference on the + * allocated block at the end of processing a + * data_vio. + * @completion: The data_vio. + */ static void release_allocated_lock(struct vdo_completion *completion) { struct data_vio *data_vio = as_data_vio(completion); + assert_data_vio_in_allocated_zone(data_vio); - vio_release_allocation_lock(data_vio_as_allocating_vio(data_vio)); + release_data_vio_allocation_lock(data_vio, false); perform_cleanup_stage(data_vio, VIO_RELEASE_RECOVERY_LOCKS); } /** - * Release the logical block lock and flush generation lock at the end of - * processing a data_vio. - * - * @param completion The data_vio - **/ + * release_logical_lock() - Release the logical block lock and flush + * generation lock at the end of processing a + * data_vio. + * @completion: The data_vio. 
+ */ static void release_logical_lock(struct vdo_completion *completion) { struct data_vio *data_vio = as_data_vio(completion); + assert_data_vio_in_logical_zone(data_vio); vdo_release_logical_block_lock(data_vio); - release_vdo_flush_generation_lock(data_vio); + vdo_release_flush_generation_lock(data_vio); perform_cleanup_stage(data_vio, VIO_CLEANUP_DONE); } /** - * Release the hash lock at the end of processing a data_vio. - * - * @param completion The data_vio - **/ + * clean_hash_lock() - Release the hash lock at the end of processing a + * data_vio. + * @completion: The data_vio. + */ static void clean_hash_lock(struct vdo_completion *completion) { struct data_vio *data_vio = as_data_vio(completion); + assert_data_vio_in_hash_zone(data_vio); - release_vdo_hash_lock(data_vio); + vdo_release_hash_lock(data_vio); perform_cleanup_stage(data_vio, VIO_RELEASE_LOGICAL); } /** - * Make some assertions about a data_vio which has finished cleaning up - * and do its final callback. + * finish_cleanup() - Make some assertions about a data_vio which has finished + * cleaning up. + * @data_vio: The data_vio which has finished cleaning up. * - * @param data_vio The data_vio which has finished cleaning up - **/ + * If it is part of a multi-block discard, starts on the next block, + * otherwise, returns it to the pool. + */ static void finish_cleanup(struct data_vio *data_vio) { - ASSERT_LOG_ONLY(data_vio_as_allocating_vio(data_vio)->allocation_lock == - NULL, + struct vdo_completion *completion = data_vio_as_completion(data_vio); + enum data_vio_operation operation; + + ASSERT_LOG_ONLY(data_vio->allocation.lock == NULL, "complete data_vio has no allocation lock"); ASSERT_LOG_ONLY(data_vio->hash_lock == NULL, "complete data_vio has no hash lock"); - vio_done_callback(data_vio_as_completion(data_vio)); + if ((data_vio->remaining_discard <= VDO_BLOCK_SIZE) || + (completion->result != VDO_SUCCESS)) { + release_data_vio(data_vio); + return; + } + + data_vio->remaining_discard -= min_t(uint32_t, + data_vio->remaining_discard, + VDO_BLOCK_SIZE - data_vio->offset); + data_vio->is_partial = (data_vio->remaining_discard < VDO_BLOCK_SIZE); + data_vio->offset = 0; + + if (data_vio->is_partial) { + operation = DATA_VIO_READ_MODIFY_WRITE; + } else { + operation = DATA_VIO_WRITE; + } + + if (data_vio->user_bio->bi_opf & REQ_FUA) { + operation |= DATA_VIO_FUA; + } + + completion->requeue = true; + launch_data_vio(data_vio, data_vio->logical.lbn + 1, operation); } /** - * Perform the next step in the process of cleaning up a data_vio. - * - * @param data_vio The data_vio to clean up - * @param stage The cleanup stage to perform - **/ + * perform_cleanup_stage() - Perform the next step in the process of cleaning + * up a data_vio. + * @data_vio: The data_vio to clean up. + * @stage: The cleanup stage to perform. 
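+ *
+ * Stages proceed in the order defined by enum data_vio_cleanup_stage: the
+ * allocated-block lock is released first, then any recovery journal locks,
+ * then the hash lock, and finally the logical block and flush generation
+ * locks, after which finish_cleanup() runs.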
+ */ static void perform_cleanup_stage(struct data_vio *data_vio, enum data_vio_cleanup_stage stage) { @@ -226,7 +246,7 @@ static void perform_cleanup_stage(struct data_vio *data_vio, case VIO_RELEASE_RECOVERY_LOCKS: if ((data_vio->recovery_sequence_number > 0) && - !vdo_is_or_will_be_read_only(data_vio_as_vio(data_vio)->vdo->read_only_notifier) && + !vdo_is_or_will_be_read_only(vdo_from_data_vio(data_vio)->read_only_notifier) && (data_vio_as_completion(data_vio)->result != VDO_READ_ONLY)) { uds_log_warning("VDO not read-only when cleaning data_vio with RJ lock"); } @@ -241,12 +261,9 @@ static void perform_cleanup_stage(struct data_vio *data_vio, fallthrough; case VIO_RELEASE_LOGICAL: - if (!is_compressed_write_data_vio(data_vio)) { - launch_data_vio_logical_callback(data_vio, - release_logical_lock); - return; - } - fallthrough; + launch_data_vio_logical_callback(data_vio, + release_logical_lock); + return; default: finish_cleanup(data_vio); @@ -254,30 +271,33 @@ static void perform_cleanup_stage(struct data_vio *data_vio, } /** - * Return a data_vio that encountered an error to its hash lock so it can - * update the hash lock state accordingly. This continuation is registered in - * abort_on_error(), and must be called in the hash zone of the data_vio. + * finish_write_data_vio_with_error() - Return a data_vio that encountered an + * error to its hash lock so it can + * update the hash lock state + * accordingly. + * @completion: The completion of the data_vio to return to its hash lock. * - * @param completion The completion of the data_vio to return to its hash lock - **/ + * This continuation is registered in abort_on_error(), and must be called in + * the hash zone of the data_vio. + */ static void finish_write_data_vio_with_error(struct vdo_completion *completion) { struct data_vio *data_vio = as_data_vio(completion); + assert_data_vio_in_hash_zone(data_vio); - continue_vdo_hash_lock_on_error(data_vio); + vdo_continue_hash_lock_on_error(data_vio); } /** - * Check whether a result is an error, and if so abort the data_vio associated - * with the error. - * - * @param result The result to check - * @param data_vio The data_vio - * @param action The conditions under which the VDO should be put into - * read-only mode if the result is an error + * abort_on_error() - Check whether a result is an error, and if so abort the + * data_vio associated with the error. + * @result: The result to check. + * @data_vio: The data_vio. + * @action: The conditions under which the VDO should be put into read-only + * mode if the result is an error. * - * @return true if the result is an error - **/ + * Return: true if the result is an error. + */ static bool abort_on_error(int result, struct data_vio *data_vio, enum read_only_action action) @@ -288,7 +308,7 @@ static bool abort_on_error(int result, if ((result == VDO_READ_ONLY) || (action == READ_ONLY)) { struct read_only_notifier *notifier = - data_vio_as_vio(data_vio)->vdo->read_only_notifier; + vdo_from_data_vio(data_vio)->read_only_notifier; if (!vdo_is_read_only(notifier)) { if (result != VDO_READ_ONLY) { uds_log_error_strerror(result, @@ -314,54 +334,60 @@ static bool abort_on_error(int result, } /** - * Return a data_vio that finished writing, compressing, or deduplicating to + * finish_write_data_vio() - Return a finished data_vio to its hash lock. + * @completion: The completion of the data_vio to return to its hash lock. 
+ * + * Returns a data_vio that finished writing, compressing, or deduplicating to * its hash lock so it can share the result with any data_vios waiting in the * hash lock, or update UDS, or simply release its share of the lock. This * continuation is registered in update_block_map_for_write(), * update_block_map_for_dedupe(), and abort_deduplication(), and must be * called in the hash zone of the data_vio. - * - * @param completion The completion of the data_vio to return to its hash lock - **/ + */ static void finish_write_data_vio(struct vdo_completion *completion) { struct data_vio *data_vio = as_data_vio(completion); + assert_data_vio_in_hash_zone(data_vio); if (abort_on_error(completion->result, data_vio, READ_ONLY)) { return; } - continue_vdo_hash_lock(data_vio); + vdo_continue_hash_lock(data_vio); } /** - * Abort the data optimization process. - * - * @param data_vio The data_vio which does not deduplicate or compress - **/ + * abort_deduplication() - Abort the data optimization process. + * @data_vio: The data_vio which does not deduplicate or compress. + */ static void abort_deduplication(struct data_vio *data_vio) { if (!data_vio_has_allocation(data_vio)) { - // There was no space to write this block and we failed to - // deduplicate or compress it. + /* + * There was no space to write this block and we failed to + * deduplicate or compress it. + */ finish_data_vio(data_vio, VDO_NO_SPACE); return; } - // We failed to deduplicate or compress so now we need to actually - // write the data. + /* + * We failed to deduplicate or compress so now we need to actually + * write the data. + */ write_block(data_vio); } /** - * Update the block map now that we've added an entry in the recovery journal - * for a block we have just shared. This is the callback registered in - * decrement_for_dedupe(). + * update_block_map_for_dedupe() - Update the block map now that we've added + * an entry in the recovery journal for a block we have just shared. + * @completion: The completion of the write in progress. * - * @param completion The completion of the write in progress - **/ + * This is the callback registered in decrement_for_dedupe(). + */ static void update_block_map_for_dedupe(struct vdo_completion *completion) { struct data_vio *data_vio = as_data_vio(completion); + assert_data_vio_in_logical_zone(data_vio); if (abort_on_error(completion->result, data_vio, READ_ONLY)) { return; @@ -378,46 +404,43 @@ static void update_block_map_for_dedupe(struct vdo_completion *completion) } /** - * Make a recovery journal increment. - * - * @param data_vio The data_vio - * @param lock The pbn_lock on the block being incremented - **/ + * journal_increment() - Make a recovery journal increment. + * @data_vio: The data_vio. + * @lock: The pbn_lock on the block being incremented. + */ static void journal_increment(struct data_vio *data_vio, struct pbn_lock *lock) { - set_up_vdo_reference_operation_with_lock(DATA_INCREMENT, + vdo_set_up_reference_operation_with_lock(VDO_JOURNAL_DATA_INCREMENT, data_vio->new_mapped.pbn, data_vio->new_mapped.state, lock, &data_vio->operation); - add_vdo_recovery_journal_entry(get_vdo_from_data_vio(data_vio)->recovery_journal, + vdo_add_recovery_journal_entry(vdo_from_data_vio(data_vio)->recovery_journal, data_vio); } /** - * Make a recovery journal decrement entry. - * - * @param data_vio The data_vio - **/ + * journal_decrement() - Make a recovery journal decrement entry. + * @data_vio: The data_vio. 
+ */ static void journal_decrement(struct data_vio *data_vio) { - set_up_vdo_reference_operation_with_zone(DATA_DECREMENT, + vdo_set_up_reference_operation_with_zone(VDO_JOURNAL_DATA_DECREMENT, data_vio->mapped.pbn, data_vio->mapped.state, data_vio->mapped.zone, &data_vio->operation); - add_vdo_recovery_journal_entry(get_vdo_from_data_vio(data_vio)->recovery_journal, + vdo_add_recovery_journal_entry(vdo_from_data_vio(data_vio)->recovery_journal, data_vio); } /** - * Make a reference count change. - * - * @param data_vio The data_vio - **/ + * update_reference_count() - Make a reference count change. + * @data_vio: The data_vio. + */ static void update_reference_count(struct data_vio *data_vio) { - struct slab_depot *depot = get_vdo_from_data_vio(data_vio)->depot; + struct slab_depot *depot = vdo_from_data_vio(data_vio)->depot; physical_block_number_t pbn = data_vio->operation.pbn; int result = ASSERT(vdo_is_physical_data_block(depot, pbn), @@ -428,50 +451,55 @@ static void update_reference_count(struct data_vio *data_vio) return; } - add_vdo_slab_journal_entry(get_vdo_slab_journal(depot, pbn), data_vio); + vdo_add_slab_journal_entry(vdo_get_slab_journal(depot, pbn), data_vio); } /** - * Do the decref after a successful dedupe or compression. This is the callback - * registered by journal_unmapping_for_dedupe(). + * decrement_for_dedupe() - Do the decref after a successful dedupe or + * compression. + * @completion: The completion of the write in progress. * - * @param completion The completion of the write in progress - **/ + * This is the callback registered by journal_unmapping_for_dedupe(). + */ static void decrement_for_dedupe(struct vdo_completion *completion) { struct data_vio *data_vio = as_data_vio(completion); - struct allocating_vio *allocating_vio = - data_vio_as_allocating_vio(data_vio); assert_data_vio_in_mapped_zone(data_vio); if (abort_on_error(completion->result, data_vio, READ_ONLY)) { return; } - if (allocating_vio->allocation == data_vio->mapped.pbn) { + if (data_vio->allocation.pbn == data_vio->mapped.pbn) { /* * If we are about to release the reference on the allocated * block, we must release the PBN lock on it first so that the * allocator will not allocate a write-locked block. + * + * FIXME: now that we don't have sync mode, can this ever + * happen? */ - vio_release_allocation_lock(allocating_vio); + release_data_vio_allocation_lock(data_vio, false); } set_data_vio_logical_callback(data_vio, update_block_map_for_dedupe); - data_vio->last_async_operation = VIO_ASYNC_OP_JOURNAL_DECREMENT_FOR_DEDUPE; + data_vio->last_async_operation = + VIO_ASYNC_OP_JOURNAL_DECREMENT_FOR_DEDUPE; update_reference_count(data_vio); } /** - * Write the appropriate journal entry for removing the mapping of logical to - * mapped, for dedupe or compression. This is the callback registered in - * read_old_block_mapping_for_dedupe(). + * journal_unmapping_for_dedupe() - Write the appropriate journal entry for + * removing the mapping of logical to mapped, + * for dedupe or compression. + * @completion: The completion of the write in progress. * - * @param completion The completion of the write in progress - **/ + * This is the callback registered in read_old_block_mapping_for_dedupe(). 
+ */ static void journal_unmapping_for_dedupe(struct vdo_completion *completion) { struct data_vio *data_vio = as_data_vio(completion); + assert_data_vio_in_journal_zone(data_vio); if (abort_on_error(completion->result, data_vio, READ_ONLY)) { return; @@ -489,16 +517,18 @@ static void journal_unmapping_for_dedupe(struct vdo_completion *completion) } /** - * Get the previous PBN mapped to this LBN from the block map, so as to make + * read_old_block_mapping_for_dedupe() - Get the prevoius PBN/LBN mapping. + * @completion: The completion of the write in progress. + * + * Gets the previous PBN mapped to this LBN from the block map, so as to make * an appropriate journal entry referencing the removal of this LBN->PBN * mapping, for dedupe or compression. This callback is registered in * increment_for_dedupe(). - * - * @param completion The completion of the write in progress - **/ + */ static void read_old_block_mapping_for_dedupe(struct vdo_completion *completion) { struct data_vio *data_vio = as_data_vio(completion); + assert_data_vio_in_logical_zone(data_vio); if (abort_on_error(completion->result, data_vio, READ_ONLY)) { return; @@ -510,14 +540,16 @@ static void read_old_block_mapping_for_dedupe(struct vdo_completion *completion) } /** - * Do the incref after compression. This is the callback registered by - * add_recovery_journal_entry_for_compression(). + * increment_for_compression() - Do the incref after compression. + * @completion: The completion of the write in progress. * - * @param completion The completion of the write in progress - **/ + * This is the callback registered by + * add_recovery_journal_entry_for_compression(). + */ static void increment_for_compression(struct vdo_completion *completion) { struct data_vio *data_vio = as_data_vio(completion); + assert_data_vio_in_new_mapped_zone(data_vio); if (abort_on_error(completion->result, data_vio, READ_ONLY)) { return; @@ -535,58 +567,97 @@ static void increment_for_compression(struct vdo_completion *completion) } /** - * Add a recovery journal entry for the increment resulting from compression. + * add_recovery_journal_entry_for_compression() - Add a recovery journal entry + * for the increment resulting + * from compression. + * @completion: The data_vio which has been compressed. * - * @param completion The data_vio which has been compressed - **/ + * This callback is registered in continue_write_after_compression(). + */ static void add_recovery_journal_entry_for_compression(struct vdo_completion *completion) { struct data_vio *data_vio = as_data_vio(completion); + assert_data_vio_in_journal_zone(data_vio); - if (abort_on_error(completion->result, data_vio, READ_ONLY)) { - return; - } + set_data_vio_new_mapped_zone_callback(data_vio, + increment_for_compression); + data_vio->last_async_operation = + VIO_ASYNC_OP_JOURNAL_MAPPING_FOR_COMPRESSION; + journal_increment(data_vio, vdo_get_duplicate_lock(data_vio)); +} + +/** + * continue_write_after_compression() - Continue a write after the data_vio + * has been released from the packer. + * @data_vio: The data_vio which has returned from the packer. + * + * The write may or may not have been written as part of a compressed write. 
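+ *
+ * If the block was not written as part of a compressed write (its new_mapped
+ * state is not a compressed state), the optimization path is abandoned via
+ * abort_deduplication(), which writes the block normally or finishes the
+ * data_vio with VDO_NO_SPACE if no data block was allocated.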
+ */ +void continue_write_after_compression(struct data_vio *data_vio) +{ if (!vdo_is_state_compressed(data_vio->new_mapped.state)) { abort_deduplication(data_vio); return; } - set_data_vio_new_mapped_zone_callback(data_vio, - increment_for_compression); - data_vio->last_async_operation = - VIO_ASYNC_OP_JOURNAL_MAPPING_FOR_COMPRESSION; - journal_increment(data_vio, get_vdo_duplicate_lock(data_vio)); + launch_data_vio_journal_callback(data_vio, + add_recovery_journal_entry_for_compression); } /** - * Attempt to pack the compressed data_vio into a block. This is the callback - * registered in vio_compress_data(). + * pack_compressed_data() - Attempt to pack the compressed data_vio into a + * block. + * @completion: The completion of a compressed data_vio. * - * @param completion The completion of a compressed data_vio - **/ + * This is the callback registered in launch_compress_data_vio(). + */ static void pack_compressed_data(struct vdo_completion *completion) { struct data_vio *data_vio = as_data_vio(completion); + assert_data_vio_in_packer_zone(data_vio); - // XXX this is a callback, so there should probably be an error check - // here even if we think compression can't currently return one. + /* + * XXX this is a callback, so there should probably be an error check + * here even if we think compression can't currently return one. + */ if (!may_pack_data_vio(data_vio)) { abort_deduplication(data_vio); return; } - set_data_vio_journal_callback(data_vio, - add_recovery_journal_entry_for_compression); data_vio->last_async_operation = VIO_ASYNC_OP_ATTEMPT_PACKING; vdo_attempt_packing(data_vio); } -/**********************************************************************/ -void vio_compress_data(struct data_vio *data_vio) +/** + * compress_data_vio_callback() - Do the actual work of compressing the data + * on a CPU queue. + * @completion: The completion of the write in progress. + * + * This callback is registered in launch_compress_data_vio(). + */ +static void compress_data_vio_callback(struct vdo_completion *completion) +{ + struct data_vio *data_vio = as_data_vio(completion); + + assert_data_vio_on_cpu_thread(data_vio); + compress_data_vio(data_vio); + launch_data_vio_packer_callback(data_vio, + pack_compressed_data); +} + +/** + * launch_compress_data_vio() - Continue a write by attempting to compress the + * data. + * @data_vio: The data_vio to be compressed. + * + * This is a re-entry point to vio_write used by hash locks. + */ +void launch_compress_data_vio(struct data_vio *data_vio) { ASSERT_LOG_ONLY(!data_vio->is_duplicate, "compressing a non-duplicate block"); @@ -596,28 +667,26 @@ void vio_compress_data(struct data_vio *data_vio) } data_vio->last_async_operation = VIO_ASYNC_OP_COMPRESS_DATA_VIO; - set_data_vio_packer_callback(data_vio, pack_compressed_data); - compress_data_vio(data_vio); + launch_data_vio_cpu_callback(data_vio, + compress_data_vio_callback, + CPU_Q_COMPRESS_BLOCK_PRIORITY); } /** - * Do the incref after deduplication. This is the callback registered by - * add_recovery_journal_entry_for_dedupe(). + * increment_for_dedupe() - Do the incref after deduplication. + * @completion: The completion of the write in progress. * - * @param completion The completion of the write in progress - **/ + * This is the callback registered by add_recovery_journal_entry_for_dedupe(). 
+ */ static void increment_for_dedupe(struct vdo_completion *completion) { struct data_vio *data_vio = as_data_vio(completion); + assert_data_vio_in_new_mapped_zone(data_vio); if (abort_on_error(completion->result, data_vio, READ_ONLY)) { return; } - ASSERT_LOG_ONLY(data_vio->is_duplicate, - "Impossible attempt to update reference counts for a block which was not a duplicate (logical block %llu)", - (unsigned long long) data_vio->logical.lbn); - set_data_vio_logical_callback(data_vio, read_old_block_mapping_for_dedupe); data_vio->last_async_operation = VIO_ASYNC_OP_JOURNAL_INCREMENT_FOR_DEDUPE; @@ -625,15 +694,18 @@ static void increment_for_dedupe(struct vdo_completion *completion) } /** - * Add a recovery journal entry for the increment resulting from deduplication. - * This callback is registered in share_block(). + * add_recovery_journal_entry_for_dedupe() - Add a recovery journal entry for + * the increment resulting from + * deduplication. + * @completion: The data_vio which has been deduplicated. * - * @param completion The data_vio which has been deduplicated - **/ + * This callback is registered in launch_deduplicate_data_vio(). + */ static void add_recovery_journal_entry_for_dedupe(struct vdo_completion *completion) { struct data_vio *data_vio = as_data_vio(completion); + assert_data_vio_in_journal_zone(data_vio); if (abort_on_error(completion->result, data_vio, READ_ONLY)) { return; @@ -641,26 +713,21 @@ add_recovery_journal_entry_for_dedupe(struct vdo_completion *completion) set_data_vio_new_mapped_zone_callback(data_vio, increment_for_dedupe); data_vio->last_async_operation = VIO_ASYNC_OP_JOURNAL_MAPPING_FOR_DEDUPE; - journal_increment(data_vio, get_vdo_duplicate_lock(data_vio)); + journal_increment(data_vio, vdo_get_duplicate_lock(data_vio)); } /** - * Share a block in the block map if it is a duplicate. + * launch_deduplicate_data_vio() - Continue a write by deduplicating a write + * data_vio against a verified existing block + * containing the data. + * @data_vio: The data_vio to be deduplicated. * - * @param completion The completion of the write in progress - **/ -void share_block(struct vdo_completion *completion) + * This is a re-entry point to vio_write used by hash locks. + */ +void launch_deduplicate_data_vio(struct data_vio *data_vio) { - struct data_vio *data_vio = as_data_vio(completion); - assert_data_vio_in_duplicate_zone(data_vio); - if (abort_on_error(completion->result, data_vio, READ_ONLY)) { - return; - } - - if (!data_vio->is_duplicate) { - vio_compress_data(data_vio); - return; - } + ASSERT_LOG_ONLY(data_vio->is_duplicate, + "data_vio must have a duplicate location"); data_vio->new_mapped = data_vio->duplicate; launch_data_vio_journal_callback(data_vio, @@ -668,113 +735,122 @@ void share_block(struct vdo_completion *completion) } /** - * Route the data_vio to the hash_zone responsible for the chunk name to - * acquire a hash lock on that name, or join with a existing hash lock managing - * concurrent dedupe for that name. This is the callback registered in - * resolve_hash_zone(). + * lock_hash_in_zone() - Route the data_vio to the hash_zone responsible for + * the chunk name to acquire a hash lock on that name, + * or join with a existing hash lock managing concurrent + * dedupe for that name. + * @completion: The data_vio to lock. * - * @param completion The data_vio to lock - **/ + * This is the callback registered in hash_data_vio(). 
+ */ static void lock_hash_in_zone(struct vdo_completion *completion) { struct data_vio *data_vio = as_data_vio(completion); int result; assert_data_vio_in_hash_zone(data_vio); - // Shouldn't have had any errors since all we did was switch threads. + /* Shouldn't have had any errors since all we did was switch threads. */ if (abort_on_error(completion->result, data_vio, READ_ONLY)) { return; } - result = acquire_vdo_hash_lock(data_vio); + result = vdo_acquire_hash_lock(data_vio); if (abort_on_error(result, data_vio, READ_ONLY)) { return; } if (data_vio->hash_lock == NULL) { - // It's extremely unlikely, but in the case of a hash collision, - // the data_vio will not obtain a reference to the lock and - // cannot deduplicate. - vio_compress_data(data_vio); + /* + * It's extremely unlikely, but in the case of a hash + * collision, the data_vio will not obtain a reference to the + * lock and cannot deduplicate. + */ + launch_compress_data_vio(data_vio); return; } - enter_vdo_hash_lock(data_vio); + vdo_enter_hash_lock(data_vio); } /** - * Set the hash zone (and flag the chunk name as set) while still on the - * thread that just hashed the data to set the chunk name. This is the - * callback registered by prepare_for_dedupe(). - * - * @param completion The data_vio whose chunk name was just generated, as a - * completion - **/ -static void resolve_hash_zone(struct vdo_completion *completion) + * hash_data_vio() - Hash the data in a data_vio and set the hash zone (which + * also flags the chunk name as set). + * @completion: The data_vio to hash. + + * This callback is registered in prepare_for_dedupe(). + */ +static void hash_data_vio(struct vdo_completion *completion) { struct data_vio *data_vio = as_data_vio(completion); - // We don't care what thread we are on. - if (abort_on_error(completion->result, data_vio, READ_ONLY)) { - return; - } + assert_data_vio_on_cpu_thread(data_vio); ASSERT_LOG_ONLY(!data_vio->is_zero_block, "zero blocks should not be hashed"); + murmurhash3_128(data_vio->data_block, + VDO_BLOCK_SIZE, + 0x62ea60be, + &data_vio->chunk_name); + data_vio->hash_zone = - select_hash_zone(get_vdo_from_data_vio(data_vio), - &data_vio->chunk_name); + vdo_select_hash_zone(vdo_from_data_vio(data_vio), + &data_vio->chunk_name); data_vio->last_async_operation = VIO_ASYNC_OP_ACQUIRE_VDO_HASH_LOCK; - launch_data_vio_hash_zone_callback(data_vio, lock_hash_in_zone); + launch_data_vio_hash_zone_callback(data_vio, + lock_hash_in_zone); } /** - * Prepare for the dedupe path after attempting to get an allocation. This - * callback is both registered in and called directly from - * continue_write_after_allocation(). - * - * @param completion The completion of the write in progress - **/ -static void prepare_for_dedupe(struct vdo_completion *completion) + * prepare_for_dedupe() - Prepare for the dedupe path after attempting to get + * an allocation. + * @data_vio: The data_vio to deduplicate. + */ +static void prepare_for_dedupe(struct data_vio *data_vio) { - struct data_vio *data_vio = as_data_vio(completion); - // We don't care what thread we are on - if (abort_on_error(completion->result, data_vio, READ_ONLY)) { + /* We don't care what thread we are on */ + if (abort_on_error(data_vio_as_completion(data_vio)->result, + data_vio, + READ_ONLY)) { return; } ASSERT_LOG_ONLY(!data_vio->is_zero_block, "must not prepare to dedupe zero blocks"); - // Before we can dedupe, we need to know the chunk name, so the first - // step is to hash the block data. 
+ /* + * Before we can dedupe, we need to know the chunk name, so the first + * step is to hash the block data. + */ data_vio->last_async_operation = VIO_ASYNC_OP_HASH_DATA_VIO; - // XXX this is the wrong thread to run this callback, but we don't yet - // have a mechanism for running it on the CPU thread immediately after - // hashing. - set_data_vio_allocated_zone_callback(data_vio, resolve_hash_zone); - hash_data_vio(data_vio); + launch_data_vio_cpu_callback(data_vio, + hash_data_vio, + CPU_Q_HASH_BLOCK_PRIORITY); } /** - * Update the block map after a data write (or directly for a VDO_ZERO_BLOCK - * write or trim). This callback is registered in decrement_for_write() and - * journal_unmapping_for_write(). + * update_block_map_for_write() - Update the block map after a data write (or + * directly for a VDO_ZERO_BLOCK write or + * trim). + * @completion: The completion of the write in progress. * - * @param completion The completion of the write in progress - **/ + * This callback is registered in decrement_for_write() and + * journal_unmapping_for_write(). + */ static void update_block_map_for_write(struct vdo_completion *completion) { struct data_vio *data_vio = as_data_vio(completion); + assert_data_vio_in_logical_zone(data_vio); if (abort_on_error(completion->result, data_vio, READ_ONLY)) { return; } if (data_vio->hash_lock != NULL) { - // The write is finished, but must return to the hash lock to - // allow other data VIOs with the same data to dedupe against - // the write. + /* + * The write is finished, but must return to the hash lock to + * allow other data VIOs with the same data to dedupe against + * the write. + */ set_data_vio_hash_zone_callback(data_vio, finish_write_data_vio); } else { completion->callback = complete_data_vio; @@ -785,14 +861,16 @@ static void update_block_map_for_write(struct vdo_completion *completion) } /** - * Do the decref after a successful block write. This is the callback - * by journal_unmapping_for_write() if the old mapping was not the zero block. + * decrement_for_write() - Do the decref after a successful block write. + * @completion: The completion of the write in progress. * - * @param completion The completion of the write in progress - **/ + * This is the callback by journal_unmapping_for_write() if the old mapping + * was not the zero block. + */ static void decrement_for_write(struct vdo_completion *completion) { struct data_vio *data_vio = as_data_vio(completion); + assert_data_vio_in_mapped_zone(data_vio); if (abort_on_error(completion->result, data_vio, READ_ONLY)) { return; @@ -804,14 +882,16 @@ static void decrement_for_write(struct vdo_completion *completion) } /** - * Write the appropriate journal entry for unmapping logical to mapped for a - * write. This is the callback registered in read_old_block_mapping_for_write(). + * journal_unmapping_for_write() - Write the appropriate journal entry for + * unmapping logical to mapped for a write. + * @completion: The completion of the write in progress. * - * @param completion The completion of the write in progress - **/ + * This is the callback registered in read_old_block_mapping_for_write(). 
+ */ static void journal_unmapping_for_write(struct vdo_completion *completion) { struct data_vio *data_vio = as_data_vio(completion); + assert_data_vio_in_journal_zone(data_vio); if (abort_on_error(completion->result, data_vio, READ_ONLY)) { return; @@ -829,15 +909,19 @@ static void journal_unmapping_for_write(struct vdo_completion *completion) } /** - * Get the previous PBN mapped to this LBN from the block map for a write, so - * as to make an appropriate journal entry referencing the removal of this - * LBN->PBN mapping. This callback is registered in finish_block_write(). + * read_old_block_mapping_for_write() - Get the previous PBN mapped to this + * LBN from the block map for a write, so + * as to make an appropriate journal + * entry referencing the removal of this + * LBN->PBN mapping. + * @completion: The completion of the write in progress. * - * @param completion The completion of the write in progress - **/ + * This callback is registered in finish_block_write(). + */ static void read_old_block_mapping_for_write(struct vdo_completion *completion) { struct data_vio *data_vio = as_data_vio(completion); + assert_data_vio_in_logical_zone(data_vio); if (abort_on_error(completion->result, data_vio, READ_ONLY)) { return; @@ -849,27 +933,15 @@ static void read_old_block_mapping_for_write(struct vdo_completion *completion) } /** - * Acknowledge a write to the requestor. + * increment_for_write() - Do the incref after a successful block write. + * @completion: The completion of the write in progress. * - * @param data_vio The data_vio being acknowledged - **/ -static void acknowledge_write(struct data_vio *data_vio) -{ - ASSERT_LOG_ONLY(data_vio->has_flush_generation_lock, - "write VIO to be acknowledged has a flush generation lock"); - data_vio->last_async_operation = VIO_ASYNC_OP_ACKNOWLEDGE_WRITE; - acknowledge_data_vio(data_vio); -} - -/** - * Do the incref after a successful block write. This is the callback - * registered by finish_block_write(). - * - * @param completion The completion of the write in progress - **/ + * This is the callback registered by finish_block_write(). + */ static void increment_for_write(struct vdo_completion *completion) { struct data_vio *data_vio = as_data_vio(completion); + assert_data_vio_in_allocated_zone(data_vio); if (abort_on_error(completion->result, data_vio, READ_ONLY)) { return; @@ -880,24 +952,27 @@ static void increment_for_write(struct vdo_completion *completion) * the block. Downgrade the allocation lock to a read lock so it can be * used later by the hash lock. */ - downgrade_vdo_pbn_write_lock(data_vio_as_allocating_vio(data_vio)->allocation_lock); + vdo_downgrade_pbn_write_lock(data_vio->allocation.lock, false); - data_vio->last_async_operation = VIO_ASYNC_OP_JOURNAL_INCREMENT_FOR_WRITE; + data_vio->last_async_operation = + VIO_ASYNC_OP_JOURNAL_INCREMENT_FOR_WRITE; set_data_vio_logical_callback(data_vio, read_old_block_mapping_for_write); update_reference_count(data_vio); } /** - * Add an entry in the recovery journal after a successful block write. This is - * the callback registered by write_block(). It is also registered in - * allocate_block_for_write(). + * finish_block_write() - Add an entry in the recovery journal after a + * successful block write. + * @completion: The completion of the write in progress. * - * @param completion The completion of the write in progress - **/ + * This is the callback registered by write_block(). It is also registered in + * allocate_block_for_write(). 
+ */
 static void finish_block_write(struct vdo_completion *completion)
 { struct data_vio *data_vio = as_data_vio(completion);
+ assert_data_vio_in_journal_zone(data_vio);
 if (abort_on_error(completion->result, data_vio, READ_ONLY)) { return;
@@ -912,75 +987,150 @@ static void finish_block_write(struct vdo_completion *completion)
 } data_vio->last_async_operation = VIO_ASYNC_OP_JOURNAL_MAPPING_FOR_WRITE;
- journal_increment(data_vio,
- data_vio_as_allocating_vio(data_vio)->allocation_lock);
+ journal_increment(data_vio, data_vio->allocation.lock);
 } /**
- * Write data to the underlying storage.
- *
- * @param data_vio The data_vio to write
- **/
+ * write_bio_finished() - This is the bio_end_io function registered in
+ * write_block() to be called when a data_vio's write
+ * to the underlying storage has completed.
+ * @bio: The bio which has just completed.
+ */
+static void write_bio_finished(struct bio *bio)
+{
+ struct data_vio *data_vio
+ = vio_as_data_vio((struct vio *) bio->bi_private);
+
+ vdo_count_completed_bios(bio);
+ vdo_set_completion_result(data_vio_as_completion(data_vio),
+ vdo_get_bio_result(bio));
+ launch_data_vio_journal_callback(data_vio,
+ finish_block_write);
+}
+
+/**
+ * write_block() - Write data to the underlying storage.
+ * @data_vio: The data_vio to write.
+ */
 static void write_block(struct data_vio *data_vio)
 {
+ int result;
+
+ /* Write the data from the data block buffer. */
+ result = prepare_data_vio_for_io(data_vio,
+ data_vio->data_block,
+ write_bio_finished,
+ REQ_OP_WRITE,
+ data_vio->allocation.pbn);
+ if (abort_on_error(result, data_vio, READ_ONLY)) {
+ return;
+ }
+
 data_vio->last_async_operation = VIO_ASYNC_OP_WRITE_DATA_VIO;
- set_data_vio_journal_callback(data_vio, finish_block_write);
- write_data_vio(data_vio);
+ submit_data_vio_io(data_vio);
 } /**
- * Continue the write path for a data_vio now that block allocation is complete
- * (the data_vio may or may not have actually received an allocation). This
- * callback is registered in continue_write_with_block_map_slot().
+ * acknowledge_write_callback() - Acknowledge a write to the requestor.
+ * @completion: The data_vio being acknowledged.
 *
- * @param allocating_vio The data_vio which has finished the allocation
- * process (as an allocating_vio)
- **/
-static void
-continue_write_after_allocation(struct allocating_vio *allocating_vio)
+ * This callback is registered in allocate_block() and
+ * continue_write_with_block_map_slot().
+ */
+static void acknowledge_write_callback(struct vdo_completion *completion)
 {
- struct data_vio *data_vio = allocating_vio_as_data_vio(allocating_vio);
- if (abort_on_error(data_vio_as_completion(data_vio)->result,
- data_vio,
- NOT_READ_ONLY)) {
+ struct data_vio *data_vio = as_data_vio(completion);
+ struct vdo *vdo = completion->vdo;
+
+ ASSERT_LOG_ONLY((!vdo_uses_bio_ack_queue(vdo)
+ || (vdo_get_callback_thread_id() ==
+ vdo->thread_config->bio_ack_thread)),
+ "acknowledge_write_callback() called on bio ack queue");
+ ASSERT_LOG_ONLY(data_vio->has_flush_generation_lock,
+ "write VIO to be acknowledged has a flush generation lock");
+ acknowledge_data_vio(data_vio);
+ if (data_vio->new_mapped.pbn == VDO_ZERO_BLOCK) {
+ /* This is a zero write or discard */
+ launch_data_vio_journal_callback(data_vio, finish_block_write);
 return;
 }
- if (!data_vio_has_allocation(data_vio)) {
- prepare_for_dedupe(data_vio_as_completion(data_vio));
+ prepare_for_dedupe(data_vio);
+}
+
+/**
+ * allocate_block() - Attempt to allocate a block in the current allocation
+ * zone.
+ * @completion: The data_vio needing an allocation.
+ *
+ * This callback is registered in continue_write_with_block_map_slot().
+ */
+static void allocate_block(struct vdo_completion *completion)
+{
+ struct data_vio *data_vio = as_data_vio(completion);
+
+ assert_data_vio_in_allocated_zone(data_vio);
+
+ if (!vdo_allocate_block_in_zone(data_vio)) { return; }
+ completion->error_handler = NULL; WRITE_ONCE(data_vio->allocation_succeeded, true); data_vio->new_mapped = (struct zoned_pbn) {
- .zone = allocating_vio->zone,
- .pbn = allocating_vio->allocation,
+ .zone = data_vio->allocation.zone,
+ .pbn = data_vio->allocation.pbn,
 .state = VDO_MAPPING_STATE_UNCOMPRESSED, };
- // XXX prepare_for_dedupe can run from any thread, so this is a place
- // where running the callback on the kernel thread would save a thread
- // switch.
- set_data_vio_allocated_zone_callback(data_vio, prepare_for_dedupe);
- if (vio_requires_flush_after(allocating_vio_as_vio(allocating_vio))) {
- invoke_vdo_completion_callback(data_vio_as_completion(data_vio));
+ if (data_vio_requires_fua(data_vio)) {
+ prepare_for_dedupe(data_vio);
 return; }
- acknowledge_write(data_vio);
+ data_vio->last_async_operation = VIO_ASYNC_OP_ACKNOWLEDGE_WRITE;
+ launch_data_vio_on_bio_ack_queue(data_vio, acknowledge_write_callback);
 } /**
- * Continue the write path for a VIO now that block map slot resolution is
- * complete. This callback is registered in launch_write_data_vio().
+ * handle_allocation_error() - Handle an error attempting to allocate a block.
+ * @completion: The data_vio needing an allocation.
 *
- * @param completion The data_vio to write
- **/
+ * This error handler is registered in continue_write_with_block_map_slot().
+ */
+static void handle_allocation_error(struct vdo_completion *completion)
+{
+ struct data_vio *data_vio = as_data_vio(completion);
+
+ completion->error_handler = NULL;
+ if (completion->result == VDO_NO_SPACE) {
+ /* We failed to get an allocation, but we can try to dedupe. */
+ vdo_reset_completion(completion);
+ prepare_for_dedupe(data_vio);
+ return;
+ }
+
+ /*
+ * There was an actual error (not just that we didn't get an
+ * allocation).
+ */
+ finish_data_vio(data_vio, completion->result);
+}
+
+/**
+ * continue_write_with_block_map_slot() - Continue the write path for a VIO
+ * now that block map slot resolution
+ * is complete.
+ * @completion: The data_vio to write.
+ *
+ * This callback is registered in launch_write_data_vio().
+ */
 static void continue_write_with_block_map_slot(struct vdo_completion *completion)
 { struct data_vio *data_vio = as_data_vio(completion);
- // We don't care what thread we're on.
+
+ /* We don't care what thread we're on. */
 if (abort_on_error(completion->result, data_vio, NOT_READ_ONLY)) { return; }
@@ -994,49 +1144,75 @@ continue_write_with_block_map_slot(struct vdo_completion *completion)
 return; }
- // This is a trim for a block on a block map page which has not
- // been allocated, so there's nothing more we need to do.
+ /*
+ * This is a trim for a block on a block map page which has not
+ * been allocated, so there's nothing more we need to do.
+ */
 finish_data_vio(data_vio, VDO_SUCCESS); return; }
- if (data_vio->is_zero_block || is_trim_data_vio(data_vio)) {
- // We don't need to write any data, so skip allocation and just
- // update the block map and reference counts (via the journal).
- data_vio->new_mapped.pbn = VDO_ZERO_BLOCK; + if (!data_vio->is_zero_block && !is_trim_data_vio(data_vio)) { + data_vio_allocate_data_block(data_vio, + VIO_WRITE_LOCK, + allocate_block, + handle_allocation_error); + return; + } + + + /* + * We don't need to write any data, so skip allocation and just + * update the block map and reference counts (via the journal). + */ + data_vio->new_mapped.pbn = VDO_ZERO_BLOCK; + if (data_vio->remaining_discard > VDO_BLOCK_SIZE) { + /* + * This is not the final block of a discard so we can't + * acknowledge it yet. + */ launch_data_vio_journal_callback(data_vio, finish_block_write); return; } - vio_allocate_data_block(data_vio_as_allocating_vio(data_vio), - get_vdo_logical_zone_allocation_selector(data_vio->logical.zone), - VIO_WRITE_LOCK, continue_write_after_allocation); + data_vio->last_async_operation = VIO_ASYNC_OP_ACKNOWLEDGE_WRITE; + launch_data_vio_on_bio_ack_queue(data_vio, acknowledge_write_callback); } -/**********************************************************************/ +/** + * launch_write_data_vio() - Start the asynchronous processing of a data_vio + * for a write request which has acquired a lock on + * its logical block by joining the current flush + * generation and then attempting to allocate a + * physical block. + * @data_vio: The data_vio doing the write. + */ void launch_write_data_vio(struct data_vio *data_vio) { int result; - if (vdo_is_read_only(data_vio_as_vio(data_vio)->vdo->read_only_notifier)) { + if (vdo_is_read_only(vdo_from_data_vio(data_vio)->read_only_notifier)) { finish_data_vio(data_vio, VDO_READ_ONLY); return; } - // Write requests join the current flush generation. - result = acquire_vdo_flush_generation_lock(data_vio); + /* Write requests join the current flush generation. */ + result = vdo_acquire_flush_generation_lock(data_vio); if (abort_on_error(result, data_vio, NOT_READ_ONLY)) { return; } - // Go find the block map slot for the LBN mapping. - data_vio->last_async_operation = VIO_ASYNC_OP_FIND_BLOCK_MAP_SLOT; + /* Go find the block map slot for the LBN mapping. */ vdo_find_block_map_slot(data_vio, continue_write_with_block_map_slot, - get_vdo_logical_zone_thread_id(data_vio->logical.zone)); + data_vio->logical.zone->thread_id); } -/**********************************************************************/ +/** + * cleanup_write_data_vio() - Clean up a data_vio which has finished + * processing a write. + * @data_vio: The data_vio to clean up. 
+ */ void cleanup_write_data_vio(struct data_vio *data_vio) { perform_cleanup_stage(data_vio, VIO_CLEANUP_START); diff --git a/vdo/vio-write.h b/vdo/vio-write.h new file mode 100644 index 00000000..4c68e0b8 --- /dev/null +++ b/vdo/vio-write.h @@ -0,0 +1,21 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright Red Hat + */ + +#ifndef VIO_WRITE_H +#define VIO_WRITE_H + +#include "kernel-types.h" + +void launch_write_data_vio(struct data_vio *data_vio); + +void cleanup_write_data_vio(struct data_vio *data_vio); + +void continue_write_after_compression(struct data_vio *data_vio); + +void launch_compress_data_vio(struct data_vio *data_vio); + +void launch_deduplicate_data_vio(struct data_vio *data_vio); + +#endif /* VIO_WRITE_H */ diff --git a/vdo/vio.c b/vdo/vio.c index a6e406ab..f770a7cb 100644 --- a/vdo/vio.c +++ b/vdo/vio.c @@ -1,22 +1,6 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * Copyright Red Hat - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/sulfur/src/c++/vdo/base/vio.c#24 $ */ #include "vio.h" @@ -25,44 +9,69 @@ #include #include "logger.h" -#include "memoryAlloc.h" +#include "memory-alloc.h" #include "permassert.h" -#include "dataVIO.h" -#include "vdoInternal.h" +#include "bio.h" +#include "io-submitter.h" +#include "vdo.h" -/**********************************************************************/ -int create_metadata_vio(struct vdo *vdo, - enum vio_type vio_type, - enum vio_priority priority, - void *parent, - char *data, - struct vio **vio_ptr) +/** + * create_multi_block_metadata_vio() - Create a vio. + * @vdo: The vdo on which the vio will operate. + * @vio_type: The type of vio to create. + * @priority: The relative priority to assign to the vio. + * @parent: The parent of the vio. + * @block_count: The size of the vio in blocks. + * @data: The buffer. + * @vio_ptr: A pointer to hold the new vio. + * + * Return: VDO_SUCCESS or an error. + */ +int create_multi_block_metadata_vio(struct vdo *vdo, + enum vio_type vio_type, + enum vio_priority priority, + void *parent, + unsigned int block_count, + char *data, + struct vio **vio_ptr) { struct vio *vio; struct bio *bio; int result; - // If struct vio grows past 256 bytes, we'll lose benefits of - // VDOSTORY-176. + /* + * If struct vio grows past 256 bytes, we'll lose benefits of + * VDOSTORY-176. 
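For reference, the block count limit asserted just below is MAX_BLOCKS_PER_VIO, which the new vio.h (later in this patch) defines as (BIO_MAX_VECS << PAGE_SHIFT) / VDO_BLOCK_SIZE. A minimal sketch of that arithmetic, assuming 4 KiB pages and the usual BIO_MAX_VECS of 256; the EXAMPLE_* names are illustrative and not part of the patch:

    /*
     * Illustrative constants only; the real values come from the kernel
     * headers and from constants.h.
     */
    enum {
        EXAMPLE_BIO_MAX_VECS = 256,     /* typical BIO_MAX_VECS */
        EXAMPLE_PAGE_SHIFT = 12,        /* 4 KiB pages */
        EXAMPLE_VDO_BLOCK_SIZE = 4096,
        /* (256 << 12) / 4096 == 256 blocks per multi-block metadata vio */
        EXAMPLE_MAX_BLOCKS_PER_VIO =
            (EXAMPLE_BIO_MAX_VECS << EXAMPLE_PAGE_SHIFT) / EXAMPLE_VDO_BLOCK_SIZE,
    };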
+ */ STATIC_ASSERT(sizeof(struct vio) <= 256); - result = ASSERT(is_vdo_metadata_vio_type(vio_type), + result = ASSERT(block_count <= MAX_BLOCKS_PER_VIO, + "block count %u does not exceed maximum %u", + block_count, + MAX_BLOCKS_PER_VIO); + if (result != VDO_SUCCESS) { + return result; + } + + result = ASSERT(vdo_is_metadata_vio_type(vio_type), "%d is a metadata type", vio_type); if (result != VDO_SUCCESS) { return result; } - // Metadata vios should use direct allocation and not use the buffer - // pool, which is reserved for submissions from the linux block layer. + /* + * Metadata vios should use direct allocation and not use the buffer + * pool, which is reserved for submissions from the linux block layer. + */ result = UDS_ALLOCATE(1, struct vio, __func__, &vio); if (result != VDO_SUCCESS) { uds_log_error("metadata vio allocation failure %d", result); return result; } - result = vdo_create_bio(&bio); + result = vdo_create_multi_block_bio(block_count, &bio); if (result != VDO_SUCCESS) { UDS_FREE(vio); return result; @@ -70,17 +79,20 @@ int create_metadata_vio(struct vdo *vdo, initialize_vio(vio, bio, + block_count, vio_type, priority, - parent, - vdo, - data); + vdo); + vio->completion.parent = parent; + vio->data = data; *vio_ptr = vio; return VDO_SUCCESS; - } -/**********************************************************************/ +/** + * free_vio() - Destroy a vio. + * @vio: The vio to destroy. + */ void free_vio(struct vio *vio) { if (vio == NULL) { @@ -92,80 +104,12 @@ void free_vio(struct vio *vio) UDS_FREE(vio); } -/**********************************************************************/ -void initialize_vio(struct vio *vio, - struct bio *bio, - enum vio_type vio_type, - enum vio_priority priority, - struct vdo_completion *parent, - struct vdo *vdo, - char *data) -{ - struct vdo_completion *completion = vio_as_completion(vio); - - vio->bio = bio; - vio->vdo = vdo; - vio->type = vio_type; - vio->priority = priority; - vio->data = data; - - initialize_vdo_completion(completion, vdo, VIO_COMPLETION); - completion->parent = parent; -} - -/**********************************************************************/ -void vio_done_callback(struct vdo_completion *completion) -{ - struct vio *vio = as_vio(completion); - completion->callback = vio->callback; - completion->error_handler = vio->error_handler; - complete_vdo_completion(completion); -} - -/**********************************************************************/ -void get_vio_operation_description(const struct vio *vio, char *buffer) -{ - int buffer_remaining = VDO_VIO_OPERATION_DESCRIPTION_MAX_LENGTH; - - static const char *operations[] = { - [VIO_UNSPECIFIED_OPERATION] = "empty", - [VIO_READ] = "read", - [VIO_WRITE] = "write", - [VIO_READ_MODIFY_WRITE] = "read-modify-write", - }; - int written = snprintf(buffer, buffer_remaining, "%s", - operations[vio->operation & VIO_READ_WRITE_MASK]); - if ((written < 0) || (buffer_remaining < written)) { - // Should never happen, but if it does, we've done as much - // description as possible. - return; - } - - buffer += written; - buffer_remaining -= written; - - if (vio->operation & VIO_FLUSH_BEFORE) { - written = snprintf(buffer, buffer_remaining, "+preflush"); - } - - if ((written < 0) || (buffer_remaining < written)) { - // Should never happen, but if it does, we've done as much - // description as possible. 
- return; - } - - buffer += written; - buffer_remaining -= written; - - if (vio->operation & VIO_FLUSH_AFTER) { - snprintf(buffer, buffer_remaining, "+postflush"); - } - - STATIC_ASSERT(sizeof("write+preflush+postflush") <= - VDO_VIO_OPERATION_DESCRIPTION_MAX_LENGTH); -} - -/**********************************************************************/ +/** + * update_vio_error_stats() - Update per-vio error stats and log the + * error. + * @vio: The vio which got an error. + * @format: The format of the message to log (a printf style format). + */ void update_vio_error_stats(struct vio *vio, const char *format, ...) { static DEFINE_RATELIMIT_STATE(error_limiter, @@ -174,15 +118,15 @@ void update_vio_error_stats(struct vio *vio, const char *format, ...) va_list args; int priority; + struct vdo_completion *completion = vio_as_completion(vio); - int result = vio_as_completion(vio)->result; - switch (result) { + switch (completion->result) { case VDO_READ_ONLY: - atomic64_inc(&vio->vdo->stats.read_only_error_count); + atomic64_inc(&completion->vdo->stats.read_only_error_count); return; case VDO_NO_SPACE: - atomic64_inc(&vio->vdo->stats.no_space_error_count); + atomic64_inc(&completion->vdo->stats.no_space_error_count); priority = UDS_LOG_DEBUG; break; @@ -195,50 +139,33 @@ void update_vio_error_stats(struct vio *vio, const char *format, ...) } va_start(args, format); - uds_vlog_strerror(priority, result, UDS_LOGGING_MODULE_NAME, - format, args); + uds_vlog_strerror(priority, + completion->result, + UDS_LOGGING_MODULE_NAME, + format, + args); va_end(args); } -/** - * Handle an error from a metadata I/O. - * - * @param completion The vio - **/ -static void handle_metadata_io_error(struct vdo_completion *completion) +void record_metadata_io_error(struct vio *vio) { - struct vio *vio = as_vio(completion); - char vio_operation[VDO_VIO_OPERATION_DESCRIPTION_MAX_LENGTH]; - get_vio_operation_description(vio, vio_operation); + const char *description; + + if (bio_op(vio->bio) == REQ_OP_READ) { + description = "read"; + } else if ((vio->bio->bi_opf & REQ_PREFLUSH) == REQ_PREFLUSH) { + description = (((vio->bio->bi_opf & REQ_FUA) == REQ_FUA) ? 
+ "write+preflush+fua" : + "write+preflush"); + } else if ((vio->bio->bi_opf & REQ_FUA) == REQ_FUA) { + description = "write+fua"; + } else { + description = "write"; + } + update_vio_error_stats(vio, "Completing %s vio of type %u for physical block %llu with error", - vio_operation, + description, vio->type, (unsigned long long) vio->physical); - vio_done_callback(completion); -} - -/**********************************************************************/ -void launch_metadata_vio(struct vio *vio, - physical_block_number_t physical, - vdo_action *callback, - vdo_action *error_handler, - enum vio_operation operation) -{ - struct vdo_completion *completion = vio_as_completion(vio); - const struct admin_state_code *code = get_vdo_admin_state(vio->vdo); - - ASSERT_LOG_ONLY(!code->quiescent, - "I/O not allowed in state %s", - code->name); - vio->operation = operation; - vio->physical = physical; - vio->callback = callback; - vio->error_handler = error_handler; - - reset_vdo_completion(completion); - completion->callback = vio_done_callback; - completion->error_handler = handle_metadata_io_error; - - submit_metadata_vio(vio); } diff --git a/vdo/vio.h b/vdo/vio.h index d95a2e44..53fc50c4 100644 --- a/vdo/vio.h +++ b/vdo/vio.h @@ -1,55 +1,41 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ /* * Copyright Red Hat - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/sulfur/src/c++/vdo/base/vio.h#27 $ */ #ifndef VIO_H #define VIO_H +#include #include -#include "kernelLayer.h" +#include "bio.h" #include "completion.h" +#include "constants.h" +#include "kernel-types.h" +#include "thread-config.h" #include "types.h" #include "vdo.h" -/** +enum { + MAX_BLOCKS_PER_VIO = (BIO_MAX_VECS << PAGE_SHIFT) / VDO_BLOCK_SIZE, +}; + +/* * A representation of a single block which may be passed between the VDO base * and the physical layer. - **/ + */ struct vio { /* The completion for this vio */ struct vdo_completion completion; - /* The functions to call when this vio's operation is complete */ - vdo_action *callback; - vdo_action *error_handler; - - /* The vdo handling this vio */ - struct vdo *vdo; - - /* The address on the underlying device of the block to be read/written + /* + * The address on the underlying device of the block to be read/written */ physical_block_number_t physical; - /* The type of request this vio is servicing */ - enum vio_operation operation; + /* The bio zone in which I/O should be processed */ + zone_count_t bio_zone; /* The queueing priority of the vio operation */ enum vio_priority priority; @@ -57,360 +43,242 @@ struct vio { /* The vio type is used for statistics and instrumentation. */ enum vio_type type; + /* The size of this vio in blocks */ + unsigned int block_count; + /* The data being read or written. 
*/ char *data; /* The VDO-owned bio to use for all IO for this vio */ struct bio *bio; - /** + /* * A list of enqueued bios with consecutive block numbers, stored by * vdo_submit_bio() under the first-enqueued vio. The other vios are * found via their bio entries in this list, and are not added to - * the work queue as separate work items. - **/ + * the work queue as separate completions. + */ struct bio_list bios_merged; - /** A slot for an arbitrary bit of data, for use by systemtap. */ - long debug_slot; }; /** - * Convert a generic vdo_completion to a vio. + * as_vio() - Convert a generic vdo_completion to a vio. + * @completion: The completion to convert. * - * @param completion The completion to convert - * - * @return The completion as a vio - **/ + * Return: The completion as a vio. + */ static inline struct vio *as_vio(struct vdo_completion *completion) { - assert_vdo_completion_type(completion->type, VIO_COMPLETION); + vdo_assert_completion_type(completion->type, VIO_COMPLETION); return container_of(completion, struct vio, completion); } /** - * Returns a pointer to the vio wrapping a work item - * - * @param item the work item - * - * @return the vio - **/ -static inline struct vio * __must_check -work_item_as_vio(struct vdo_work_item *item) -{ - return as_vio(container_of(item, struct vdo_completion, work_item)); -} - -/** - * Convert a vio to a generic completion. - * - * @param vio The vio to convert + * vio_as_completion() - Convert a vio to a generic completion. + * @vio: The vio to convert. * - * @return The vio as a completion - **/ + * Return: The vio as a completion. + */ static inline struct vdo_completion *vio_as_completion(struct vio *vio) { return &vio->completion; } /** - * Extracts the work item from a vio. + * vdo_from_vio() - Get the vdo from a vio. + * @vio: The vio from which to get the vdo. * - * @param vio the vio - * - * @return the vio's work item - **/ -static inline struct vdo_work_item *work_item_from_vio(struct vio *vio) + * Return: The vdo to which the vio belongs. + */ +static inline struct vdo *vdo_from_vio(struct vio *vio) { - return &vio_as_completion(vio)->work_item; + return vio_as_completion(vio)->vdo; } /** - * Create a vio. + * set_vio_physical() - Set the physical field of a vio. + * @vio: The vio. + * @pbn: The pbn to set as the vio's physical address. * - * @param [in] vdo The vdo on which the vio will operate - * @param [in] vio_type The type of vio to create - * @param [in] priority The relative priority to assign to the vio - * @param [in] parent The parent of the vio - * @param [in] data The buffer - * @param [out] vio_ptr A pointer to hold the new vio - * - * @return VDO_SUCCESS or an error - **/ -int __must_check create_metadata_vio(struct vdo *vdo, - enum vio_type vio_type, - enum vio_priority priority, - void *parent, - char *data, - struct vio **vio_ptr); - -/** - * Destroy a vio. - * - * @param vio The vio to destroy - **/ -void free_vio(struct vio *vio); - -/** - * Initialize a vio. 
- * - * @param vio The vio to initialize - * @param bio The bio this vio should use for its I/O - * @param vio_type The vio type - * @param priority The relative priority of the vio - * @param parent The parent (the extent completion) to assign to the vio - * completion - * @param vdo The vdo for this vio - * @param data The data buffer for this vio - **/ -void initialize_vio(struct vio *vio, - struct bio *bio, - enum vio_type vio_type, - enum vio_priority priority, - struct vdo_completion *parent, - struct vdo *vdo, - char *data); - -/** - * The very last step in processing a vio. Set the vio's completion's callback - * and error handler from the fields set in the vio itself on launch and then - * actually complete the vio's completion. - * - * @param completion The vio - **/ -void vio_done_callback(struct vdo_completion *completion); - -/** - * Get the description of a vio's operation. - * - * The output buffer must have size VDO_VIO_OPERATION_DESCRIPTION_MAX_LENGTH. - * - * @param vio The vio - * @param buffer The buffer to populate with the vio operation name. - **/ -void get_vio_operation_description(const struct vio *vio, char *buffer); - -/** - * Update per-vio error stats and log the error. - * - * @param vio The vio which got an error - * @param format The format of the message to log (a printf style format) - **/ -void update_vio_error_stats(struct vio *vio, const char *format, ...) - __attribute__((format(printf, 2, 3))); - -/** - * Add a trace record for the current source location. - * - * @param vio The vio structure to be updated - * @param location The source-location descriptor to be recorded - **/ - - -/** - * Check whether a vio is servicing an external data request. - * - * @param vio The vio to check - **/ -static inline bool is_data_vio(struct vio *vio) + * Also computes the bio zone for doing I/O to that address. + */ +static inline void +set_vio_physical(struct vio *vio, physical_block_number_t pbn) { - return is_vdo_data_vio_type(vio->type); + vio->physical = pbn; + vio->bio_zone = vdo_get_bio_zone(vdo_from_vio(vio), pbn); } /** - * Check whether a vio is for compressed block writes + * get_vio_bio_zone_thread_id() - Get the thread id of the bio zone in which a + * vio should submit its I/O. + * @vio: The vio. * - * @param vio The vio to check - **/ -static inline bool is_compressed_write_vio(struct vio *vio) + * Return: The id of the bio zone thread the vio should use. + */ +static inline thread_id_t __must_check +get_vio_bio_zone_thread_id(struct vio *vio) { - return is_vdo_compressed_write_vio_type(vio->type); + return vdo_from_vio(vio)->thread_config->bio_threads[vio->bio_zone]; } /** - * Check whether a vio is for metadata - * - * @param vio The vio to check - **/ -static inline bool is_metadata_vio(struct vio *vio) + * assert_vio_in_bio_zone() - Check that a vio is running on the correct + * thread for its bio zone. + * @vio: The vio to check. + */ +static inline void +assert_vio_in_bio_zone(struct vio *vio) { - return is_vdo_metadata_vio_type(vio->type); + thread_id_t expected = get_vio_bio_zone_thread_id(vio); + thread_id_t thread_id = vdo_get_callback_thread_id(); + + ASSERT_LOG_ONLY((expected == thread_id), + "vio I/O for physical block %llu on thread %u, should be on bio zone thread %u", + (unsigned long long) vio->physical, + thread_id, + expected); } -/** - * Check whether a vio is a read. 
- * - * @param vio The vio - * - * @return true if the vio is a read - **/ -static inline bool is_read_vio(const struct vio *vio) -{ - return ((vio->operation & VIO_READ_WRITE_MASK) == VIO_READ); -} +int __must_check create_multi_block_metadata_vio(struct vdo *vdo, + enum vio_type vio_type, + enum vio_priority priority, + void *parent, + unsigned int block_count, + char *data, + struct vio **vio_ptr); -/** - * Check whether a vio is a read-modify-write. - * - * @param vio The vio - * - * @return true if the vio is a read-modify-write - **/ -static inline bool is_read_modify_write_vio(const struct vio *vio) +static inline int __must_check +create_metadata_vio(struct vdo *vdo, + enum vio_type vio_type, + enum vio_priority priority, + void *parent, + char *data, + struct vio **vio_ptr) { - return ((vio->operation & VIO_READ_WRITE_MASK) == - VIO_READ_MODIFY_WRITE); + return create_multi_block_metadata_vio(vdo, + vio_type, + priority, + parent, + 1, + data, + vio_ptr); } +void free_vio(struct vio *vio); + /** - * Check whether a vio is a empty flush. - * - * @param vio The vio - * - * @return true if the vio is a pure, empty flush - **/ -static inline bool is_empty_flush_vio(const struct vio *vio) + * initialize_vio() - Initialize a vio. + * @vio: The vio to initialize. + * @bio: The bio this vio should use for its I/O. + * @block_count: The size of this vio in vdo blocks. + * @vio_type: The vio type. + * @priority: The relative priority of the vio. + * @vdo: The vdo for this vio. + */ +static inline void initialize_vio(struct vio *vio, + struct bio *bio, + unsigned int block_count, + enum vio_type vio_type, + enum vio_priority priority, + struct vdo *vdo) { - return (vio->operation == VIO_FLUSH_BEFORE); + vio->bio = bio; + vio->block_count = block_count; + vio->type = vio_type; + vio->priority = priority; + vdo_initialize_completion(vio_as_completion(vio), vdo, VIO_COMPLETION); } +void update_vio_error_stats(struct vio *vio, const char *format, ...) + __attribute__((format(printf, 2, 3))); + /** - * Check whether a vio is a write. - * - * @param vio The vio - * - * @return true if the vio is a write - **/ -static inline bool is_write_vio(const struct vio *vio) + * is_data_vio() - Check whether a vio is servicing an external data request. + * @vio: The vio to check. + */ +static inline bool is_data_vio(struct vio *vio) { - return ((vio->operation & VIO_READ_WRITE_MASK) == VIO_WRITE); + return vdo_is_data_vio_type(vio->type); } /** - * Check whether a vio requires a flush before doing its I/O. - * - * @param vio The vio - * - * @return true if the vio requires a flush before - **/ -static inline bool vio_requires_flush_before(const struct vio *vio) + * is_metadata_vio() - Check whether a vio is for metadata + * @vio: The vio to check. + */ +static inline bool is_metadata_vio(struct vio *vio) { - return ((vio->operation & VIO_FLUSH_BEFORE) == VIO_FLUSH_BEFORE); + return vdo_is_metadata_vio_type(vio->type); } /** - * Check whether a vio requires a flush after doing its I/O. - * - * @param vio The vio + * get_metadata_priority() - Convert a vio's priority to a work item priority. + * @vio: The vio. * - * @return true if the vio requires a flush after - **/ -static inline bool vio_requires_flush_after(const struct vio *vio) + * Return: The priority with which to submit the vio's bio. + */ +static inline enum vdo_completion_priority +get_metadata_priority(struct vio *vio) { - return ((vio->operation & VIO_FLUSH_AFTER) == VIO_FLUSH_AFTER); + return ((vio->priority == VIO_PRIORITY_HIGH) + ? 
BIO_Q_HIGH_PRIORITY : BIO_Q_METADATA_PRIORITY); } /** - * Launch a metadata vio. + * prepare_vio_for_io() - Reset a vio's bio to prepare for issuing I/O. + * @vio: The vio preparing to issue I/O. + * @data: The buffer the bio should wrap. + * @callback: The callback the bio should call when IO finishes. + * @bi_opf: The operation and flags for the bio. * - * @param vio The vio to launch - * @param physical The physical block number to read or write - * @param callback The function to call when the vio completes its I/O - * @param error_handler The handler for write errors - * @param operation The operation to perform (read or write) - **/ -void launch_metadata_vio(struct vio *vio, - physical_block_number_t physical, - vdo_action *callback, - vdo_action *error_handler, - enum vio_operation operation); - -/** - * Launch a metadata read vio. + * The pbn to which the I/O will be directed is taken from the 'physical' + * field of the vio. * - * @param vio The vio to launch - * @param physical The physical block number to read - * @param callback The function to call when the vio completes its read - * @param error_handler The handler for write errors - **/ -static inline void launch_read_metadata_vio(struct vio *vio, - physical_block_number_t physical, - vdo_action *callback, - vdo_action *error_handler) + * Return: VDO_SUCCESS or an error. + */ +static inline int __must_check +prepare_vio_for_io(struct vio *vio, + char *data, + bio_end_io_t callback, + unsigned int bi_opf) { - launch_metadata_vio(vio, physical, callback, error_handler, VIO_READ); + return vdo_reset_bio_with_buffer(vio->bio, + data, + vio, + callback, + bi_opf, + vio->physical); } /** - * Launch a metadata write vio. + * continue_vio() - Enqueue a vio to run its next callback. + * @vio: The vio to continue. * - * @param vio The vio to launch - * @param physical The physical block number to write - * @param callback The function to call when the vio completes its write - * @param error_handler The handler for write errors - **/ -static inline void launch_write_metadata_vio(struct vio *vio, - physical_block_number_t physical, - vdo_action *callback, - vdo_action *error_handler) + * Return: The result of the current operation. + */ +static inline void continue_vio(struct vio *vio, int result) { - launch_metadata_vio(vio, physical, callback, error_handler, VIO_WRITE); -} + struct vdo_completion *completion = vio_as_completion(vio); -/** - * Launch a metadata write vio optionally flushing the layer before and/or - * after the write operation. - * - * @param vio The vio to launch - * @param physical The physical block number to write - * @param callback The function to call when the vio completes its - * operation - * @param error_handler The handler for flush or write errors - * @param flush_before Whether or not to flush before writing - * @param flush_after Whether or not to flush after writing - **/ -static inline void -launch_write_metadata_vio_with_flush(struct vio *vio, - physical_block_number_t physical, - vdo_action *callback, - vdo_action *error_handler, - bool flush_before, - bool flush_after) -{ - launch_metadata_vio(vio, - physical, - callback, - error_handler, - (VIO_WRITE | (flush_before ? VIO_FLUSH_BEFORE : 0) | - (flush_after ? VIO_FLUSH_AFTER : 0))); + if (unlikely(result != VDO_SUCCESS)) { + vdo_set_completion_result(vio_as_completion(vio), result); + } + + vdo_enqueue_completion(completion); } /** - * Issue a flush to the layer. 
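Taken together, set_vio_physical(), prepare_vio_for_io(), continue_vio(), and continue_vio_after_io() (just below) effectively replace the removed launch_metadata_vio()/vio_done_callback() machinery: a caller now prepares the vio's bio directly and re-enters the completion system from the bio's end_io handler. A hedged sketch of that pattern, assuming the vdo headers are included; every example_* name, including the submission helper at the end, is hypothetical and stands in for whatever the caller and io-submitter actually provide:

    /* Assumed io-submitter entry point; the real name is not shown in this hunk. */
    void example_submit_metadata_vio(struct vio *vio);

    /* Hypothetical owning thread for the vio in this sketch. */
    static thread_id_t example_thread_id;

    static void example_read_done(struct vdo_completion *completion)
    {
        /* The metadata read has finished (or failed); act on it here. */
    }

    /* bio end_io handler; bi_private holds the vio, as in write_bio_finished(). */
    static void example_read_endio(struct bio *bio)
    {
        struct vio *vio = (struct vio *) bio->bi_private;

        /* Hop back to the owning thread and run the next callback. */
        continue_vio_after_io(vio, example_read_done, example_thread_id);
    }

    static void example_start_read(struct vio *vio,
                                   physical_block_number_t pbn,
                                   char *buffer)
    {
        int result;

        set_vio_physical(vio, pbn);
        result = prepare_vio_for_io(vio, buffer, example_read_endio, REQ_OP_READ);
        if (result != VDO_SUCCESS) {
            continue_vio(vio, result);
            return;
        }

        example_submit_metadata_vio(vio);
    }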
- * - * @param vio The vio to notify when the flush is complete - * @param callback The function to call when the flush is complete - * @param error_handler The handler for flush errors - **/ -static inline void launch_flush_vio(struct vio *vio, - vdo_action *callback, - vdo_action *error_handler) + * continue_vio_after_io() - Continue a vio now that its I/O has returned. + */ +static inline void continue_vio_after_io(struct vio *vio, + vdo_action *callback, + thread_id_t thread) { - launch_metadata_vio(vio, 0, callback, error_handler, - VIO_FLUSH_BEFORE); + vdo_count_completed_bios(vio->bio); + vdo_set_completion_callback(vio_as_completion(vio), callback, thread); + continue_vio(vio, vdo_get_bio_result(vio->bio)); } -/** - * Read or write a single metadata vio. - * - * @param vio The vio to read or write - **/ -void submit_metadata_vio(struct vio *vio); - -/** - * A function to write a single compressed block to the layer - * - * @param vio The compressed write vio to write - **/ -void write_compressed_block_vio(struct vio *vio); +void record_metadata_io_error(struct vio *vio); -#endif // VIO_H +#endif /* VIO_H */ diff --git a/vdo/vioPool.h b/vdo/vioPool.h deleted file mode 100644 index 662aa277..00000000 --- a/vdo/vioPool.h +++ /dev/null @@ -1,125 +0,0 @@ -/* - * Copyright Red Hat - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/sulfur/src/c++/vdo/base/vioPool.h#7 $ - */ - -#ifndef VIO_POOL_H -#define VIO_POOL_H - -#include - -#include "permassert.h" - -#include "completion.h" -#include "types.h" -#include "waitQueue.h" - -/** - * A vio_pool is a collection of preallocated vios used to write arbitrary - * metadata blocks. - **/ - -/** - * A vio_pool_entry is the pair of vio and buffer whether in use or not. - **/ -struct vio_pool_entry { - struct list_head available_entry; - struct vio *vio; - void *buffer; - void *parent; - void *context; -}; - -/** - * A function which constructs a vio for a pool. - * - * @param [in] vdo The vdo in which the vio will operate - * @param [in] parent The parent of the vio - * @param [in] buffer The data buffer for the vio - * @param [out] vio_ptr A pointer to hold the new vio - **/ -typedef int vio_constructor(struct vdo *vdo, - void *parent, - void *buffer, - struct vio **vio_ptr); - -/** - * Create a new vio pool. 
- * - * @param [in] vdo the vdo - * @param [in] pool_size the number of vios in the pool - * @param [in] thread_id the ID of the thread using this pool - * @param [in] constructor the constructor for vios in the pool - * @param [in] context the context that each entry will have - * @param [out] pool_ptr the resulting pool - * - * @return a success or error code - **/ -int __must_check make_vio_pool(struct vdo *vdo, - size_t pool_size, - thread_id_t thread_id, - vio_constructor *constructor, - void *context, - struct vio_pool **pool_ptr); - -/** - * Destroy a vio pool - * - * @param pool the pool to free - **/ -void free_vio_pool(struct vio_pool *pool); - -/** - * Check whether an vio pool has outstanding entries. - * - * @return true if the pool is busy - **/ -bool __must_check is_vio_pool_busy(struct vio_pool *pool); - -/** - * Acquire a vio and buffer from the pool (asynchronous). - * - * @param pool the vio pool - * @param waiter object that is requesting a vio - * - * @return VDO_SUCCESS or an error - **/ -int acquire_vio_from_pool(struct vio_pool *pool, struct waiter *waiter); - -/** - * Return a vio and its buffer to the pool. - * - * @param pool the vio pool - * @param entry a vio pool entry - **/ -void return_vio_to_pool(struct vio_pool *pool, struct vio_pool_entry *entry); - -/** - * Convert a list entry to the vio_pool_entry that contains it. - * - * @param entry The list entry to convert - * - * @return The vio_pool_entry wrapping the list entry - **/ -static inline struct vio_pool_entry *as_vio_pool_entry(struct list_head *entry) -{ - return list_entry(entry, struct vio_pool_entry, available_entry); -} - -#endif // VIO_POOL_H diff --git a/vdo/vioRead.c b/vdo/vioRead.c deleted file mode 100644 index d26f4407..00000000 --- a/vdo/vioRead.c +++ /dev/null @@ -1,140 +0,0 @@ -/* - * Copyright Red Hat - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/sulfur/src/c++/vdo/base/vioRead.c#22 $ - */ - -#include "vioRead.h" - -#include "logger.h" - -#include "blockMap.h" -#include "dataVIO.h" -#include "vdoInternal.h" -#include "vioWrite.h" - -/** - * Do the modify-write part of a read-modify-write cycle. This callback is - * registered in read_block(). - * - * @param completion The data_vio which has just finished its read - **/ -static void modify_for_partial_write(struct vdo_completion *completion) -{ - struct data_vio *data_vio = as_data_vio(completion); - struct vio *vio = data_vio_as_vio(data_vio); - - assert_data_vio_in_logical_zone(data_vio); - - if (completion->result != VDO_SUCCESS) { - complete_data_vio(completion); - return; - } - - vdo_apply_partial_write(data_vio); - vio->operation = VIO_WRITE | (vio->operation & ~VIO_READ_WRITE_MASK); - data_vio->is_partial_write = true; - launch_write_data_vio(data_vio); -} - -/** - * Read a block asynchronously. 
This is the callback registered in - * read_block_mapping(). - * - * @param completion The data_vio to read - **/ -static void read_block(struct vdo_completion *completion) -{ - struct data_vio *data_vio = as_data_vio(completion); - struct vio *vio = as_vio(completion); - - if (completion->result != VDO_SUCCESS) { - complete_data_vio(completion); - return; - } - - completion->callback = - (is_read_vio(vio) ? complete_data_vio - : modify_for_partial_write); - - if (data_vio->mapped.pbn == VDO_ZERO_BLOCK) { - zero_data_vio(data_vio); - invoke_vdo_completion_callback(completion); - return; - } - - vio->physical = data_vio->mapped.pbn; - data_vio->last_async_operation = VIO_ASYNC_OP_READ_DATA_VIO; - read_data_vio(data_vio); -} - -/** - * Read the data_vio's mapping from the block map. This callback is registered - * in launch_read_data_vio(). - * - * @param completion The data_vio to be read - **/ -static void read_block_mapping(struct vdo_completion *completion) -{ - struct data_vio *data_vio = as_data_vio(completion); - - if (completion->result != VDO_SUCCESS) { - complete_data_vio(completion); - return; - } - - assert_data_vio_in_logical_zone(data_vio); - set_data_vio_logical_callback(data_vio, read_block); - data_vio->last_async_operation = VIO_ASYNC_OP_GET_MAPPED_BLOCK_FOR_READ; - vdo_get_mapped_block(data_vio); -} - -/**********************************************************************/ -void launch_read_data_vio(struct data_vio *data_vio) -{ - assert_data_vio_in_logical_zone(data_vio); - data_vio->last_async_operation = VIO_ASYNC_OP_FIND_BLOCK_MAP_SLOT; - // Go find the block map slot for the LBN mapping. - vdo_find_block_map_slot(data_vio, - read_block_mapping, - get_vdo_logical_zone_thread_id(data_vio->logical.zone)); -} - -/** - * Release the logical block lock which a read data_vio obtained now that it - * is done. - * - * @param completion The data_vio - **/ -static void release_logical_lock(struct vdo_completion *completion) -{ - struct data_vio *data_vio = as_data_vio(completion); - assert_data_vio_in_logical_zone(data_vio); - vdo_release_logical_block_lock(data_vio); - vio_done_callback(completion); -} - -/** - * Clean up a data_vio which has finished processing a read. - * - * @param data_vio The data_vio to clean up - **/ -void cleanup_read_data_vio(struct data_vio *data_vio) -{ - launch_data_vio_logical_callback(data_vio, release_logical_lock); -} diff --git a/vdo/vioRead.h b/vdo/vioRead.h deleted file mode 100644 index b40ed788..00000000 --- a/vdo/vioRead.h +++ /dev/null @@ -1,43 +0,0 @@ -/* - * Copyright Red Hat - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. 
- * - * $Id: //eng/vdo-releases/sulfur/src/c++/vdo/base/vioRead.h#5 $ - */ - -#ifndef VIO_READ_H -#define VIO_READ_H - -#include "types.h" - -/** - * Start the asynchronous processing of the data_vio for a read or - * read-modify-write request which has acquired a lock on its logical block. - * The first step is to perform a block map lookup. - * - * @param data_vio The data_vio doing the read - **/ -void launch_read_data_vio(struct data_vio *data_vio); - -/** - * Clean up a data_vio which has finished processing a read. - * - * @param data_vio The data_vio to clean up - **/ -void cleanup_read_data_vio(struct data_vio *data_vio); - -#endif /* VIO_READ_H */ diff --git a/vdo/vioWrite.h b/vdo/vioWrite.h deleted file mode 100644 index 3a88ec41..00000000 --- a/vdo/vioWrite.h +++ /dev/null @@ -1,51 +0,0 @@ -/* - * Copyright Red Hat - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/sulfur/src/c++/vdo/base/vioWrite.h#8 $ - */ - -#ifndef VIO_WRITE_H -#define VIO_WRITE_H - -#include "types.h" - -/** - * Start the asynchronous processing of a data_vio for a write request which has - * acquired a lock on its logical block by joining the current flush generation - * and then attempting to allocate a physical block. - * - * @param data_vio The data_vio doing the write - **/ -void launch_write_data_vio(struct data_vio *data_vio); - -/** - * Clean up a data_vio which has finished processing a write. - * - * @param data_vio The data_vio to clean up - **/ -void cleanup_write_data_vio(struct data_vio *data_vio); - -/** - * Continue a write by attempting to compress the data. This is a re-entry - * point to vio_write used by hash locks. - * - * @param data_vio The data_vio to be compressed - **/ -void vio_compress_data(struct data_vio *data_vio); - -#endif /* VIO_WRITE_H */ diff --git a/vdo/volumeGeometry.c b/vdo/volume-geometry.c similarity index 57% rename from vdo/volumeGeometry.c rename to vdo/volume-geometry.c index 1c9f5b74..db9cc369 100644 --- a/vdo/volumeGeometry.c +++ b/vdo/volume-geometry.c @@ -1,39 +1,23 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * Copyright Red Hat - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. 
- * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/sulfur/src/c++/vdo/base/volumeGeometry.c#31 $ */ -#include "volumeGeometry.h" +#include "volume-geometry.h" #include "buffer.h" #include "logger.h" -#include "memoryAlloc.h" +#include "memory-alloc.h" #include "numeric.h" #include "permassert.h" #include "bio.h" -#include "checksum.h" #include "constants.h" #include "header.h" -#include "releaseVersions.h" -#include "statusCodes.h" +#include "release-versions.h" +#include "status-codes.h" #include "types.h" +#include "vdo.h" enum { MAGIC_NUMBER_SIZE = 8, @@ -43,7 +27,7 @@ enum { struct geometry_block { char magic_number[MAGIC_NUMBER_SIZE]; struct header header; - crc32_checksum_t checksum; + uint32_t checksum; } __packed; static const struct header GEOMETRY_BLOCK_HEADER_5_0 = { @@ -52,8 +36,10 @@ static const struct header GEOMETRY_BLOCK_HEADER_5_0 = { .major_version = 5, .minor_version = 0, }, - // Note: this size isn't just the payload size following the header, - // like it is everywhere else in VDO. + /* + * Note: this size isn't just the payload size following the header, + * like it is everywhere else in VDO. + */ .size = sizeof(struct geometry_block) + sizeof(struct volume_geometry), }; @@ -63,8 +49,10 @@ static const struct header GEOMETRY_BLOCK_HEADER_4_0 = { .major_version = 4, .minor_version = 0, }, - // Note: this size isn't just the payload size following the header, - // like it is everywhere else in VDO. + /* + * Note: this size isn't just the payload size following the header, + * like it is everywhere else in VDO. + */ .size = sizeof(struct geometry_block) + sizeof(struct volume_geometry_4_0), }; @@ -77,21 +65,22 @@ static const release_version_number_t COMPATIBLE_RELEASE_VERSIONS[] = { }; /** - * Determine whether the supplied release version can be understood by - * the VDO code. + * is_loadable_release_version() - Determine whether the supplied + * release version can be understood + * by the VDO code. + * @version: The release version number to check. * - * @param version The release version number to check - * - * @return True if the given version can be loaded. - **/ + * Return: True if the given version can be loaded. + */ static inline bool is_loadable_release_version(release_version_number_t version) { unsigned int i; + if (version == VDO_CURRENT_RELEASE_VERSION_NUMBER) { return true; } - for (i = 0; i < COUNT_OF(COMPATIBLE_RELEASE_VERSIONS); i++) { + for (i = 0; i < ARRAY_SIZE(COMPATIBLE_RELEASE_VERSIONS); i++) { if (version == COMPATIBLE_RELEASE_VERSIONS[i]) { return true; } @@ -101,25 +90,25 @@ static inline bool is_loadable_release_version(release_version_number_t version) } /** - * Decode the on-disk representation of an index configuration from a buffer. - * - * @param buffer A buffer positioned at the start of the encoding - * @param config The structure to receive the decoded fields + * decode_index_config() - Decode the on-disk representation of an + * index configuration from a buffer. + * @buffer: A buffer positioned at the start of the encoding. + * @config: The structure to receive the decoded fields. * - * @return UDS_SUCCESS or an error - **/ + * Return: UDS_SUCCESS or an error. 
+ */ static int decode_index_config(struct buffer *buffer, struct index_config *config) { uint32_t mem; - uint32_t checkpoint_frequency; bool sparse; int result = get_uint32_le_from_buffer(buffer, &mem); + if (result != VDO_SUCCESS) { return result; } - result = get_uint32_le_from_buffer(buffer, &checkpoint_frequency); + result = skip_forward(buffer, sizeof(uint32_t)); if (result != VDO_SUCCESS) { return result; } @@ -131,7 +120,6 @@ static int decode_index_config(struct buffer *buffer, *config = (struct index_config) { .mem = mem, - .checkpoint_frequency = checkpoint_frequency, .sparse = sparse, }; return VDO_SUCCESS; @@ -139,19 +127,20 @@ static int decode_index_config(struct buffer *buffer, /** - * Decode the on-disk representation of a volume region from a buffer. + * decode_volume_region() - Decode the on-disk representation of a + * volume region from a buffer. + * @buffer: A buffer positioned at the start of the encoding. + * @region: The structure to receive the decoded fields. * - * @param buffer A buffer positioned at the start of the encoding - * @param region The structure to receive the decoded fields - * - * @return UDS_SUCCESS or an error - **/ + * Return: UDS_SUCCESS or an error. + */ static int decode_volume_region(struct buffer *buffer, struct volume_region *region) { physical_block_number_t start_block; enum volume_region_id id; int result = get_uint32_le_from_buffer(buffer, &id); + if (result != VDO_SUCCESS) { return result; } @@ -170,14 +159,14 @@ static int decode_volume_region(struct buffer *buffer, /** - * Decode the on-disk representation of a volume geometry from a buffer. - * - * @param buffer A buffer positioned at the start of the encoding - * @param geometry The structure to receive the decoded fields - * @param version The geometry block version to decode + * decode_volume_geometry() - Decode the on-disk representation of a + * volume geometry from a buffer. + * @buffer: A buffer positioned at the start of the encoding. + * @geometry: The structure to receive the decoded fields. + * @version: The geometry block version to decode. * - * @return UDS_SUCCESS or an error - **/ + * Return: UDS_SUCCESS or an error. + */ static int decode_volume_geometry(struct buffer *buffer, struct volume_geometry *geometry, uint32_t version) @@ -187,6 +176,7 @@ static int decode_volume_geometry(struct buffer *buffer, nonce_t nonce; block_count_t bio_offset; int result = get_uint32_le_from_buffer(buffer, &release_version); + if (result != VDO_SUCCESS) { return result; } @@ -214,7 +204,7 @@ static int decode_volume_geometry(struct buffer *buffer, } geometry->bio_offset = bio_offset; - for (id = 0; id < VOLUME_REGION_COUNT; id++) { + for (id = 0; id < VDO_VOLUME_REGION_COUNT; id++) { result = decode_volume_region(buffer, &geometry->regions[id]); if (result != VDO_SUCCESS) { return result; @@ -226,14 +216,14 @@ static int decode_volume_geometry(struct buffer *buffer, /** - * Decode the on-disk representation of a geometry block, up to but not - * including the checksum, from a buffer. + * decode_geometry_block() - Decode the on-disk representation of a + * geometry block, up to but not including + * the checksum, from a buffer. + * @buffer: A buffer positioned at the start of the block. + * @geometry: The structure to receive the decoded volume geometry fields. 
* - * @param buffer A buffer positioned at the start of the block - * @param geometry The structure to receive the decoded volume geometry fields - * - * @return UDS_SUCCESS or an error - **/ + * Return: UDS_SUCCESS or an error. + */ static int decode_geometry_block(struct buffer *buffer, struct volume_geometry *geometry) { @@ -249,16 +239,16 @@ static int decode_geometry_block(struct buffer *buffer, return result; } - result = decode_vdo_header(buffer, &header); + result = vdo_decode_header(buffer, &header); if (result != VDO_SUCCESS) { return result; } if (header.version.major_version <= 4) { - result = validate_vdo_header(&GEOMETRY_BLOCK_HEADER_4_0, + result = vdo_validate_header(&GEOMETRY_BLOCK_HEADER_4_0, &header, true, __func__); } else { - result = validate_vdo_header(&GEOMETRY_BLOCK_HEADER_5_0, + result = vdo_validate_header(&GEOMETRY_BLOCK_HEADER_5_0, &header, true, __func__); } if (result != VDO_SUCCESS) { @@ -271,22 +261,21 @@ static int decode_geometry_block(struct buffer *buffer, return result; } - // Leave the CRC for the caller to decode and verify. + /* Leave the CRC for the caller to decode and verify. */ return ASSERT(header.size == (uncompacted_amount(buffer) + - sizeof(crc32_checksum_t)), + sizeof(uint32_t)), "should have decoded up to the geometry checksum"); } /** - * Decode and validate an encoded geometry block. - * - * @param block The encoded geometry block - * @param geometry The structure to receive the decoded fields - **/ + * vdo_parse_geometry_block() - Decode and validate an encoded geometry block. + * @block: The encoded geometry block. + * @geometry: The structure to receive the decoded fields. + */ static int __must_check vdo_parse_geometry_block(byte *block, struct volume_geometry *geometry) { - crc32_checksum_t checksum, saved_checksum; + uint32_t checksum, saved_checksum; struct buffer *buffer; int result; @@ -301,16 +290,15 @@ vdo_parse_geometry_block(byte *block, struct volume_geometry *geometry) return result; } - // Checksum everything decoded so far. - checksum = vdo_update_crc32(VDO_INITIAL_CHECKSUM, block, - uncompacted_amount(buffer)); + /* Checksum everything decoded so far. */ + checksum = vdo_crc32(block, uncompacted_amount(buffer)); result = get_uint32_le_from_buffer(buffer, &saved_checksum); if (result != VDO_SUCCESS) { free_buffer(UDS_FORGET(buffer)); return result; } - // Finished all decoding. Everything that follows is validation code. + /* Finished all decoding. Everything that follows is validation code. */ free_buffer(UDS_FORGET(buffer)); if (!is_loadable_release_version(geometry->release_version)) { @@ -323,13 +311,21 @@ vdo_parse_geometry_block(byte *block, struct volume_geometry *geometry) VDO_CHECKSUM_MISMATCH); } -/**********************************************************************/ +/** + * vdo_read_geometry_block() - Synchronously read a geometry block from a + * block device. + * @bdev: The block device containing the block to read. + * @geometry: A volume_geometry to read into. + * + * Return: VDO_SUCCESS or an error code. 
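
The parsing routine above checksums everything decoded before the trailing CRC and reports a mismatch as VDO_CHECKSUM_MISMATCH. A self-contained sketch of that rule, using a generic bitwise CRC-32 (the polynomial, seed, and finalization of vdo_crc32() are assumptions here), might look like this:

    #include <stddef.h>
    #include <stdint.h>

    /* Illustrative only: a generic bitwise CRC-32 standing in for vdo_crc32(). */
    static uint32_t example_crc32(const uint8_t *data, size_t length)
    {
            uint32_t crc = 0xffffffffU;
            size_t i;
            int bit;

            for (i = 0; i < length; i++) {
                    crc ^= data[i];
                    for (bit = 0; bit < 8; bit++)
                            crc = (crc >> 1) ^ ((crc & 1U) ? 0xedb88320U : 0);
            }

            return ~crc;
    }

    /* decoded_length: the number of bytes consumed before the checksum field. */
    static int example_verify_geometry_block(const uint8_t *block,
                                             size_t decoded_length)
    {
            uint32_t computed = example_crc32(block, decoded_length);
            /* The stored checksum follows the decoded bytes, little-endian. */
            uint32_t saved = ((uint32_t) block[decoded_length]) |
                             (((uint32_t) block[decoded_length + 1]) << 8) |
                             (((uint32_t) block[decoded_length + 2]) << 16) |
                             (((uint32_t) block[decoded_length + 3]) << 24);

            /* A mismatch corresponds to VDO_CHECKSUM_MISMATCH above. */
            return (computed == saved) ? 0 : -1;
    }
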
+ */ int vdo_read_geometry_block(struct block_device *bdev, struct volume_geometry *geometry) { struct bio *bio; byte *block; int result = UDS_ALLOCATE(VDO_BLOCK_SIZE, byte, __func__, &block); + if (result != VDO_SUCCESS) { return result; } @@ -345,7 +341,7 @@ int vdo_read_geometry_block(struct block_device *bdev, NULL, NULL, REQ_OP_READ, - GEOMETRY_BLOCK_LOCATION); + VDO_GEOMETRY_BLOCK_LOCATION); if (result != VDO_SUCCESS) { vdo_free_bio(bio); UDS_FREE(block); @@ -368,28 +364,3 @@ int vdo_read_geometry_block(struct block_device *bdev, return result; } - -/************************************************************************/ -int -vdo_index_config_to_uds_configuration(const struct index_config *index_config, - struct uds_configuration **uds_config_ptr) -{ - struct uds_configuration *uds_configuration; - int result = uds_initialize_configuration(&uds_configuration, - index_config->mem); - if (result != UDS_SUCCESS) { - return uds_log_error_strerror(result, - "error initializing configuration"); - } - - uds_configuration_set_sparse(uds_configuration, index_config->sparse); - *uds_config_ptr = uds_configuration; - return VDO_SUCCESS; -} - -/************************************************************************/ -void vdo_index_config_to_uds_parameters(const struct index_config *index_config, - struct uds_parameters *user_params) -{ - user_params->checkpoint_frequency = index_config->checkpoint_frequency; -} diff --git a/vdo/volume-geometry.h b/vdo/volume-geometry.h new file mode 100644 index 00000000..c8520d49 --- /dev/null +++ b/vdo/volume-geometry.h @@ -0,0 +1,117 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright Red Hat + */ + +#ifndef VOLUME_GEOMETRY_H +#define VOLUME_GEOMETRY_H + + +#include +#include + +#include "uds.h" + +#include "kernel-types.h" +#include "types.h" + +enum { + VDO_GEOMETRY_BLOCK_LOCATION = 0, +}; + +struct index_config { + uint32_t mem; + uint32_t unused; + bool sparse; +} __packed; + +enum volume_region_id { + VDO_INDEX_REGION = 0, + VDO_DATA_REGION = 1, + VDO_VOLUME_REGION_COUNT, +}; + +struct volume_region { + /* The ID of the region */ + enum volume_region_id id; + /* + * The absolute starting offset on the device. The region continues + * until the next region begins. + */ + physical_block_number_t start_block; +} __packed; + +struct volume_geometry { + /* The release version number of this volume */ + release_version_number_t release_version; + /* The nonce of this volume */ + nonce_t nonce; + /* The uuid of this volume */ + uuid_t uuid; + /* The block offset to be applied to bios */ + block_count_t bio_offset; + /* The regions in ID order */ + struct volume_region regions[VDO_VOLUME_REGION_COUNT]; + /* The index config */ + struct index_config index_config; +} __packed; + +/* This volume geometry struct is used for sizing only */ +struct volume_geometry_4_0 { + /* The release version number of this volume */ + release_version_number_t release_version; + /* The nonce of this volume */ + nonce_t nonce; + /* The uuid of this volume */ + uuid_t uuid; + /* The regions in ID order */ + struct volume_region regions[VDO_VOLUME_REGION_COUNT]; + /* The index config */ + struct index_config index_config; +} __packed; + +/** + * vdo_get_index_region_start() - Get the start of the index region from a + * geometry. + * @geometry: The geometry. + * + * Return: The start of the index region. 
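
As a minimal usage sketch, a hypothetical caller of this header could read the geometry and locate the data region with the helpers it declares; acquiring the block device and acting on the result are elided:

    /*
     * Illustrative only: locate the VDO data region after loading the
     * geometry block from the start of the storage device.
     */
    static int example_locate_data_region(struct block_device *bdev,
                                          physical_block_number_t *data_start)
    {
            struct volume_geometry geometry;
            int result = vdo_read_geometry_block(bdev, &geometry);

            if (result != VDO_SUCCESS)
                    return result;

            /* The UDS index occupies [index start, data start); VDO data follows. */
            *data_start = vdo_get_data_region_start(geometry);
            return VDO_SUCCESS;
    }
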
+ */ +static inline physical_block_number_t __must_check +vdo_get_index_region_start(struct volume_geometry geometry) +{ + return geometry.regions[VDO_INDEX_REGION].start_block; +} + +/** + * vdo_get_data_region_start() - Get the start of the data region from a + * geometry. + * @geometry: The geometry. + * + * Return: The start of the data region. + */ +static inline physical_block_number_t __must_check +vdo_get_data_region_start(struct volume_geometry geometry) +{ + return geometry.regions[VDO_DATA_REGION].start_block; +} + +/** + * vdo_get_index_region_size() - Get the size of the index region from a + * geometry. + * @geometry: The geometry. + * + * Return: The size of the index region. + */ +static inline physical_block_number_t __must_check +vdo_get_index_region_size(struct volume_geometry geometry) +{ + return vdo_get_data_region_start(geometry) - + vdo_get_index_region_start(geometry); +} + +int __must_check +vdo_read_geometry_block(struct block_device *bdev, + struct volume_geometry *geometry); + +#endif /* VOLUME_GEOMETRY_H */ diff --git a/vdo/volume-index-ops.c b/vdo/volume-index-ops.c new file mode 100644 index 00000000..44964f00 --- /dev/null +++ b/vdo/volume-index-ops.c @@ -0,0 +1,135 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright Red Hat + */ +#include "volume-index-ops.h" + +#include "compiler.h" +#include "config.h" +#include "errors.h" +#include "geometry.h" +#include "logger.h" +#include "volume-index005.h" +#include "volume-index006.h" +#include "memory-alloc.h" +#include "permassert.h" +#include "uds.h" + +static INLINE bool uses_sparse(const struct configuration *config) +{ + return is_sparse_geometry(config->geometry); +} + +void get_volume_index_combined_stats(const struct volume_index *volume_index, + struct volume_index_stats *stats) +{ + struct volume_index_stats dense, sparse; + + get_volume_index_stats(volume_index, &dense, &sparse); + stats->memory_allocated = + dense.memory_allocated + sparse.memory_allocated; + stats->rebalance_time = dense.rebalance_time + sparse.rebalance_time; + stats->rebalance_count = + dense.rebalance_count + sparse.rebalance_count; + stats->record_count = dense.record_count + sparse.record_count; + stats->collision_count = + dense.collision_count + sparse.collision_count; + stats->discard_count = dense.discard_count + sparse.discard_count; + stats->overflow_count = dense.overflow_count + sparse.overflow_count; + stats->num_lists = dense.num_lists + sparse.num_lists; + stats->early_flushes = dense.early_flushes + sparse.early_flushes; +} + +int make_volume_index(const struct configuration *config, + uint64_t volume_nonce, + struct volume_index **volume_index) +{ + if (uses_sparse(config)) { + return make_volume_index006(config, volume_nonce, + volume_index); + } else { + return make_volume_index005(config, volume_nonce, + volume_index); + } +} + +int compute_volume_index_save_blocks(const struct configuration *config, + size_t block_size, + uint64_t *block_count) +{ + size_t num_bytes; + int result = (uses_sparse(config) ? 
+ compute_volume_index_save_bytes006(config, + &num_bytes) : + compute_volume_index_save_bytes005(config, + &num_bytes)); + if (result != UDS_SUCCESS) { + return result; + } + num_bytes += sizeof(struct delta_list_save_info); + *block_count = DIV_ROUND_UP(num_bytes, block_size) + MAX_ZONES; + return UDS_SUCCESS; +} + +int save_volume_index(struct volume_index *volume_index, + struct buffered_writer **writers, + unsigned int num_writers) +{ + int result = UDS_SUCCESS; + unsigned int zone; + + for (zone = 0; zone < num_writers; ++zone) { + result = start_saving_volume_index(volume_index, + zone, + writers[zone]); + if (result != UDS_SUCCESS) { + break; + } + + result = finish_saving_volume_index(volume_index, zone); + if (result != UDS_SUCCESS) { + break; + } + + result = write_guard_delta_list(writers[zone]); + if (result != UDS_SUCCESS) { + break; + } + + result = flush_buffered_writer(writers[zone]); + if (result != UDS_SUCCESS) { + break; + } + } + + return result; +} + +int load_volume_index(struct volume_index *volume_index, + struct buffered_reader **readers, + unsigned int num_readers) +{ + /* Start by reading the "header" section of the stream */ + int result = start_restoring_volume_index(volume_index, + readers, + num_readers); + if (result != UDS_SUCCESS) { + return result; + } + + result = finish_restoring_volume_index(volume_index, + readers, + num_readers); + if (result != UDS_SUCCESS) { + abort_restoring_volume_index(volume_index); + return result; + } + + /* Check the final guard lists to make sure we read everything. */ + result = check_guard_delta_lists(readers, num_readers); + if (result != UDS_SUCCESS) { + abort_restoring_volume_index(volume_index); + } + + return result; +} diff --git a/uds/volumeIndexOps.h b/vdo/volume-index-ops.h similarity index 62% rename from uds/volumeIndexOps.h rename to vdo/volume-index-ops.h index ccd3bb9d..c3329e32 100644 --- a/uds/volumeIndexOps.h +++ b/vdo/volume-index-ops.h @@ -1,65 +1,29 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ /* * Copyright Red Hat - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. 
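
A minimal sketch of how save_volume_index() and load_volume_index() above might be driven, with the per-zone buffered writers and readers assumed to be set up by the caller; reloading immediately after saving is contrived and serves only to show the call order:

    /* Illustrative only: the save/load call sequence for a volume index. */
    static int example_save_and_reload(struct volume_index *volume_index,
                                       struct buffered_writer **writers,
                                       struct buffered_reader **readers,
                                       unsigned int num_zones)
    {
            /*
             * Each zone is saved independently and terminated with a guard
             * delta list so a truncated save can be detected later.
             */
            int result = save_volume_index(volume_index, writers, num_zones);

            if (result != UDS_SUCCESS)
                    return result;

            /*
             * Loading restores the per-zone headers and delta lists, then
             * verifies the guard lists written above.
             */
            return load_volume_index(volume_index, readers, num_zones);
    }
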
- * - * $Id: //eng/uds-releases/krusty/src/uds/volumeIndexOps.h#4 $ */ #ifndef VOLUMEINDEXOPS_H #define VOLUMEINDEXOPS_H 1 #include "compiler.h" -#include "deltaIndex.h" -#include "indexComponent.h" -#include "indexConfig.h" +#include "config.h" +#include "delta-index.h" #include "uds-threads.h" #include "uds.h" -extern const struct index_component_info *const VOLUME_INDEX_INFO; extern unsigned int min_volume_index_delta_lists; struct volume_index_stats { - size_t memory_allocated; // Number of bytes allocated - ktime_t rebalance_time; // Nanoseconds spent rebalancing - int rebalance_count; // Number of memory rebalances - long record_count; // The number of records in the index - long collision_count; // The number of collision records - long discard_count; // The number of records removed - long overflow_count; // The number of UDS_OVERFLOWs detected - unsigned int num_lists; // The number of delta lists - long early_flushes; // Number of early flushes -}; - -/* - * The volume_index_triage structure is used by lookup_volume_index_name(), - * which is a read-only operation that looks at the chunk name and returns - * some information used by the index to select the thread/queue/code_path - * that will process the chunk. - */ -struct volume_index_triage { - uint64_t virtual_chapter; // If in_sampled_chapter is true, then this - // is the chapter containing the entry for - // the chunk name - unsigned int zone; // The zone containing the chunk name - bool is_sample; // If true, this chunk name belongs to the - // sampled index - bool in_sampled_chapter; // If true, this chunk already has an entry - // in the sampled index and virtual_chapter - // is valid + size_t memory_allocated; /* Number of bytes allocated */ + ktime_t rebalance_time; /* Nanoseconds spent rebalancing */ + int rebalance_count; /* Number of memory rebalances */ + long record_count; /* The number of records in the index */ + long collision_count; /* The number of collision records */ + long discard_count; /* The number of records removed */ + long overflow_count; /* The number of UDS_OVERFLOWs detected */ + unsigned int num_lists; /* The number of delta lists */ + long early_flushes; /* Number of early flushes */ }; /* @@ -72,35 +36,35 @@ struct volume_index_triage { * a volume index record. 
*/ struct volume_index_record { - // Public fields - uint64_t virtual_chapter; // Chapter where the block info is found - bool is_collision; // This record is a collision - bool is_found; // This record is the block searched for - - // Private fields - unsigned char magic; // The magic number for valid - // records - unsigned int zone_number; // Zone that contains this block - struct volume_index *volume_index; // The volume index - struct mutex *mutex; // Mutex that must be held while - // accessing this delta index - // entry; used only for a - // sampled index; otherwise is - // NULL - const struct uds_chunk_name *name; // The blockname to which this - // record refers - struct delta_index_entry delta_entry; // The delta index entry for - // this record + /* Public fields */ + uint64_t virtual_chapter; /* Chapter where the block info is found */ + bool is_collision; /* This record is a collision */ + bool is_found; /* This record is the block searched for */ + + /* Private fields */ + unsigned char magic; /* The magic number for valid */ + /* records */ + unsigned int zone_number; /* Zone that contains this block */ + struct volume_index *volume_index; /* The volume index */ + struct mutex *mutex; /* Mutex that must be held while */ + /* accessing this delta index */ + /* entry; used only for a */ + /* sampled index; otherwise is */ + /* NULL */ + const struct uds_chunk_name *name; /* The blockname to which this */ + /* record refers */ + struct delta_index_entry delta_entry; /* The delta index entry for */ + /* this record */ }; struct volume_index { void (*abort_restoring_volume_index)(struct volume_index *volume_index); - int (*abort_saving_volume_index)(const struct volume_index *volume_index, - unsigned int zone_number); + int (*finish_restoring_volume_index)(struct volume_index *volume_index, + struct buffered_reader **buffered_readers, + unsigned int num_readers); int (*finish_saving_volume_index)(const struct volume_index *volume_index, unsigned int zone_number); void (*free_volume_index)(struct volume_index *volume_index); - size_t (*get_volume_index_memory_used)(const struct volume_index *volume_index); int (*get_volume_index_record)(struct volume_index *volume_index, const struct uds_chunk_name *name, struct volume_index_record *record); @@ -111,18 +75,10 @@ struct volume_index { const struct uds_chunk_name *name); bool (*is_volume_index_sample)(const struct volume_index *volume_index, const struct uds_chunk_name *name); - bool (*is_restoring_volume_index_done)(const struct volume_index *volume_index); - bool (*is_saving_volume_index_done)(const struct volume_index *volume_index, - unsigned int zone_number); - int (*lookup_volume_index_name)(const struct volume_index *volume_index, - const struct uds_chunk_name *name, - struct volume_index_triage *triage); - int (*lookup_volume_index_sampled_name)(const struct volume_index *volume_index, - const struct uds_chunk_name *name, - struct volume_index_triage *triage); - int (*restore_delta_list_to_volume_index)(struct volume_index *volume_index, - const struct delta_list_save_info *dlsi, - const byte data[DELTA_LIST_MAX_BYTE_COUNT]); + uint64_t (*lookup_volume_index_name)(const struct volume_index *volume_index, + const struct uds_chunk_name *name); + uint64_t (*lookup_volume_index_sampled_name)(const struct volume_index *volume_index, + const struct uds_chunk_name *name); void (*set_volume_index_open_chapter)(struct volume_index *volume_index, uint64_t virtual_chapter); void (*set_volume_index_tag)(struct volume_index 
*volume_index, @@ -132,7 +88,7 @@ struct volume_index { uint64_t virtual_chapter); int (*start_restoring_volume_index)(struct volume_index *volume_index, struct buffered_reader **buffered_readers, - int num_readers); + unsigned int num_readers); int (*start_saving_volume_index)(const struct volume_index *volume_index, unsigned int zone_number, struct buffered_writer *buffered_writer); @@ -151,14 +107,12 @@ void get_volume_index_combined_stats(const struct volume_index *volume_index, * Make a new volume index. * * @param config The configuration of the volume index - * @param num_zones The number of zones * @param volume_nonce The nonce used to store the index * @param volume_index Location to hold new volume index ptr * * @return error code or UDS_SUCCESS **/ int __must_check make_volume_index(const struct configuration *config, - unsigned int num_zones, uint64_t volume_nonce, struct volume_index **volume_index); @@ -178,17 +132,21 @@ compute_volume_index_save_blocks(const struct configuration *config, uint64_t *block_count); /** - * Restore a volume index. This is exposed for unit tests. + * Restore a volume index. * + * @param volume_index The volume index * @param readers The readers to read from. * @param num_readers The number of readers. - * @param volume_index The volume index * * @return UDS_SUCCESS on success, or an error code on failure **/ -int __must_check restore_volume_index(struct buffered_reader **readers, - unsigned int num_readers, - struct volume_index *volume_index); +int __must_check load_volume_index(struct volume_index *volume_index, + struct buffered_reader **readers, + unsigned int num_readers); + +int __must_check save_volume_index(struct volume_index *volume_index, + struct buffered_writer **writers, + unsigned int num_writers); /** * Abort restoring a volume index from an input stream. @@ -202,20 +160,20 @@ abort_restoring_volume_index(struct volume_index *volume_index) } /** - * Abort saving a volume index to an output stream. If an error occurred - * asynchronously during the save operation, it will be dropped. + * Finish restoring a volume index from an input stream. * - * @param volume_index The volume index - * @param zone_number The number of the zone to save - * - * @return UDS_SUCCESS on success, or an error code on failure + * @param volume_index The volume index to restore into + * @param buffered_readers The buffered readers to read the volume index from + * @param num_readers The number of buffered readers **/ static INLINE int -abort_saving_volume_index(const struct volume_index *volume_index, - unsigned int zone_number) +finish_restoring_volume_index(struct volume_index *volume_index, + struct buffered_reader **buffered_readers, + unsigned int num_readers) { - return volume_index->abort_saving_volume_index(volume_index, - zone_number); + return volume_index->finish_restoring_volume_index(volume_index, + buffered_readers, + num_readers); } /** @@ -243,20 +201,11 @@ finish_saving_volume_index(const struct volume_index *volume_index, **/ static INLINE void free_volume_index(struct volume_index *volume_index) { - volume_index->free_volume_index(volume_index); -} + if (volume_index == NULL) { + return; + } -/** - * Get the number of bytes used for volume index entries. 
- * - * @param volume_index The volume index - * - * @return The number of bytes in use - **/ -static INLINE size_t -get_volume_index_memory_used(const struct volume_index *volume_index) -{ - return volume_index->get_volume_index_memory_used(volume_index); + volume_index->free_volume_index(volume_index); } /** @@ -337,77 +286,37 @@ is_volume_index_sample(const struct volume_index *volume_index, return volume_index->is_volume_index_sample(volume_index, name); } -/** - * Have all the data been read while restoring a volume index from an input - * stream? - * - * @param volume_index The volume index to restore into - * - * @return true if all the data are read - **/ -static INLINE bool -is_restoring_volume_index_done(const struct volume_index *volume_index) -{ - return volume_index->is_restoring_volume_index_done(volume_index); -} - -/** - * Have all the data been written while saving a volume index to an - * output stream? If the answer is yes, it is still necessary to call - * finish_saving_volume_index(), which will return quickly. - * - * @param volume_index The volume index - * @param zone_number The number of the zone to save - * - * @return true if all the data are written - **/ -static INLINE bool -is_saving_volume_index_done(const struct volume_index *volume_index, - unsigned int zone_number) -{ - return volume_index->is_saving_volume_index_done(volume_index, - zone_number); -} - /** * Do a quick read-only lookup of the chunk name and return information * needed by the index code to process the chunk name. * - * @param volume_index The volume index - * @param name The chunk name - * @param triage Information about the chunk name + * @param volume_index The volume index + * @param name The chunk name * - * @return UDS_SUCCESS or an error code + * @return The sparse virtual chapter, or UINT64_MAX if none **/ -static INLINE int +static INLINE uint64_t lookup_volume_index_name(const struct volume_index *volume_index, - const struct uds_chunk_name *name, - struct volume_index_triage *triage) + const struct uds_chunk_name *name) { - return volume_index->lookup_volume_index_name(volume_index, name, - triage); + return volume_index->lookup_volume_index_name(volume_index, name); } /** * Do a quick read-only lookup of the sampled chunk name and return * information needed by the index code to process the chunk name. * - * @param volume_index The volume index - * @param name The chunk name - * @param triage Information about the chunk name. The zone and - * is_sample fields are already filled in. Set - * in_sampled_chapter and virtual_chapter if the chunk - * name is found in the index. 
+ * @param volume_index The volume index + * @param name The chunk name * - * @return UDS_SUCCESS or an error code + * @return The sparse virtual chapter, or UINT64_MAX if none **/ -static INLINE int +static INLINE uint64_t lookup_volume_index_sampled_name(const struct volume_index *volume_index, - const struct uds_chunk_name *name, - struct volume_index_triage *triage) + const struct uds_chunk_name *name) { return volume_index->lookup_volume_index_sampled_name(volume_index, - name, triage); + name); } /** @@ -431,24 +340,6 @@ int __must_check put_volume_index_record(struct volume_index_record *record, int __must_check remove_volume_index_record(struct volume_index_record *record); -/** - * Restore a saved delta list - * - * @param volume_index The volume index to restore into - * @param dlsi The delta_list_save_info describing the delta list - * @param data The saved delta list bit stream - * - * @return error code or UDS_SUCCESS - **/ -static INLINE int -restore_delta_list_to_volume_index(struct volume_index *volume_index, - const struct delta_list_save_info *dlsi, - const byte data[DELTA_LIST_MAX_BYTE_COUNT]) -{ - return volume_index->restore_delta_list_to_volume_index(volume_index, - dlsi, data); -} - /** * Set the open chapter number. The volume index will be modified to index * the proper number of chapters ending with the new open chapter. @@ -533,7 +424,7 @@ set_volume_index_zone_open_chapter(struct volume_index *volume_index, static INLINE int start_restoring_volume_index(struct volume_index *volume_index, struct buffered_reader **buffered_readers, - int num_readers) + unsigned int num_readers) { return volume_index->start_restoring_volume_index(volume_index, buffered_readers, diff --git a/uds/volumeIndex005.c b/vdo/volume-index005.c similarity index 77% rename from uds/volumeIndex005.c rename to vdo/volume-index005.c index d7129e6a..22c6c621 100644 --- a/uds/volumeIndex005.c +++ b/vdo/volume-index005.c @@ -1,34 +1,18 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * Copyright Red Hat - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. 
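
The method table in struct volume_index is filled in by the dense (volume-index005) and sparse (volume-index006) implementations, each of which embeds the common structure and recovers itself with container_of(). A hypothetical miniature of the pattern, with illustrative names:

    struct example_index {
            struct volume_index common;     /* shared method table and entry point */
            unsigned int zone_count;        /* implementation-specific state */
    };

    static void example_set_open_chapter(struct volume_index *volume_index,
                                         uint64_t virtual_chapter)
    {
            /* Recover the concrete index from the embedded common structure. */
            struct example_index *index =
                    container_of(volume_index, struct example_index, common);

            (void) index;
            (void) virtual_chapter;
    }

    static void example_init_methods(struct example_index *index)
    {
            /* Wire the concrete method into the shared table at creation time. */
            index->common.set_volume_index_open_chapter = example_set_open_chapter;
    }
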
- * - * $Id: //eng/uds-releases/krusty/src/uds/volumeIndex005.c#5 $ */ -#include "volumeIndex005.h" +#include "volume-index005.h" #include "buffer.h" #include "compiler.h" +#include "config.h" #include "errors.h" #include "geometry.h" -#include "hashUtils.h" +#include "hash-utils.h" #include "logger.h" -#include "memoryAlloc.h" +#include "memory-alloc.h" #include "uds.h" -#include "zone.h" /* * The volume index is a kept as a delta index where the payload is a @@ -56,36 +40,36 @@ */ struct volume_index_zone { - uint64_t virtual_chapter_low; // The lowest virtual chapter indexed - uint64_t virtual_chapter_high; // The highest virtual chapter indexed - long num_early_flushes; // The number of early flushes + uint64_t virtual_chapter_low; /* The lowest virtual chapter indexed */ + uint64_t virtual_chapter_high; /* The highest virtual chapter indexed */ + long num_early_flushes; /* The number of early flushes */ } __attribute__((aligned(CACHE_LINE_BYTES))); struct volume_index5 { - struct volume_index common; // Common volume index methods - struct delta_index delta_index; // The delta index - uint64_t *flush_chapters; // The first chapter to be flushed - struct volume_index_zone *zones; // The Zones - uint64_t volume_nonce; // The volume nonce - uint64_t chapter_zone_bits; // Expected size of a chapter - // (per zone) - uint64_t max_zone_bits; // Maximum size index (per zone) - unsigned int address_bits; // Number of bits in address mask - unsigned int address_mask; // Mask to get address within delta - // list - unsigned int chapter_bits; // Number of bits in chapter number - unsigned int chapter_mask; // Largest storable chapter number - unsigned int num_chapters; // Number of chapters used - unsigned int num_delta_lists; // The number of delta lists - unsigned int num_zones; // The number of zones + struct volume_index common; /* Common volume index methods */ + struct delta_index delta_index; /* The delta index */ + uint64_t *flush_chapters; /* The first chapter to be flushed */ + struct volume_index_zone *zones; /* The Zones */ + uint64_t volume_nonce; /* The volume nonce */ + uint64_t chapter_zone_bits; /* Expected size of a chapter */ + /* (per zone) */ + uint64_t max_zone_bits; /* Maximum size index (per zone) */ + unsigned int address_bits; /* Number of bits in address mask */ + unsigned int address_mask; /* Mask to get address within delta */ + /* list */ + unsigned int chapter_bits; /* Number of bits in chapter number */ + unsigned int chapter_mask; /* Largest storable chapter number */ + unsigned int num_chapters; /* Number of chapters used */ + unsigned int num_delta_lists; /* The number of delta lists */ + unsigned int num_zones; /* The number of zones */ }; struct chapter_range { - unsigned int chapter_start; // The first chapter - unsigned int chapter_count; // The number of chapters + unsigned int chapter_start; /* The first chapter */ + unsigned int chapter_count; /* The number of chapters */ }; -// Constants for the magic byte of a volume_index_record +/* Constants for the magic byte of a volume_index_record */ static const byte volume_index_record_magic = 0xAA; static const byte bad_magic = 0; @@ -123,6 +107,7 @@ static INLINE unsigned int extract_dlist_num(const struct volume_index5 *vi5, const struct uds_chunk_name *name) { uint64_t bits = extract_volume_index_bytes(name); + return (bits >> vi5->address_bits) % vi5->num_delta_lists; } @@ -198,7 +183,6 @@ is_virtual_chapter_indexed(const struct volume_index_record *record, (virtual_chapter <= 
volume_index_zone->virtual_chapter_high)); } -/**********************************************************************/ /** * Flush an invalid entry from the volume index, advancing to the next * valid entry. @@ -219,6 +203,7 @@ flush_invalid_entries(struct volume_index_record *record, struct volume_index5, common); int result = next_delta_index_entry(&record->delta_entry); + if (result != UDS_SUCCESS) { return result; } @@ -265,7 +250,7 @@ static int get_volume_index_entry(struct volume_index_record *record, unsigned int next_chapter_to_invalidate = vi5->chapter_mask; int result = start_delta_index_search(&vi5->delta_index, list_number, - 0, false, &record->delta_entry); + 0, &record->delta_entry); if (result != UDS_SUCCESS) { return result; } @@ -283,12 +268,13 @@ static int get_volume_index_entry(struct volume_index_record *record, return result; } - // We probably found the record we want, but we need to keep going + /* We probably found the record we want, but we need to keep going */ other_record = *record; if (!other_record.delta_entry.at_end && (key == other_record.delta_entry.key)) { for (;;) { byte collision_name[UDS_CHUNK_NAME_SIZE]; + result = flush_invalid_entries(&other_record, flush_range, &next_chapter_to_invalidate); @@ -307,8 +293,10 @@ static int get_volume_index_entry(struct volume_index_record *record, if (memcmp(collision_name, record->name, UDS_CHUNK_NAME_SIZE) == 0) { - // This collision record is the one we are - // looking for + /* + * This collision record is the one we are + * looking for + */ *record = other_record; break; } @@ -329,7 +317,6 @@ static int get_volume_index_entry(struct volume_index_record *record, return UDS_SUCCESS; } -/**********************************************************************/ /** * Terminate and clean up the volume index * @@ -361,7 +348,7 @@ enum { MAGIC_SIZE = 8 }; static const char MAGIC_START[] = "MI5-0005"; struct vi005_data { - char magic[MAGIC_SIZE]; // MAGIC_START + char magic[MAGIC_SIZE]; /* MAGIC_START */ uint64_t volume_nonce; uint64_t virtual_chapter_low; uint64_t virtual_chapter_high; @@ -369,7 +356,6 @@ struct vi005_data { unsigned int num_lists; }; -/**********************************************************************/ /** * Set the tag value used when saving and/or restoring a volume index. 
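
The hashed name is carved into a delta list number and an in-list address, as in extract_dlist_num() above. A small sketch of that split; the address mask computation is an assumption based on the address_bits/address_mask fields:

    /* Illustrative only: derive (delta list, address) from the name bits. */
    static void example_split_name_bits(uint64_t bits,
                                        unsigned int address_bits,
                                        unsigned int num_delta_lists,
                                        unsigned int *delta_list,
                                        unsigned int *address)
    {
            /* The low-order bits locate a key within a delta list... */
            *address = (unsigned int) (bits & ((1ULL << address_bits) - 1));

            /* ...and the remaining bits select which delta list to search. */
            *delta_list = (unsigned int) ((bits >> address_bits) % num_delta_lists);
    }
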
* @@ -384,11 +370,11 @@ static void set_volume_index_tag_005(struct volume_index *volume_index, set_delta_index_tag(&vi5->delta_index, tag); } -/**********************************************************************/ static int __must_check encode_volume_index_header(struct buffer *buffer, struct vi005_data *header) { int result = put_bytes(buffer, MAGIC_SIZE, MAGIC_START); + if (result != UDS_SUCCESS) { return result; } @@ -442,14 +428,14 @@ start_saving_volume_index_005(const struct volume_index *volume_index, struct volume_index_zone *volume_index_zone = &vi5->zones[zone_number]; unsigned int first_list = - get_delta_index_zone_first_list(&vi5->delta_index, - zone_number); + get_delta_zone_first_list(&vi5->delta_index, zone_number); unsigned int num_lists = - get_delta_index_zone_num_lists(&vi5->delta_index, zone_number); + get_delta_zone_list_count(&vi5->delta_index, zone_number); struct vi005_data header; uint64_t *first_flush_chapter; struct buffer *buffer; + memset(&header, 0, sizeof(header)); memcpy(header.magic, MAGIC_START, MAGIC_SIZE); header.volume_nonce = vi5->volume_nonce; @@ -504,27 +490,6 @@ start_saving_volume_index_005(const struct volume_index *volume_index, buffered_writer); } -/**********************************************************************/ -/** - * Have all the data been written while saving a volume index to an output - * stream? If the answer is yes, it is still necessary to call - * finish_saving_volume_index(), which will return quickly. - * - * @param volume_index The volume index - * @param zone_number The number of the zone to save - * - * @return true if all the data are written - **/ -static bool -is_saving_volume_index_done_005(const struct volume_index *volume_index, - unsigned int zone_number) -{ - const struct volume_index5 *vi5 = - const_container_of(volume_index, struct volume_index5, common); - return is_saving_delta_index_done(&vi5->delta_index, zone_number); -} - -/**********************************************************************/ /** * Finish saving a volume index to an output stream. Force the writing of * all of the remaining data. If an error occurred asynchronously during @@ -544,26 +509,6 @@ finish_saving_volume_index_005(const struct volume_index *volume_index, return finish_saving_delta_index(&vi5->delta_index, zone_number); } -/**********************************************************************/ -/** - * Abort saving a volume index to an output stream. If an error occurred - * asynchronously during the save operation, it will be dropped. 
- * - * @param volume_index The volume index - * @param zone_number The number of the zone to save - * - * @return UDS_SUCCESS on success, or an error code on failure - **/ -static int -abort_saving_volume_index_005(const struct volume_index *volume_index, - unsigned int zone_number) -{ - const struct volume_index5 *vi5 = - const_container_of(volume_index, struct volume_index5, common); - return abort_saving_delta_index(&vi5->delta_index, zone_number); -} - -/**********************************************************************/ static int __must_check decode_volume_index_header(struct buffer *buffer, struct vi005_data *header) { @@ -600,7 +545,7 @@ static int __must_check decode_volume_index_header(struct buffer *buffer, buffer_length(buffer) - content_length(buffer), buffer_length(buffer)); if (result != UDS_SUCCESS) { - result = UDS_CORRUPT_COMPONENT; + result = UDS_CORRUPT_DATA; } return result; } @@ -617,14 +562,15 @@ static int __must_check decode_volume_index_header(struct buffer *buffer, static int start_restoring_volume_index_005(struct volume_index *volume_index, struct buffered_reader **buffered_readers, - int num_readers) + unsigned int num_readers) { unsigned int z; int result; struct volume_index5 *vi5; uint64_t *first_flush_chapter; uint64_t virtual_chapter_low = 0, virtual_chapter_high = 0; - int i; + unsigned int i; + if (volume_index == NULL) { return uds_log_warning_strerror(UDS_BAD_STATE, "cannot restore to null volume index"); @@ -636,6 +582,7 @@ start_restoring_volume_index_005(struct volume_index *volume_index, struct buffer *buffer; struct vi005_data header; int result = make_buffer(sizeof(struct vi005_data), &buffer); + if (result != UDS_SUCCESS) { return result; } @@ -662,14 +609,14 @@ start_restoring_volume_index_005(struct volume_index *volume_index, } if (memcmp(header.magic, MAGIC_START, MAGIC_SIZE) != 0) { - return uds_log_warning_strerror(UDS_CORRUPT_COMPONENT, + return uds_log_warning_strerror(UDS_CORRUPT_DATA, "volume index file had bad magic number"); } if (vi5->volume_nonce == 0) { vi5->volume_nonce = header.volume_nonce; } else if (header.volume_nonce != vi5->volume_nonce) { - return uds_log_warning_strerror(UDS_CORRUPT_COMPONENT, + return uds_log_warning_strerror(UDS_CORRUPT_DATA, "volume index volume nonce incorrect"); } @@ -678,7 +625,7 @@ start_restoring_volume_index_005(struct volume_index *volume_index, virtual_chapter_high = header.virtual_chapter_high; } else if (virtual_chapter_high != header.virtual_chapter_high) { - return uds_log_warning_strerror(UDS_CORRUPT_COMPONENT, + return uds_log_warning_strerror(UDS_CORRUPT_DATA, "Inconsistent volume index zone files: Chapter range is [%llu,%llu], chapter range %d is [%llu,%llu]", (unsigned long long) virtual_chapter_low, (unsigned long long) virtual_chapter_high, @@ -737,83 +684,69 @@ start_restoring_volume_index_005(struct volume_index *volume_index, return UDS_SUCCESS; } -/**********************************************************************/ /** - * Have all the data been read while restoring a volume index from an - * input stream? 
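
start_restoring_volume_index_005() above requires every zone's saved header to agree before any delta lists are read. A reduced sketch of that agreement check, with 'headers' standing in for the decoded per-zone vi005_data records and the reconciliation of virtual_chapter_low elided:

    /* Illustrative only: cross-zone agreement enforced during restore. */
    static int example_check_zone_headers(const struct vi005_data *headers,
                                          unsigned int num_zones)
    {
            unsigned int zone;

            for (zone = 1; zone < num_zones; zone++) {
                    /* Every zone file must describe the same volume... */
                    if (headers[zone].volume_nonce != headers[0].volume_nonce)
                            return UDS_CORRUPT_DATA;

                    /* ...and the same newest chapter. */
                    if (headers[zone].virtual_chapter_high !=
                        headers[0].virtual_chapter_high)
                            return UDS_CORRUPT_DATA;
            }

            return UDS_SUCCESS;
    }
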
- * - * @param volume_index The volume index to restore into - * - * @return true if all the data are read - **/ -static bool -is_restoring_volume_index_done_005(const struct volume_index *volume_index) -{ - const struct volume_index5 *vi5 = - const_container_of(volume_index, struct volume_index5, common); - return is_restoring_delta_index_done(&vi5->delta_index); -} - -/**********************************************************************/ -/** - * Restore a saved delta list - * - * @param volume_index The volume index to restore into - * @param dlsi The delta_list_save_info describing the delta list - * @param data The saved delta list bit stream + * Abort restoring a volume index from an input stream. * - * @return error code or UDS_SUCCESS + * @param volume_index The volume index **/ -static int -restore_delta_list_to_volume_index_005(struct volume_index *volume_index, - const struct delta_list_save_info *dlsi, - const byte data[DELTA_LIST_MAX_BYTE_COUNT]) +static void abort_restoring_volume_index_005(struct volume_index *volume_index) { struct volume_index5 *vi5 = container_of(volume_index, struct volume_index5, common); - return restore_delta_list_to_delta_index(&vi5->delta_index, dlsi, data); + abort_restoring_delta_index(&vi5->delta_index); } -/**********************************************************************/ /** - * Abort restoring a volume index from an input stream. + * Finish restoring a volume index from an input stream. * - * @param volume_index The volume index + * @param volume_index The volume index to restore into + * @param buffered_readers The buffered readers to read the volume index from + * @param num_readers The number of buffered readers **/ -static void abort_restoring_volume_index_005(struct volume_index *volume_index) +static int +finish_restoring_volume_index_005(struct volume_index *volume_index, + struct buffered_reader **buffered_readers, + unsigned int num_readers) { struct volume_index5 *vi5 = container_of(volume_index, struct volume_index5, common); - abort_restoring_delta_index(&vi5->delta_index); + return finish_restoring_delta_index(&vi5->delta_index, + buffered_readers, + num_readers); } -/**********************************************************************/ static void remove_newest_chapters(struct volume_index5 *vi5, unsigned int zone_number, uint64_t virtual_chapter) { - // Get the range of delta lists belonging to this zone + /* Get the range of delta lists belonging to this zone */ unsigned int first_list = - get_delta_index_zone_first_list(&vi5->delta_index, zone_number); + get_delta_zone_first_list(&vi5->delta_index, zone_number); unsigned int num_lists = - get_delta_index_zone_num_lists(&vi5->delta_index, zone_number); + get_delta_zone_list_count(&vi5->delta_index, zone_number); unsigned int last_list = first_list + num_lists - 1; if (virtual_chapter > vi5->chapter_mask) { unsigned int i; - // The virtual chapter number is large enough so that we can - // use the normal LRU mechanism without an unsigned underflow. + /* + * The virtual chapter number is large enough so that we can + * use the normal LRU mechanism without an unsigned underflow. 
+ */ virtual_chapter -= vi5->chapter_mask + 1; - // Eliminate the newest chapters by renumbering them to become - // the oldest chapters + /* + * Eliminate the newest chapters by renumbering them to become + * the oldest chapters + */ for (i = first_list; i <= last_list; i++) { if (virtual_chapter < vi5->flush_chapters[i]) { vi5->flush_chapters[i] = virtual_chapter; } } } else { - // Underflow will prevent the fast path. Do it the slow and - // painful way. + /* + * Underflow will prevent the fast path. Do it the slow and + * painful way. + */ struct volume_index_zone *volume_index_zone = &vi5->zones[zone_number]; unsigned int i; @@ -826,6 +759,7 @@ static void remove_newest_chapters(struct volume_index5 *vi5, .zone_number = zone_number, }; struct chapter_range range; + range.chapter_start = convert_virtual_to_index(vi5, virtual_chapter); range.chapter_count = @@ -834,12 +768,12 @@ static void remove_newest_chapters(struct volume_index5 *vi5, memset(&name, 0, sizeof(name)); for (i = first_list; i <= last_list; i++) { struct chapter_range temp_range = range; + get_volume_index_entry(&record, i, 0, &temp_range); } } } -/**********************************************************************/ /** * Set the open chapter number on a zone. The volume index zone will be * modified to index the proper number of chapters ending with the new open @@ -858,9 +792,11 @@ set_volume_index_zone_open_chapter_005(struct volume_index *volume_index, container_of(volume_index, struct volume_index5, common); struct volume_index_zone *volume_index_zone = &vi5->zones[zone_number]; - // Take care here to avoid underflow of an unsigned value. Note that - // this is the smallest valid virtual low. We may or may not actually - // use this value. + /* + * Take care here to avoid underflow of an unsigned value. Note that + * this is the smallest valid virtual low. We may or may not actually + * use this value. + */ uint64_t new_virtual_low = (virtual_chapter >= vi5->num_chapters ? virtual_chapter - vi5->num_chapters + 1 : @@ -873,37 +809,41 @@ set_volume_index_zone_open_chapter_005(struct volume_index *volume_index, * as totally before the old range, as we need to remove the * entries in the open chapter. */ - empty_delta_index_zone(&vi5->delta_index, zone_number); + empty_delta_zone(&vi5->delta_index, zone_number); volume_index_zone->virtual_chapter_low = virtual_chapter; volume_index_zone->virtual_chapter_high = virtual_chapter; } else if (virtual_chapter <= volume_index_zone->virtual_chapter_high) { - // Moving backwards and the new range overlaps the old range. - // Note that moving to the same open chapter counts as - // backwards, as we need to remove the entries in the open - // chapter. + /* + * Moving backwards and the new range overlaps the old range. + * Note that moving to the same open chapter counts as + * backwards, as we need to remove the entries in the open + * chapter. 
+ */ remove_newest_chapters(vi5, zone_number, virtual_chapter); volume_index_zone->virtual_chapter_high = virtual_chapter; } else if (new_virtual_low < volume_index_zone->virtual_chapter_low) { - // Moving forwards and we can keep all the old chapters + /* Moving forwards and we can keep all the old chapters */ volume_index_zone->virtual_chapter_high = virtual_chapter; } else if (new_virtual_low <= volume_index_zone->virtual_chapter_high) { - // Moving forwards and we can keep some old chapters + /* Moving forwards and we can keep some old chapters */ volume_index_zone->virtual_chapter_low = new_virtual_low; volume_index_zone->virtual_chapter_high = virtual_chapter; } else { - // Moving forwards and the new range is totally after the old - // range + /* + * Moving forwards and the new range is totally after the old + * range + */ volume_index_zone->virtual_chapter_low = virtual_chapter; volume_index_zone->virtual_chapter_high = virtual_chapter; } - // Check to see if the zone data has grown to be too large + /* Check to see if the zone data has grown to be too large */ if (volume_index_zone->virtual_chapter_low < volume_index_zone->virtual_chapter_high) { uint64_t used_bits = - get_delta_index_zone_dlist_bits_used(&vi5->delta_index, - zone_number); + get_delta_zone_bits_used(&vi5->delta_index, + zone_number); if (used_bits > vi5->max_zone_bits) { - // Expire enough chapters to free the desired space + /* Expire enough chapters to free the desired space */ uint64_t expire_count = 1 + (used_bits - vi5->max_zone_bits) / vi5->chapter_zone_bits; @@ -945,7 +885,6 @@ set_volume_index_zone_open_chapter_005(struct volume_index *volume_index, } } -/**********************************************************************/ /** * Set the open chapter number. The volume index will be modified to index * the proper number of chapters ending with the new open chapter. @@ -960,9 +899,12 @@ set_volume_index_open_chapter_005(struct volume_index *volume_index, struct volume_index5 *vi5 = container_of(volume_index, struct volume_index5, common); unsigned int z; + for (z = 0; z < vi5->num_zones; z++) { - // In normal operation, we advance forward one chapter at a - // time. Log all abnormal changes. + /* + * In normal operation, we advance forward one chapter at a + * time. Log all abnormal changes. + */ struct volume_index_zone *volume_index_zone = &vi5->zones[z]; bool log_move = virtual_chapter != volume_index_zone->virtual_chapter_high + 1; @@ -985,7 +927,6 @@ set_volume_index_open_chapter_005(struct volume_index *volume_index, } } -/**********************************************************************/ /** * Find the volume index zone associated with a chunk name * @@ -1001,87 +942,79 @@ get_volume_index_zone_005(const struct volume_index *volume_index, const struct volume_index5 *vi5 = const_container_of(volume_index, struct volume_index5, common); unsigned int delta_list_number = extract_dlist_num(vi5, name); - return get_delta_index_zone(&vi5->delta_index, delta_list_number); + + return get_delta_zone_number(&vi5->delta_index, delta_list_number); } -/**********************************************************************/ /** * Do a quick read-only lookup of the chunk name and return information * needed by the index code to process the chunk name. 
* - * @param volume_index The volume index - * @param name The chunk name - * @param triage Information about the chunk name + * @param volume_index The volume index + * @param name The chunk name * - * @return UDS_SUCCESS or an error code + * @return The sparse virtual chapter, or UINT64_MAX if none **/ -static int -lookup_volume_index_name_005(const struct volume_index *volume_index, - const struct uds_chunk_name *name, - struct volume_index_triage *triage) +static uint64_t +lookup_volume_index_name_005(const struct volume_index *volume_index + __always_unused, + const struct uds_chunk_name *name + __always_unused) { - triage->is_sample = false; - triage->in_sampled_chapter = false; - triage->zone = get_volume_index_zone_005(volume_index, name); - return UDS_SUCCESS; + return UINT64_MAX; } -/**********************************************************************/ /** * Do a quick read-only lookup of the sampled chunk name and return * information needed by the index code to process the chunk name. * - * @param volume_index The volume index - * @param name The chunk name - * @param triage Information about the chunk name. The zone and - * is_sample fields are already filled in. Set - * in_sampled_chapter and virtual_chapter if the chunk - * name is found in the index. + * @param volume_index The volume index + * @param name The chunk name * - * @return UDS_SUCCESS or an error code + * @return The sparse virtual chapter, or UINT64_MAX if none **/ -static int +static uint64_t lookup_volume_index_sampled_name_005(const struct volume_index *volume_index, - const struct uds_chunk_name *name, - struct volume_index_triage *triage) + const struct uds_chunk_name *name) { const struct volume_index5 *vi5 = const_container_of(volume_index, struct volume_index5, common); + int result; unsigned int address = extract_address(vi5, name); unsigned int delta_list_number = extract_dlist_num(vi5, name); + unsigned int zone_number = + get_volume_index_zone_005(volume_index, name); + const struct volume_index_zone *zone = &vi5->zones[zone_number]; + uint64_t virtual_chapter; + unsigned int index_chapter; + unsigned int rolling_chapter; struct delta_index_entry delta_entry; - int result = get_delta_index_entry(&vi5->delta_index, - delta_list_number, - address, - name->name, - true, - &delta_entry); + + result = get_delta_index_entry(&vi5->delta_index, + delta_list_number, + address, + name->name, + &delta_entry); if (result != UDS_SUCCESS) { - return result; + return UINT64_MAX; } - triage->in_sampled_chapter = - !delta_entry.at_end && (delta_entry.key == address); - if (triage->in_sampled_chapter) { - const struct volume_index_zone *volume_index_zone = - &vi5->zones[triage->zone]; - unsigned int index_chapter = - get_delta_entry_value(&delta_entry); - unsigned int rolling_chapter = - ((index_chapter - - volume_index_zone->virtual_chapter_low) & - vi5->chapter_mask); - triage->virtual_chapter = - volume_index_zone->virtual_chapter_low + - rolling_chapter; - if (triage->virtual_chapter > - volume_index_zone->virtual_chapter_high) { - triage->in_sampled_chapter = false; - } + + if (delta_entry.at_end || (delta_entry.key != address)) { + return UINT64_MAX; } - return UDS_SUCCESS; + + index_chapter = get_delta_entry_value(&delta_entry); + rolling_chapter = ((index_chapter - zone->virtual_chapter_low) & + vi5->chapter_mask); + + virtual_chapter = zone->virtual_chapter_low + rolling_chapter; + if (virtual_chapter > zone->virtual_chapter_high) { + return UINT64_MAX; + } + + return virtual_chapter; } 
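
The rolling-chapter arithmetic above can be checked with concrete, made-up numbers:

    /*
     * A worked example of the recovery above, assuming chapter_mask == 1023
     * and a zone window of [3000, 3900]. The values are illustrative only.
     */
    static uint64_t example_recover_virtual_chapter(unsigned int index_chapter)
    {
            const uint64_t low = 3000, high = 3900;
            const unsigned int chapter_mask = 1023;
            uint64_t virtual_chapter = low + ((index_chapter - low) & chapter_mask);

            /*
             * index_chapter 428 (3500 & 1023) recovers virtual chapter 3500;
             * index_chapter 852 (2900 & 1023) maps to 3924, which lies outside
             * the window, so the entry is treated as expired.
             */
            return (virtual_chapter > high) ? UINT64_MAX : virtual_chapter;
    }
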
-/**********************************************************************/ /** * Find the volume index record associated with a block name * @@ -1119,12 +1052,13 @@ static int get_volume_index_record_005(struct volume_index *volume_index, unsigned int address = extract_address(vi5, name); unsigned int delta_list_number = extract_dlist_num(vi5, name); uint64_t flush_chapter = vi5->flush_chapters[delta_list_number]; + record->magic = volume_index_record_magic; record->volume_index = volume_index; record->mutex = NULL; record->name = name; record->zone_number = - get_delta_index_zone(&vi5->delta_index, delta_list_number); + get_delta_zone_number(&vi5->delta_index, delta_list_number); volume_index_zone = get_zone_for_record(record); if (flush_chapter < volume_index_zone->virtual_chapter_low) { @@ -1149,7 +1083,6 @@ static int get_volume_index_record_005(struct volume_index *volume_index, delta_list_number, address, name->name, - false, &record->delta_entry); } if (result != UDS_SUCCESS) { @@ -1167,7 +1100,6 @@ static int get_volume_index_record_005(struct volume_index *volume_index, return UDS_SUCCESS; } -/**********************************************************************/ /** * Create a new record associated with a block name. * @@ -1226,7 +1158,6 @@ int put_volume_index_record(struct volume_index_record *record, return result; } -/**********************************************************************/ static INLINE int validate_record(struct volume_index_record *record) { if (record->magic != volume_index_record_magic) { @@ -1240,7 +1171,6 @@ static INLINE int validate_record(struct volume_index_record *record) return UDS_SUCCESS; } -/**********************************************************************/ /** * Remove an existing record. * @@ -1251,10 +1181,11 @@ static INLINE int validate_record(struct volume_index_record *record) int remove_volume_index_record(struct volume_index_record *record) { int result = validate_record(record); + if (result != UDS_SUCCESS) { return result; } - // Mark the record so that it cannot be used again + /* Mark the record so that it cannot be used again */ record->magic = bad_magic; if (unlikely(record->mutex != NULL)) { uds_lock_mutex(record->mutex); @@ -1266,7 +1197,6 @@ int remove_volume_index_record(struct volume_index_record *record) return result; } -/**********************************************************************/ int set_volume_index_record_chapter(struct volume_index_record *record, uint64_t virtual_chapter) { @@ -1274,6 +1204,7 @@ int set_volume_index_record_chapter(struct volume_index_record *record, struct volume_index5, common); int result = validate_record(record); + if (result != UDS_SUCCESS) { return result; } @@ -1291,7 +1222,7 @@ int set_volume_index_record_chapter(struct volume_index_record *record, } result = set_delta_entry_value(&record->delta_entry, convert_virtual_to_index(vi5, - virtual_chapter)); + virtual_chapter)); if (unlikely(record->mutex != NULL)) { uds_unlock_mutex(record->mutex); } @@ -1302,24 +1233,6 @@ int set_volume_index_record_chapter(struct volume_index_record *record, return UDS_SUCCESS; } -/**********************************************************************/ -/** - * Get the number of bytes used for volume index entries. 
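
Record handles are protected by the magic-number check in validate_record(): a handle is stamped when get_volume_index_record() fills it in and poisoned once it has been removed, so a stale handle fails validation instead of corrupting the index. A reduced sketch of that pattern, with illustrative constants::

    enum {
            RECORD_MAGIC = 0xabe7,  /* illustrative values only */
            BAD_MAGIC = 0,
    };

    struct record_handle {
            unsigned int magic;
    };

    static int validate_handle(const struct record_handle *handle)
    {
            /* Reject handles that were never filled in or already removed. */
            return (handle->magic == RECORD_MAGIC) ? 0 : -1;
    }

    static int remove_handle(struct record_handle *handle)
    {
            if (validate_handle(handle) != 0)
                    return -1;

            /* Mark the handle so that it cannot be used again. */
            handle->magic = BAD_MAGIC;
            return 0;
    }
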
- * - * @param volume_index The volume index - * - * @return The number of bytes in use - **/ -static size_t -get_volume_index_memory_used_005(const struct volume_index *volume_index) -{ - const struct volume_index5 *vi5 = - const_container_of(volume_index, struct volume_index5, common); - uint64_t bits = get_delta_index_dlist_bits_used(&vi5->delta_index); - return (bits + CHAR_BIT - 1) / CHAR_BIT; -} - -/**********************************************************************/ /** * Return the volume index stats. There is only one portion of the volume * index in this implementation, and we call it the dense portion of the @@ -1337,6 +1250,7 @@ static void get_volume_index_stats_005(const struct volume_index *volume_index, const_container_of(volume_index, struct volume_index5, common); struct delta_index_stats dis; unsigned int z; + get_delta_index_stats(&vi5->delta_index, &dis); dense->memory_allocated = (dis.memory_allocated + sizeof(struct volume_index5) + @@ -1348,7 +1262,7 @@ static void get_volume_index_stats_005(const struct volume_index *volume_index, dense->collision_count = dis.collision_count; dense->discard_count = dis.discard_count; dense->overflow_count = dis.overflow_count; - dense->num_lists = dis.num_lists; + dense->num_lists = dis.list_count; dense->early_flushes = 0; for (z = 0; z < vi5->num_zones; z++) { dense->early_flushes += vi5->zones[z].num_early_flushes; @@ -1356,7 +1270,6 @@ static void get_volume_index_stats_005(const struct volume_index *volume_index, memset(sparse, 0, sizeof(struct volume_index_stats)); } -/**********************************************************************/ /** * Determine whether a given chunk name is a hook. * @@ -1373,19 +1286,17 @@ static bool is_volume_index_sample_005(const struct volume_index *volume_index return false; } -/**********************************************************************/ struct parameters005 { - unsigned int address_bits; // Number of bits in address mask - unsigned int chapter_bits; // Number of bits in chapter number - unsigned int mean_delta; // The mean delta - unsigned long num_delta_lists; // The number of delta lists - unsigned long num_chapters; // Number of chapters used - size_t num_bits_per_chapter; // Number of bits per chapter - size_t memory_size; // Number of bytes of delta list memory - size_t target_free_size; // Number of free bytes we desire + unsigned int address_bits; /* Number of bits in address mask */ + unsigned int chapter_bits; /* Number of bits in chapter number */ + unsigned int mean_delta; /* The mean delta */ + unsigned long num_delta_lists; /* The number of delta lists */ + unsigned long num_chapters; /* Number of chapters used */ + size_t num_bits_per_chapter; /* Number of bits per chapter */ + size_t memory_size; /* Number of bytes of delta list memory */ + size_t target_free_size; /* Number of free bytes we desire */ }; -/**********************************************************************/ static int compute_volume_index_parameters005(const struct configuration *config, struct parameters005 *params) @@ -1414,6 +1325,7 @@ compute_volume_index_parameters005(const struct configuration *config, struct geometry *geometry = config->geometry; unsigned long records_per_chapter = geometry->records_per_chapter; + params->num_chapters = geometry->chapters_per_volume; /* * Make sure that the number of delta list records in the @@ -1441,7 +1353,7 @@ compute_volume_index_parameters005(const struct configuration *config, "cannot initialize volume index with %u address bits", 
params->address_bits); } - if (is_sparse(geometry)) { + if (is_sparse_geometry(geometry)) { return uds_log_warning_strerror(UDS_INVALID_ARGUMENT, "cannot initialize dense volume index with %u sparse chapters", geometry->sparse_chapters_per_volume); @@ -1481,15 +1393,16 @@ compute_volume_index_parameters005(const struct configuration *config, chapters_in_volume_index = rounded_chapters + invalid_chapters; entries_in_volume_index = records_per_chapter * chapters_in_volume_index; - // Compute the mean delta + /* Compute the mean delta */ address_span = (uint64_t) params->num_delta_lists << params->address_bits; params->mean_delta = address_span / entries_in_volume_index; - // Project how large we expect a chapter to be + /* Project how large we expect a chapter to be */ params->num_bits_per_chapter = - get_delta_memory_size(records_per_chapter, params->mean_delta, - params->chapter_bits); - // Project how large we expect the index to be + compute_delta_index_size(records_per_chapter, + params->mean_delta, + params->chapter_bits); + /* Project how large we expect the index to be */ num_bits_per_index = params->num_bits_per_chapter * chapters_in_volume_index; expected_index_size = num_bits_per_index / CHAR_BIT; @@ -1500,22 +1413,24 @@ compute_volume_index_parameters005(const struct configuration *config, * VolumeIndex_p1 to tune this setting. */ params->memory_size = expected_index_size * 106 / 100; - // Set the target free size to 5% of the expected index size + /* Set the target free size to 5% of the expected index size */ params->target_free_size = expected_index_size / 20; return UDS_SUCCESS; } -/**********************************************************************/ int compute_volume_index_save_bytes005(const struct configuration *config, size_t *num_bytes) { struct parameters005 params = { .address_bits = 0 }; int result = compute_volume_index_parameters005(config, ¶ms); + if (result != UDS_SUCCESS) { return result; } - // Saving a volume index 005 needs a header plus one uint64_t per delta - // list plus the delta index. + /* + * Saving a volume index 005 needs a header plus one uint64_t per delta + * list plus the delta index. 
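
The sizing code above projects the whole index from a single chapter: the mean delta is the address span divided by the expected entry count, the per-chapter bit count from compute_delta_index_size() is scaled by the number of chapters, and the allocation gets 6% of headroom while 5% of the expected size is reserved as the free-space target. A standalone sketch of that arithmetic, with illustrative names::

    #include <limits.h>
    #include <stddef.h>
    #include <stdint.h>

    static unsigned int compute_mean_delta(uint64_t num_delta_lists,
                                           unsigned int address_bits,
                                           uint64_t entries_in_volume_index)
    {
            /* Average gap between entries across the whole address span. */
            uint64_t address_span = num_delta_lists << address_bits;

            return address_span / entries_in_volume_index;
    }

    /* num_bits_per_chapter comes from compute_delta_index_size(), which
     * takes the mean delta computed above. */
    static void project_index_memory(size_t num_bits_per_chapter,
                                     uint64_t chapters_in_volume_index,
                                     size_t *memory_size,
                                     size_t *target_free_size)
    {
            uint64_t num_bits_per_index =
                    num_bits_per_chapter * chapters_in_volume_index;
            size_t expected_index_size = num_bits_per_index / CHAR_BIT;

            /* 6% of allocation headroom, 5% of it kept as free space. */
            *memory_size = expected_index_size * 106 / 100;
            *target_free_size = expected_index_size / 20;
    }
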
+ */ *num_bytes = (sizeof(struct vi005_data) + params.num_delta_lists * sizeof(uint64_t) + compute_delta_index_save_bytes(params.num_delta_lists, @@ -1523,15 +1438,15 @@ int compute_volume_index_save_bytes005(const struct configuration *config, return UDS_SUCCESS; } -/**********************************************************************/ int make_volume_index005(const struct configuration *config, - unsigned int num_zones, uint64_t volume_nonce, struct volume_index **volume_index) { struct volume_index5 *vi5; struct parameters005 params = { .address_bits = 0 }; + unsigned int num_zones = config->zone_count; int result = compute_volume_index_parameters005(config, ¶ms); + if (result != UDS_SUCCESS) { return result; } @@ -1544,25 +1459,18 @@ int make_volume_index005(const struct configuration *config, vi5->common.abort_restoring_volume_index = abort_restoring_volume_index_005; - vi5->common.abort_saving_volume_index = abort_saving_volume_index_005; + vi5->common.finish_restoring_volume_index = + finish_restoring_volume_index_005; vi5->common.finish_saving_volume_index = finish_saving_volume_index_005; vi5->common.free_volume_index = free_volume_index_005; - vi5->common.get_volume_index_memory_used = - get_volume_index_memory_used_005; vi5->common.get_volume_index_record = get_volume_index_record_005; vi5->common.get_volume_index_stats = get_volume_index_stats_005; vi5->common.get_volume_index_zone = get_volume_index_zone_005; vi5->common.is_volume_index_sample = is_volume_index_sample_005; - vi5->common.is_restoring_volume_index_done = - is_restoring_volume_index_done_005; - vi5->common.is_saving_volume_index_done = - is_saving_volume_index_done_005; vi5->common.lookup_volume_index_name = lookup_volume_index_name_005; vi5->common.lookup_volume_index_sampled_name = lookup_volume_index_sampled_name_005; - vi5->common.restore_delta_list_to_volume_index = - restore_delta_list_to_volume_index_005; vi5->common.set_volume_index_open_chapter = set_volume_index_open_chapter_005; vi5->common.set_volume_index_tag = set_volume_index_tag_005; @@ -1590,12 +1498,14 @@ int make_volume_index005(const struct configuration *config, params.memory_size); if (result == UDS_SUCCESS) { vi5->max_zone_bits = - ((get_delta_index_dlist_bits_allocated(&vi5->delta_index) - + ((get_delta_index_bits_allocated(&vi5->delta_index) - params.target_free_size * CHAR_BIT) / num_zones); } - // Initialize the chapter flush ranges to be empty. This depends upon - // allocate returning zeroed memory. + /* + * Initialize the chapter flush ranges to be empty. This depends upon + * allocate returning zeroed memory. + */ if (result == UDS_SUCCESS) { result = UDS_ALLOCATE(params.num_delta_lists, uint64_t, @@ -1603,8 +1513,10 @@ int make_volume_index005(const struct configuration *config, &vi5->flush_chapters); } - // Initialize the virtual chapter ranges to start at zero. This - // depends upon allocate returning zeroed memory. + /* + * Initialize the virtual chapter ranges to start at zero. This + * depends upon allocate returning zeroed memory. + */ if (result == UDS_SUCCESS) { result = UDS_ALLOCATE(num_zones, struct volume_index_zone, diff --git a/vdo/volume-index005.h b/vdo/volume-index005.h new file mode 100644 index 00000000..781373a7 --- /dev/null +++ b/vdo/volume-index005.h @@ -0,0 +1,37 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright Red Hat + */ + +#ifndef VOLUMEINDEX005_H +#define VOLUMEINDEX005_H 1 + +#include "volume-index-ops.h" + +/** + * Make a new volume index. 
+ * + * @param config The configuration of the volume index + * @param volume_nonce The nonce used to authenticate the index + * @param volume_index Location to hold new volume index ptr + * + * @return error code or UDS_SUCCESS + **/ +int __must_check make_volume_index005(const struct configuration *config, + uint64_t volume_nonce, + struct volume_index **volume_index); + +/** + * Compute the number of bytes required to save a volume index of a given + * configuration. + * + * @param config The configuration of the volume index + * @param num_bytes The number of bytes required to save the volume index + * + * @return UDS_SUCCESS or an error code. + **/ +int __must_check +compute_volume_index_save_bytes005(const struct configuration *config, + size_t *num_bytes); + +#endif /* VOLUMEINDEX005_H */ diff --git a/uds/volumeIndex006.c b/vdo/volume-index006.c similarity index 69% rename from uds/volumeIndex006.c rename to vdo/volume-index006.c index bbb5a9eb..7d3de953 100644 --- a/uds/volumeIndex006.c +++ b/vdo/volume-index006.c @@ -1,35 +1,18 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * Copyright Red Hat - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. 
- * - * $Id: //eng/uds-releases/krusty/src/uds/volumeIndex006.c#5 $ */ -#include "volumeIndex006.h" +#include "volume-index006.h" #include "buffer.h" #include "compiler.h" #include "errors.h" -#include "hashUtils.h" +#include "hash-utils.h" #include "logger.h" -#include "volumeIndex005.h" -#include "memoryAlloc.h" +#include "volume-index005.h" +#include "memory-alloc.h" #include "permassert.h" #include "uds-threads.h" -#include "uds.h" /* * The volume index is a kept as a wrapper around 2 volume index @@ -50,16 +33,16 @@ */ struct volume_index_zone { - struct mutex hook_mutex; // Protects the sampled index in this zone + struct mutex hook_mutex; /* Protects the sampled index in this zone */ } __attribute__((aligned(CACHE_LINE_BYTES))); struct volume_index6 { - struct volume_index common; // Common volume index methods - unsigned int sparse_sample_rate; // The sparse sample rate - unsigned int num_zones; // The number of zones - struct volume_index *vi_non_hook; // The non-hook index - struct volume_index *vi_hook; // Hook index == sample index - struct volume_index_zone *zones; // The zones + struct volume_index common; /* Common volume index methods */ + unsigned int sparse_sample_rate; /* The sparse sample rate */ + unsigned int num_zones; /* The number of zones */ + struct volume_index *vi_non_hook; /* The non-hook index */ + struct volume_index *vi_hook; /* Hook index == sample index */ + struct volume_index_zone *zones; /* The zones */ }; /** @@ -79,7 +62,6 @@ is_volume_index_sample_006(const struct volume_index *volume_index, return (extract_sampling_bytes(name) % vi6->sparse_sample_rate) == 0; } -/**********************************************************************/ /** * Get the subindex for the given chunk name * @@ -99,7 +81,6 @@ get_sub_index(const struct volume_index *volume_index, vi6->vi_non_hook); } -/**********************************************************************/ /** * Terminate and clean up the volume index * @@ -113,6 +94,7 @@ static void free_volume_index_006(struct volume_index *volume_index) common); if (vi6->zones != NULL) { unsigned int zone; + for (zone = 0; zone < vi6->num_zones; zone++) { uds_destroy_mutex(&vi6->zones[zone].hook_mutex); } @@ -131,7 +113,6 @@ static void free_volume_index_006(struct volume_index *volume_index) } } -/**********************************************************************/ /** * Constants and structures for the saved volume index region. "MI6" * indicates volume index 006, and "-XXXX" is a number to increment @@ -143,11 +124,10 @@ enum { MAGIC_SIZE = 8 }; static const char MAGIC_START[] = "MI6-0001"; struct vi006_data { - char magic[MAGIC_SIZE]; // MAGIC_START + char magic[MAGIC_SIZE]; /* MAGIC_START */ unsigned int sparse_sample_rate; }; -/**********************************************************************/ /** * Set the tag value used when saving and/or restoring a volume index. 
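
Volume index 006 wraps two 005 sub-indexes: names whose sampling bytes land on a multiple of the sparse sample rate are hooks and belong to the sampled sub-index, and every other name goes to the non-hook sub-index. A reduced sketch of that routing, with stand-in types::

    #include <stdbool.h>
    #include <stdint.h>

    struct sub_index;  /* stands in for a volume index 005 instance */

    struct combined_index {
            unsigned int sparse_sample_rate;
            struct sub_index *hook_index;      /* sampled names only */
            struct sub_index *non_hook_index;  /* all other names */
    };

    static bool is_sample(const struct combined_index *index,
                          uint64_t sampling_bytes)
    {
            /* One name in every sparse_sample_rate names is a hook. */
            return (sampling_bytes % index->sparse_sample_rate) == 0;
    }

    static struct sub_index *route_name(const struct combined_index *index,
                                        uint64_t sampling_bytes)
    {
            return is_sample(index, sampling_bytes) ?
                    index->hook_index : index->non_hook_index;
    }
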
* @@ -160,11 +140,11 @@ static void set_volume_index_tag_006(struct volume_index *volume_index { } -/**********************************************************************/ static int __must_check encode_volume_index_header(struct buffer *buffer, struct vi006_data *header) { int result = put_bytes(buffer, MAGIC_SIZE, MAGIC_START); + if (result != UDS_SUCCESS) { return result; } @@ -199,6 +179,7 @@ start_saving_volume_index_006(const struct volume_index *volume_index, const_container_of(volume_index, struct volume_index6, common); struct buffer *buffer; int result = make_buffer(sizeof(struct vi006_data), &buffer); + if (result != UDS_SUCCESS) { return result; } @@ -235,28 +216,6 @@ start_saving_volume_index_006(const struct volume_index *volume_index, return UDS_SUCCESS; } -/**********************************************************************/ -/** - * Have all the data been written while saving a volume index to an output - * stream? If the answer is yes, it is still necessary to call - * finish_saving_volume_index(), which will return quickly. - * - * @param volume_index The volume index - * @param zone_number The number of the zone to save - * - * @return true if all the data are written - **/ -static bool -is_saving_volume_index_done_006(const struct volume_index *volume_index, - unsigned int zone_number) -{ - const struct volume_index6 *vi6 = - const_container_of(volume_index, struct volume_index6, common); - return (is_saving_volume_index_done(vi6->vi_non_hook, zone_number) && - is_saving_volume_index_done(vi6->vi_hook, zone_number)); -} - -/**********************************************************************/ /** * Finish saving a volume index to an output stream. Force the writing of * all of the remaining data. If an error occurred asynchronously during @@ -274,37 +233,13 @@ finish_saving_volume_index_006(const struct volume_index *volume_index, const struct volume_index6 *vi6 = const_container_of(volume_index, struct volume_index6, common); int result = finish_saving_volume_index(vi6->vi_non_hook, zone_number); - if (result == UDS_SUCCESS) { - result = finish_saving_volume_index(vi6->vi_hook, zone_number); - } - return result; -} -/**********************************************************************/ -/** - * Abort saving a volume index to an output stream. If an error occurred - * asynchronously during the save operation, it will be dropped. 
- * - * @param volume_index The volume index - * @param zone_number The number of the zone to save - * - * @return UDS_SUCCESS on success, or an error code on failure - **/ -static int -abort_saving_volume_index_006(const struct volume_index *volume_index, - unsigned int zone_number) -{ - const struct volume_index6 *vi6 = - const_container_of(volume_index, struct volume_index6, common); - int result = abort_saving_volume_index(vi6->vi_non_hook, zone_number); - int result2 = abort_saving_volume_index(vi6->vi_hook, zone_number); if (result == UDS_SUCCESS) { - result = result2; + result = finish_saving_volume_index(vi6->vi_hook, zone_number); } return result; } -/**********************************************************************/ static int __must_check decode_volume_index_header(struct buffer *buffer, struct vi006_data *header) { @@ -324,7 +259,7 @@ static int __must_check decode_volume_index_header(struct buffer *buffer, buffer_length(buffer) - content_length(buffer), buffer_length(buffer)); if (result != UDS_SUCCESS) { - result = UDS_CORRUPT_COMPONENT; + result = UDS_CORRUPT_DATA; } return result; } @@ -341,11 +276,11 @@ static int __must_check decode_volume_index_header(struct buffer *buffer, static int start_restoring_volume_index_006(struct volume_index *volume_index, struct buffered_reader **buffered_readers, - int num_readers) + unsigned int num_readers) { struct volume_index6 *vi6 = container_of(volume_index, struct volume_index6, common); - int i; + unsigned int i; int result = ASSERT_WITH_ERROR_CODE(volume_index != NULL, UDS_BAD_STATE, @@ -357,6 +292,7 @@ start_restoring_volume_index_006(struct volume_index *volume_index, for (i = 0; i < num_readers; i++) { struct vi006_data header; struct buffer *buffer; + result = make_buffer(sizeof(struct vi006_data), &buffer); if (result != UDS_SUCCESS) { return result; @@ -384,7 +320,7 @@ start_restoring_volume_index_006(struct volume_index *volume_index, } if (memcmp(header.magic, MAGIC_START, MAGIC_SIZE) != 0) { - return uds_log_warning_strerror(UDS_CORRUPT_COMPONENT, + return uds_log_warning_strerror(UDS_CORRUPT_DATA, "volume index file had bad magic number"); } @@ -392,11 +328,11 @@ start_restoring_volume_index_006(struct volume_index *volume_index, vi6->sparse_sample_rate = header.sparse_sample_rate; } else if (vi6->sparse_sample_rate != header.sparse_sample_rate) { - uds_log_warning_strerror(UDS_CORRUPT_COMPONENT, + uds_log_warning_strerror(UDS_CORRUPT_DATA, "Inconsistent sparse sample rate in delta index zone files: %u vs. %u", vi6->sparse_sample_rate, header.sparse_sample_rate); - return UDS_CORRUPT_COMPONENT; + return UDS_CORRUPT_DATA; } } @@ -410,67 +346,47 @@ start_restoring_volume_index_006(struct volume_index *volume_index, num_readers); } -/**********************************************************************/ /** - * Have all the data been read while restoring a volume index from an - * input stream? - * - * @param volume_index The volume index to restore into + * Abort restoring a volume index from an input stream. 
* - * @return true if all the data are read + * @param volume_index The volume index **/ -static bool -is_restoring_volume_index_done_006(const struct volume_index *volume_index) +static void abort_restoring_volume_index_006(struct volume_index *volume_index) { - const struct volume_index6 *vi6 = - const_container_of(volume_index, struct volume_index6, common); - return (is_restoring_volume_index_done(vi6->vi_non_hook) && - is_restoring_volume_index_done(vi6->vi_hook)); + struct volume_index6 *vi6 = + container_of(volume_index, struct volume_index6, common); + abort_restoring_volume_index(vi6->vi_non_hook); + abort_restoring_volume_index(vi6->vi_hook); } -/**********************************************************************/ /** - * Restore a saved delta list - * - * @param volume_index The volume index to restore into - * @param dlsi The delta_list_save_info describing the delta list - * @param data The saved delta list bit stream + * Finish restoring a volume index from an input stream. * - * @return error code or UDS_SUCCESS + * @param volume_index The volume index to restore into + * @param buffered_readers The buffered readers to read the volume index from + * @param num_readers The number of buffered readers **/ static int -restore_delta_list_to_volume_index_006(struct volume_index *volume_index, - const struct delta_list_save_info *dlsi, - const byte data[DELTA_LIST_MAX_BYTE_COUNT]) +finish_restoring_volume_index_006(struct volume_index *volume_index, + struct buffered_reader **buffered_readers, + unsigned int num_readers) { + int result; struct volume_index6 *vi6 = container_of(volume_index, struct volume_index6, common); - int result = restore_delta_list_to_volume_index(vi6->vi_non_hook, - dlsi, - data); + + result = finish_restoring_volume_index(vi6->vi_non_hook, + buffered_readers, + num_readers); if (result != UDS_SUCCESS) { - result = restore_delta_list_to_volume_index(vi6->vi_hook, - dlsi, - data); + return result; } - return result; -} -/**********************************************************************/ -/** - * Abort restoring a volume index from an input stream. - * - * @param volume_index The volume index - **/ -static void abort_restoring_volume_index_006(struct volume_index *volume_index) -{ - struct volume_index6 *vi6 = - container_of(volume_index, struct volume_index6, common); - abort_restoring_volume_index(vi6->vi_non_hook); - abort_restoring_volume_index(vi6->vi_hook); + return finish_restoring_volume_index(vi6->vi_hook, + buffered_readers, + num_readers); } -/**********************************************************************/ /** * Set the open chapter number on a zone. 
The volume index zone will be * modified to index the proper number of chapters ending with the new open @@ -488,18 +404,20 @@ set_volume_index_zone_open_chapter_006(struct volume_index *volume_index, struct volume_index6 *vi6 = container_of(volume_index, struct volume_index6, common); struct mutex *mutex = &vi6->zones[zone_number].hook_mutex; + set_volume_index_zone_open_chapter(vi6->vi_non_hook, zone_number, virtual_chapter); - // We need to prevent a lookup_volume_index_name() happening while we - // are changing the open chapter number + /* + * We need to prevent a lookup_volume_index_name() happening while we + * are changing the open chapter number + */ uds_lock_mutex(mutex); set_volume_index_zone_open_chapter(vi6->vi_hook, zone_number, virtual_chapter); uds_unlock_mutex(mutex); } -/**********************************************************************/ /** * Set the open chapter number. The volume index will be modified to index * the proper number of chapters ending with the new open chapter. @@ -514,13 +432,13 @@ set_volume_index_open_chapter_006(struct volume_index *volume_index, struct volume_index6 *vi6 = container_of(volume_index, struct volume_index6, common); unsigned int zone; + for (zone = 0; zone < vi6->num_zones; zone++) { set_volume_index_zone_open_chapter_006(volume_index, zone, virtual_chapter); } } -/**********************************************************************/ /** * Find the volume index zone associated with a chunk name * @@ -536,66 +454,56 @@ get_volume_index_zone_006(const struct volume_index *volume_index, return get_volume_index_zone(get_sub_index(volume_index, name), name); } -/**********************************************************************/ /** * Do a quick read-only lookup of the chunk name and return information * needed by the index code to process the chunk name. * * @param volume_index The volume index * @param name The chunk name - * @param triage Information about the chunk name * - * @return UDS_SUCCESS or an error code + * @return The sparse virtual chapter, or UINT64_MAX if none **/ -static int +static uint64_t lookup_volume_index_name_006(const struct volume_index *volume_index, - const struct uds_chunk_name *name, - struct volume_index_triage *triage) + const struct uds_chunk_name *name) { - int result = UDS_SUCCESS; const struct volume_index6 *vi6 = const_container_of(volume_index, struct volume_index6, common); - triage->is_sample = is_volume_index_sample_006(volume_index, name); - triage->in_sampled_chapter = false; - triage->zone = get_volume_index_zone_006(volume_index, name); - if (triage->is_sample) { - struct mutex *mutex = - &vi6->zones[triage->zone].hook_mutex; - uds_lock_mutex(mutex); - result = lookup_volume_index_sampled_name(vi6->vi_hook, name, - triage); - uds_unlock_mutex(mutex); + unsigned int zone_number = + get_volume_index_zone_006(volume_index, name); + struct mutex *mutex = &vi6->zones[zone_number].hook_mutex; + uint64_t virtual_chapter; + + if (!is_volume_index_sample_006(volume_index, name)) { + return UINT64_MAX; } - return result; + + uds_lock_mutex(mutex); + virtual_chapter = lookup_volume_index_sampled_name(vi6->vi_hook, name); + uds_unlock_mutex(mutex); + + return virtual_chapter; } -/**********************************************************************/ /** * Do a quick read-only lookup of the sampled chunk name and return * information needed by the index code to process the chunk name. 
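
Hook lookups in the 006 index are serialized against open-chapter changes by the per-zone hook mutex, as in lookup_volume_index_name_006(): non-hooks return "no chapter" immediately, and hooks are resolved in the sampled sub-index under the lock. A condensed sketch, using a pthread mutex as a stand-in for the uds mutex wrappers::

    #include <pthread.h>
    #include <stdbool.h>
    #include <stdint.h>

    #define NO_CHAPTER UINT64_MAX

    struct hook_zone {
            pthread_mutex_t hook_mutex;
    };

    /* Stand-in for the sampled (hook) sub-index lookup. */
    static uint64_t lookup_sampled_name(void *hook_index, const void *name)
    {
            (void) hook_index;
            (void) name;
            return NO_CHAPTER;  /* placeholder result */
    }

    static uint64_t lookup_name(void *hook_index, struct hook_zone *zone,
                                const void *name, bool name_is_sample)
    {
            uint64_t virtual_chapter;

            /* Only hooks are recorded in the sampled sub-index. */
            if (!name_is_sample)
                    return NO_CHAPTER;

            /* The lock keeps the lookup from racing with a concurrent
             * open-chapter change on this zone. */
            pthread_mutex_lock(&zone->hook_mutex);
            virtual_chapter = lookup_sampled_name(hook_index, name);
            pthread_mutex_unlock(&zone->hook_mutex);

            return virtual_chapter;
    }
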
* * @param volume_index The volume index * @param name The chunk name - * @param triage Information about the chunk name. The zone and - * is_sample fields are already filled in. Set - * in_sampled_chapter and virtual_chapter if the chunk - * name is found in the index. * - * @return UDS_SUCCESS or an error code + * @return The sparse virtual chapter, or UINT64_MAX if none **/ -static int +static uint64_t lookup_volume_index_sampled_name_006(const struct volume_index *volume_index __always_unused, const struct uds_chunk_name *name - __always_unused, - struct volume_index_triage *triage __always_unused) { - return ASSERT_WITH_ERROR_CODE(false, UDS_BAD_STATE, - "%s should not be called", __func__); + /* FIXME: This should never get called. */ + return UINT64_MAX; } -/**********************************************************************/ /** * Find the volume index record associated with a block name * @@ -629,6 +537,7 @@ static int get_volume_index_record_006(struct volume_index *volume_index, const struct volume_index6 *vi6 = const_container_of(volume_index, struct volume_index6, common); int result; + if (is_volume_index_sample_006(volume_index, name)) { /* * We need to prevent a lookup_volume_index_name() happening @@ -638,11 +547,14 @@ static int get_volume_index_record_006(struct volume_index *volume_index, */ unsigned int zone = get_volume_index_zone(vi6->vi_hook, name); struct mutex *mutex = &vi6->zones[zone].hook_mutex; + uds_lock_mutex(mutex); result = get_volume_index_record(vi6->vi_hook, name, record); uds_unlock_mutex(mutex); - // Remember the mutex so that other operations on the - // volume_index_record can use it + /* + * Remember the mutex so that other operations on the + * volume_index_record can use it + */ record->mutex = mutex; } else { result = get_volume_index_record(vi6->vi_non_hook, name, @@ -651,24 +563,6 @@ static int get_volume_index_record_006(struct volume_index *volume_index, return result; } -/**********************************************************************/ -/** - * Get the number of bytes used for volume index entries. - * - * @param volume_index The volume index - * - * @return The number of bytes in use - **/ -static size_t -get_volume_index_memory_used_006(const struct volume_index *volume_index) -{ - const struct volume_index6 *vi6 = - const_container_of(volume_index, struct volume_index6, common); - return (get_volume_index_memory_used(vi6->vi_non_hook) + - get_volume_index_memory_used(vi6->vi_hook)); -} - -/**********************************************************************/ /** * Return the volume index stats. 
There is only one portion of the volume * index in this implementation, and we call it the dense portion of the @@ -685,20 +579,19 @@ static void get_volume_index_stats_006(const struct volume_index *volume_index, const struct volume_index6 *vi6 = const_container_of(volume_index, struct volume_index6, common); struct volume_index_stats dummy_stats; + get_volume_index_stats(vi6->vi_non_hook, dense, &dummy_stats); get_volume_index_stats(vi6->vi_hook, sparse, &dummy_stats); } -/**********************************************************************/ struct split_config { - struct configuration hook_config; // Describe hook part of the index + struct configuration hook_config; /* Describe hook part of the index */ struct geometry hook_geometry; - struct configuration non_hook_config; // Describe non-hook part of the - // index + struct configuration non_hook_config; /* Describe non-hook part of the */ + /* index */ struct geometry non_hook_geometry; }; -/**********************************************************************/ static int split_configuration006(const struct configuration *config, struct split_config *split) { @@ -718,7 +611,7 @@ static int split_configuration006(const struct configuration *config, return result; } - // Start with copies of the base configuration + /* Start with copies of the base configuration */ split->hook_config = *config; split->hook_geometry = *config->geometry; split->hook_config.geometry = &split->hook_geometry; @@ -732,24 +625,24 @@ static int split_configuration006(const struct configuration *config, num_dense_chapters = num_chapters - num_sparse_chapters; sample_records = config->geometry->records_per_chapter / sample_rate; - // Adjust the number of records indexed for each chapter + /* Adjust the number of records indexed for each chapter */ split->hook_geometry.records_per_chapter = sample_records; split->non_hook_geometry.records_per_chapter -= sample_records; - // Adjust the number of chapters indexed + /* Adjust the number of chapters indexed */ split->hook_geometry.sparse_chapters_per_volume = 0; split->non_hook_geometry.sparse_chapters_per_volume = 0; split->non_hook_geometry.chapters_per_volume = num_dense_chapters; return UDS_SUCCESS; } -/**********************************************************************/ int compute_volume_index_save_bytes006(const struct configuration *config, size_t *num_bytes) { size_t hook_bytes, non_hook_bytes; struct split_config split; int result = split_configuration006(config, &split); + if (result != UDS_SUCCESS) { return result; } @@ -763,15 +656,15 @@ int compute_volume_index_save_bytes006(const struct configuration *config, if (result != UDS_SUCCESS) { return result; } - // Saving a volume index 006 needs a header plus the hook index plus - // the non-hook index + /* + * Saving a volume index 006 needs a header plus the hook index plus + * the non-hook index + */ *num_bytes = sizeof(struct vi006_data) + hook_bytes + non_hook_bytes; return UDS_SUCCESS; } -/**********************************************************************/ int make_volume_index006(const struct configuration *config, - unsigned int num_zones, uint64_t volume_nonce, struct volume_index **volume_index) { @@ -779,6 +672,7 @@ int make_volume_index006(const struct configuration *config, unsigned int zone; struct volume_index6 *vi6; int result = split_configuration006(config, &split); + if (result != UDS_SUCCESS) { return result; } @@ -790,25 +684,18 @@ int make_volume_index006(const struct configuration *config, 
vi6->common.abort_restoring_volume_index = abort_restoring_volume_index_006; - vi6->common.abort_saving_volume_index = abort_saving_volume_index_006; + vi6->common.finish_restoring_volume_index = + finish_restoring_volume_index_006; vi6->common.finish_saving_volume_index = finish_saving_volume_index_006; vi6->common.free_volume_index = free_volume_index_006; - vi6->common.get_volume_index_memory_used = - get_volume_index_memory_used_006; vi6->common.get_volume_index_record = get_volume_index_record_006; vi6->common.get_volume_index_stats = get_volume_index_stats_006; vi6->common.get_volume_index_zone = get_volume_index_zone_006; vi6->common.is_volume_index_sample = is_volume_index_sample_006; - vi6->common.is_restoring_volume_index_done = - is_restoring_volume_index_done_006; - vi6->common.is_saving_volume_index_done = - is_saving_volume_index_done_006; vi6->common.lookup_volume_index_name = lookup_volume_index_name_006; vi6->common.lookup_volume_index_sampled_name = lookup_volume_index_sampled_name_006; - vi6->common.restore_delta_list_to_volume_index = - restore_delta_list_to_volume_index_006; vi6->common.set_volume_index_open_chapter = set_volume_index_open_chapter_006; vi6->common.set_volume_index_tag = set_volume_index_tag_006; @@ -818,14 +705,14 @@ int make_volume_index006(const struct configuration *config, start_restoring_volume_index_006; vi6->common.start_saving_volume_index = start_saving_volume_index_006; - vi6->num_zones = num_zones; + vi6->num_zones = config->zone_count; vi6->sparse_sample_rate = config->sparse_sample_rate; - result = UDS_ALLOCATE(num_zones, + result = UDS_ALLOCATE(config->zone_count, struct volume_index_zone, "volume index zones", &vi6->zones); - for (zone = 0; zone < num_zones; zone++) { + for (zone = 0; zone < config->zone_count; zone++) { if (result == UDS_SUCCESS) { result = uds_init_mutex(&vi6->zones[zone].hook_mutex); } @@ -836,7 +723,6 @@ int make_volume_index006(const struct configuration *config, } result = make_volume_index005(&split.non_hook_config, - num_zones, volume_nonce, &vi6->vi_non_hook); if (result != UDS_SUCCESS) { @@ -846,8 +732,9 @@ int make_volume_index006(const struct configuration *config, } set_volume_index_tag(vi6->vi_non_hook, 'd'); - result = make_volume_index005(&split.hook_config, num_zones, - volume_nonce, &vi6->vi_hook); + result = make_volume_index005(&split.hook_config, + volume_nonce, + &vi6->vi_hook); if (result != UDS_SUCCESS) { free_volume_index_006(&vi6->common); return uds_log_error_strerror(result, diff --git a/vdo/volume-index006.h b/vdo/volume-index006.h new file mode 100644 index 00000000..b0a970fa --- /dev/null +++ b/vdo/volume-index006.h @@ -0,0 +1,37 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright Red Hat + */ + +#ifndef VOLUMEINDEX006_H +#define VOLUMEINDEX006_H 1 + +#include "volume-index-ops.h" + +/** + * Make a new volume index. + * + * @param config The configuration of the volume index + * @param volume_nonce The nonce used to authenticate the index + * @param volume_index Location to hold new volume index ptr + * + * @return error code or UDS_SUCCESS + **/ +int __must_check make_volume_index006(const struct configuration *config, + uint64_t volume_nonce, + struct volume_index **volume_index); + +/** + * Compute the number of bytes required to save a volume index of a given + * configuration. + * + * @param config The configuration of the volume index + * @param num_bytes The number of bytes required to save the volume index + * + * @return UDS_SUCCESS or an error code. 
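
The geometry split done by split_configuration006() earlier divides each chapter's records between the sub-indexes: one record in every sample-rate records goes to the hook index, the rest go to the non-hook index, and only the non-hook index drops the sparse chapters. A small sketch of that split, with illustrative field names::

    struct chapter_geometry {
            unsigned long records_per_chapter;
            unsigned long chapters_per_volume;
            unsigned long sparse_chapters_per_volume;
    };

    static void split_into_sub_geometries(const struct chapter_geometry *whole,
                                          unsigned int sample_rate,
                                          struct chapter_geometry *hook,
                                          struct chapter_geometry *non_hook)
    {
            unsigned long sample_records =
                    whole->records_per_chapter / sample_rate;

            /* Start with copies of the base geometry. */
            *hook = *whole;
            *non_hook = *whole;

            /* The hook index gets the sampled records, the non-hook
             * index gets the rest. */
            hook->records_per_chapter = sample_records;
            non_hook->records_per_chapter -= sample_records;

            /* Each sub-index is itself dense; only the non-hook index
             * drops the sparse chapters. */
            hook->sparse_chapters_per_volume = 0;
            non_hook->sparse_chapters_per_volume = 0;
            non_hook->chapters_per_volume =
                    whole->chapters_per_volume -
                    whole->sparse_chapters_per_volume;
    }

Purely as an illustration of the arithmetic, 1024 records per chapter with a sample rate of 32 would leave 32 records per chapter for the hook index and 992 for the non-hook index.
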
+ **/ +int __must_check +compute_volume_index_save_bytes006(const struct configuration *config, + size_t *num_bytes); + +#endif /* VOLUMEINDEX006_H */ diff --git a/uds/volumeStore.c b/vdo/volume-store.c similarity index 61% rename from uds/volumeStore.c rename to vdo/volume-store.c index a371b1f0..0edb6780 100644 --- a/uds/volumeStore.c +++ b/vdo/volume-store.c @@ -1,31 +1,14 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * Copyright Red Hat - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/uds-releases/krusty/src/uds/volumeStore.c#21 $ */ #include "geometry.h" -#include "indexLayout.h" +#include "index-layout.h" #include "logger.h" -#include "volumeStore.h" +#include "volume-store.h" -/**********************************************************************/ void close_volume_store(struct volume_store *volume_store) { if (volume_store->vs_client != NULL) { @@ -34,21 +17,17 @@ void close_volume_store(struct volume_store *volume_store) } } -/**********************************************************************/ void destroy_volume_page(struct volume_page *volume_page) { release_volume_page(volume_page); } -/**********************************************************************/ -int initialize_volume_page(const struct geometry *geometry, - struct volume_page *volume_page) +int initialize_volume_page(size_t page_size, struct volume_page *volume_page) { volume_page->vp_buffer = NULL; return UDS_SUCCESS; } -/**********************************************************************/ int open_volume_store(struct volume_store *volume_store, struct index_layout *layout, unsigned int reserved_buffers __maybe_unused, @@ -58,7 +37,6 @@ int open_volume_store(struct volume_store *volume_store, &volume_store->vs_client); } -/**********************************************************************/ void prefetch_volume_pages(const struct volume_store *vs __maybe_unused, unsigned int physical_page __maybe_unused, unsigned int page_count __maybe_unused) @@ -66,7 +44,6 @@ void prefetch_volume_pages(const struct volume_store *vs __maybe_unused, dm_bufio_prefetch(vs->vs_client, physical_page, page_count); } -/**********************************************************************/ int prepare_to_write_volume_page(const struct volume_store *volume_store __maybe_unused, unsigned int physical_page __maybe_unused, @@ -75,6 +52,7 @@ int prepare_to_write_volume_page(const struct volume_store *volume_store { struct dm_buffer *buffer = NULL; byte *data; + release_volume_page(volume_page); data = dm_bufio_new(volume_store->vs_client, physical_page, &buffer); if (IS_ERR(data)) { @@ -84,12 +62,12 @@ int prepare_to_write_volume_page(const struct volume_store *volume_store return UDS_SUCCESS; } -/**********************************************************************/ int read_volume_page(const struct volume_store *volume_store, 
unsigned int physical_page, struct volume_page *volume_page) { byte *data; + release_volume_page(volume_page); data = dm_bufio_read(volume_store->vs_client, physical_page, &volume_page->vp_buffer); @@ -101,7 +79,6 @@ int read_volume_page(const struct volume_store *volume_store, return UDS_SUCCESS; } -/**********************************************************************/ void release_volume_page(struct volume_page *volume_page __maybe_unused) { if (volume_page->vp_buffer != NULL) { @@ -110,7 +87,6 @@ void release_volume_page(struct volume_page *volume_page __maybe_unused) } } -/**********************************************************************/ void swap_volume_pages(struct volume_page *volume_page1, struct volume_page *volume_page2) { @@ -119,7 +95,6 @@ void swap_volume_pages(struct volume_page *volume_page1, *volume_page2 = temp; } -/**********************************************************************/ int sync_volume_store(const struct volume_store *volume_store) { int result = -dm_bufio_write_dirty_buffers(volume_store->vs_client); @@ -130,7 +105,6 @@ int sync_volume_store(const struct volume_store *volume_store) return UDS_SUCCESS; } -/**********************************************************************/ int write_volume_page(const struct volume_store *volume_store, unsigned int physical_page, struct volume_page *volume_page) diff --git a/uds/volumeStore.h b/vdo/volume-store.h similarity index 80% rename from uds/volumeStore.h rename to vdo/volume-store.h index 1fe41c21..9c572b1f 100644 --- a/uds/volumeStore.h +++ b/vdo/volume-store.h @@ -1,22 +1,6 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ /* * Copyright Red Hat - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/uds-releases/krusty/src/uds/volumeStore.h#8 $ */ #ifndef VOLUME_STORE_H @@ -24,11 +8,10 @@ #include "common.h" #include "compiler.h" -#include "memoryAlloc.h" +#include "memory-alloc.h" #include -struct geometry; struct index_layout; @@ -71,12 +54,12 @@ get_page_data(const struct volume_page *volume_page) /** * Initialize a volume page buffer. 
* - * @param geometry The volume geometry + * @param page_size The size of the page in bytes * @param volume_page The volume page buffer * * @return UDS_SUCCESS or an error status **/ -int __must_check initialize_volume_page(const struct geometry *geometry, +int __must_check initialize_volume_page(size_t page_size, struct volume_page *volume_page); /** diff --git a/uds/volume.c b/vdo/volume.c similarity index 75% rename from uds/volume.c rename to vdo/volume.c index a06be828..c4c6de40 100644 --- a/uds/volume.c +++ b/vdo/volume.c @@ -1,79 +1,41 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * Copyright Red Hat - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/uds-releases/krusty/src/uds/volume.c#58 $ */ #include "volume.h" -#include "cacheCounters.h" -#include "chapterIndex.h" +#include "chapter-index.h" #include "compiler.h" +#include "config.h" #include "errors.h" #include "geometry.h" -#include "hashUtils.h" -#include "indexConfig.h" +#include "hash-utils.h" +#include "index.h" #include "logger.h" -#include "memoryAlloc.h" +#include "memory-alloc.h" #include "permassert.h" -#include "recordPage.h" -#include "request.h" -#include "sparseCache.h" -#include "stringUtils.h" +#include "record-page.h" +#include "sparse-cache.h" +#include "string-utils.h" #include "uds-threads.h" enum { - MAX_BAD_CHAPTERS = 100, // max number of contiguous bad - // chapters - DEFAULT_VOLUME_READ_THREADS = 2, // Default number of reader threads - MAX_VOLUME_READ_THREADS = 16, // Maximum number of reader threads + MAX_BAD_CHAPTERS = 100, /* max number of contiguous bad chapters */ }; -/**********************************************************************/ -static unsigned int get_read_threads(const struct uds_parameters *user_params) -{ - unsigned int read_threads = - (user_params == NULL ? 
DEFAULT_VOLUME_READ_THREADS : - user_params->read_threads); - if (read_threads < 1) { - read_threads = 1; - } - if (read_threads > MAX_VOLUME_READ_THREADS) { - read_threads = MAX_VOLUME_READ_THREADS; - } - return read_threads; -} - -/**********************************************************************/ static INLINE unsigned int map_to_page_number(struct geometry *geometry, unsigned int physical_page) { return ((physical_page - 1) % geometry->pages_per_chapter); } -/**********************************************************************/ static INLINE unsigned int map_to_chapter_number(struct geometry *geometry, unsigned int physical_page) { return ((physical_page - 1) / geometry->pages_per_chapter); } -/**********************************************************************/ static INLINE bool is_record_page(struct geometry *geometry, unsigned int physical_page) { @@ -81,23 +43,22 @@ static INLINE bool is_record_page(struct geometry *geometry, geometry->index_pages_per_chapter); } -/**********************************************************************/ static INLINE unsigned int get_zone_number(struct uds_request *request) { return (request == NULL) ? 0 : request->zone_number; } -/**********************************************************************/ int map_to_physical_page(const struct geometry *geometry, int chapter, int page) { - // Page zero is the header page, so the first index page in the - // first chapter is physical page one. + /* + * Page zero is the header page, so the first index page in the + * first chapter is physical page one. + */ return (1 + (geometry->pages_per_chapter * chapter) + page); } -/**********************************************************************/ static void wait_for_read_queue_not_full(struct volume *volume, struct uds_request *request) { @@ -106,10 +67,12 @@ static void wait_for_read_queue_not_full(struct volume *volume, get_invalidate_counter(volume->page_cache, zone_number); if (search_pending(invalidate_counter)) { - // Increment the invalidate counter to avoid deadlock where the - // reader threads cannot make progress because they are waiting - // on the counter and the index thread cannot because the read - // queue is full. + /* + * Increment the invalidate counter to avoid deadlock where the + * reader threads cannot make progress because they are waiting + * on the counter and the index thread cannot because the read + * queue is full. + */ end_pending_search(volume->page_cache, zone_number); } @@ -121,29 +84,32 @@ static void wait_for_read_queue_not_full(struct volume *volume, } if (search_pending(invalidate_counter)) { - // Increment again so we get back to an odd value. + /* Increment again so we get back to an odd value. */ begin_pending_search(volume->page_cache, page_being_searched(invalidate_counter), zone_number); } } -/**********************************************************************/ int enqueue_page_read(struct volume *volume, struct uds_request *request, int physical_page) { int result; - // Don't allow new requests if we are shutting down, but make sure - // to process any requests that are still in the pipeline. + /* + * Don't allow new requests if we are shutting down, but make sure + * to process any requests that are still in the pipeline. + */ if ((volume->reader_state & READER_STATE_EXIT) != 0) { uds_log_info("failed to queue read while shutting down"); return -EBUSY; } - // Mark the page as queued in the volume cache, for chapter - // invalidation to be able to cancel a read. 
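
The page-numbering helpers near the top of volume.c all follow from the rule that physical page zero holds the volume header, so chapter pages start at physical page one and each chapter lays out its index pages before its record pages. A standalone sketch of those mappings::

    #include <stdbool.h>

    struct chapter_layout {
            unsigned int pages_per_chapter;
            unsigned int index_pages_per_chapter;
    };

    /* Physical page 0 is the volume header, so chapter pages start at 1. */
    static unsigned int to_physical_page(const struct chapter_layout *layout,
                                         unsigned int chapter,
                                         unsigned int page)
    {
            return 1 + (layout->pages_per_chapter * chapter) + page;
    }

    static unsigned int to_chapter_number(const struct chapter_layout *layout,
                                          unsigned int physical_page)
    {
            return (physical_page - 1) / layout->pages_per_chapter;
    }

    static unsigned int to_page_number(const struct chapter_layout *layout,
                                       unsigned int physical_page)
    {
            return (physical_page - 1) % layout->pages_per_chapter;
    }

    /* Index pages come first in a chapter; the rest are record pages. */
    static bool page_is_record_page(const struct chapter_layout *layout,
                                    unsigned int physical_page)
    {
            return to_page_number(layout, physical_page) >=
                    layout->index_pages_per_chapter;
    }
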
If we are unable to do - // this because the queues are full, flush them first + /* + * Mark the page as queued in the volume cache, for chapter + * invalidation to be able to cancel a read. If we are unable to do + * this because the queues are full, flush them first + */ while ((result = enqueue_read(volume->page_cache, request, physical_page)) == UDS_SUCCESS) { @@ -159,7 +125,6 @@ int enqueue_page_read(struct volume *volume, return result; } -/**********************************************************************/ static INLINE void wait_to_reserve_read_queue_entry(struct volume *volume, unsigned int *queue_pos, @@ -179,7 +144,6 @@ wait_to_reserve_read_queue_entry(struct volume *volume, } } -/**********************************************************************/ static int init_chapter_index_page(const struct volume *volume, byte *index_page, unsigned int chapter, @@ -188,7 +152,8 @@ static int init_chapter_index_page(const struct volume *volume, { uint64_t ci_virtual; unsigned int ci_chapter; - struct index_page_bounds bounds; + unsigned int lowest_list; + unsigned int highest_list; struct geometry *geometry = volume->geometry; int result = initialize_chapter_index_page(chapter_index_page, @@ -203,27 +168,26 @@ static int init_chapter_index_page(const struct volume *volume, chapter, index_page_number); } - result = get_list_number_bounds(volume->index_page_map, chapter, - index_page_number, &bounds); - if (result != UDS_SUCCESS) { - return result; - } - + get_list_number_bounds(volume->index_page_map, + chapter, + index_page_number, + &lowest_list, + &highest_list); ci_virtual = chapter_index_page->virtual_chapter_number; ci_chapter = map_to_physical_chapter(geometry, ci_virtual); if ((chapter == ci_chapter) && - (bounds.lowest_list == chapter_index_page->lowest_list_number) && - (bounds.highest_list == chapter_index_page->highest_list_number)) { + (lowest_list == chapter_index_page->lowest_list_number) && + (highest_list == chapter_index_page->highest_list_number)) { return UDS_SUCCESS; } uds_log_warning("Index page map updated to %llu", - (unsigned long long) get_last_update(volume->index_page_map)); + (unsigned long long) volume->index_page_map->last_update); uds_log_warning("Page map expects that chapter %u page %u has range %u to %u, but chapter index page has chapter %llu with range %u to %u", chapter, index_page_number, - bounds.lowest_list, - bounds.highest_list, + lowest_list, + highest_list, (unsigned long long) ci_virtual, chapter_index_page->lowest_list_number, chapter_index_page->highest_list_number); @@ -232,7 +196,6 @@ static int init_chapter_index_page(const struct volume *volume, "index page map mismatch with chapter index"); } -/**********************************************************************/ static int initialize_index_page(const struct volume *volume, unsigned int physical_page, struct cached_page *page) @@ -250,7 +213,47 @@ static int initialize_index_page(const struct volume *volume, return result; } -/**********************************************************************/ +static int search_page(struct cached_page *page, + const struct volume *volume, + struct uds_request *request, + bool record_page) +{ + int result; + enum uds_index_region location; + int record_page_number; + + if (record_page) { + if (search_record_page(get_page_data(&page->cp_page_data), + &request->chunk_name, + volume->geometry, + &request->old_metadata)) { + location = UDS_LOCATION_RECORD_PAGE_LOOKUP; + } else { + location = UDS_LOCATION_UNAVAILABLE; + } + } else { + result = 
search_chapter_index_page(&page->cp_index_page, + volume->geometry, + &request->chunk_name, + &record_page_number); + if (result != UDS_SUCCESS) { + return result; + } + + if (record_page_number == NO_CHAPTER_INDEX_ENTRY) { + location = UDS_LOCATION_UNAVAILABLE; + } else { + location = UDS_LOCATION_INDEX_PAGE_LOOKUP; + *((int *) &request->old_metadata) = + record_page_number; + } + } + + request->location = location; + request->found = false; + return UDS_SUCCESS; +} + static void read_thread_function(void *arg) { struct volume *volume = arg; @@ -265,6 +268,7 @@ static void read_thread_function(void *arg) bool record_page; struct cached_page *page = NULL; int result = UDS_SUCCESS; + wait_to_reserve_read_queue_entry(volume, &queue_pos, &request_list, @@ -279,8 +283,10 @@ static void read_thread_function(void *arg) record_page = is_record_page(volume->geometry, physical_page); if (!invalid) { - // Find a place to put the read queue page we reserved - // above. + /* + * Find a place to put the read queue page we reserved + * above. + */ result = select_victim_in_cache(volume->page_cache, &page); if (result == UDS_SUCCESS) { @@ -348,36 +354,31 @@ static void read_thread_function(void *arg) while (request_list != NULL) { struct uds_request *request = request_list; + request_list = request->next_request; /* * If we've read in a record page, we're going to do an - * immediate search, in an attempt to speed up - * processing when we requeue the request, so that it - * doesn't have to go back into the - * get_record_from_zone code again. However, if we've - * just read in an index page, we don't want to search. - * We want the request to be processed again and - * get_record_from_zone to be run. We have added new - * fields in request to allow the index code to know - * whether it can stop processing before - * get_record_from_zone is called again. + * immediate search, to speed up processing by avoiding + * get_record_from_zone, and to ensure that requests + * make progress even when queued. If we've read in an + * index page, we save the record page number so we + * don't have to resolve the index page again. We use + * the location, virtual_chapter, and old_metadata + * fields in the request to allow the index code to + * know where to begin processing the request again. 
*/ - if ((result == UDS_SUCCESS) && (page != NULL) && - record_page) { - if (search_record_page(get_page_data(&page->cp_page_data), - &request->chunk_name, - volume->geometry, - &request->old_metadata)) { - request->location = UDS_LOCATION_IN_DENSE; - } else { - request->location = UDS_LOCATION_UNAVAILABLE; - } + if ((result == UDS_SUCCESS) && (page != NULL)) { + result = search_page(page, + volume, + request, + record_page); } - // reflect any read failures in the request status + /* reflect any read failures in the request status */ request->status = result; - restart_request(request); + request->requeued = true; + enqueue_request(request, STAGE_INDEX); } release_read_queue_entry(volume->page_cache, queue_pos); @@ -389,22 +390,17 @@ static void read_thread_function(void *arg) uds_log_debug("reader done"); } -/**********************************************************************/ static int read_page_locked(struct volume *volume, struct uds_request *request, unsigned int physical_page, - bool sync_read, struct cached_page **page_ptr) { int result = UDS_SUCCESS; struct cached_page *page = NULL; - - sync_read |= ((volume->lookup_mode == LOOKUP_FOR_REBUILD) || - (request == NULL) || (request->session == NULL)); - + bool sync_read = ((request == NULL) || (request->session == NULL)); if (sync_read) { - // Find a place to put the page. + /* Find a place to put the page. */ result = select_victim_in_cache(volume->page_cache, &page); if (result != UDS_SUCCESS) { uds_log_warning("Error selecting cache victim for page read"); @@ -456,27 +452,23 @@ static int read_page_locked(struct volume *volume, return UDS_SUCCESS; } -/**********************************************************************/ int get_volume_page_locked(struct volume *volume, - struct uds_request *request, unsigned int physical_page, - enum cache_probe_type probe_type, struct cached_page **page_ptr) { struct cached_page *page = NULL; - int result = get_page_from_cache(volume->page_cache, physical_page, - probe_type, &page); + int result = get_page_from_cache(volume->page_cache, + physical_page, + &page); if (result != UDS_SUCCESS) { return result; } if (page == NULL) { - result = read_page_locked(volume, request, physical_page, true, - &page); + result = read_page_locked(volume, NULL, physical_page, &page); if (result != UDS_SUCCESS) { return result; } - } else if (get_zone_number(request) == 0) { - // Only 1 zone is responsible for updating LRU + } else { make_page_most_recent(volume->page_cache, page); } @@ -484,11 +476,9 @@ int get_volume_page_locked(struct volume *volume, return UDS_SUCCESS; } -/**********************************************************************/ int get_volume_page_protected(struct volume *volume, struct uds_request *request, unsigned int physical_page, - enum cache_probe_type probe_type, struct cached_page **page_ptr) { unsigned int zone_number; @@ -496,15 +486,16 @@ int get_volume_page_protected(struct volume *volume, int result = get_page_from_cache(volume->page_cache, physical_page, - probe_type | CACHE_PROBE_IGNORE_FAILURE, &page); if (result != UDS_SUCCESS) { return result; } zone_number = get_zone_number(request); - // If we didn't find a page we need to enqueue a read for it, in which - // case we need to grab the mutex. + /* + * If we didn't find a page we need to enqueue a read for it, in which + * case we need to grab the mutex. 
+ */ if (page == NULL) { end_pending_search(volume->page_cache, zone_number); uds_lock_mutex(&volume->read_threads_mutex); @@ -519,8 +510,9 @@ int get_volume_page_protected(struct volume *volume, * already in the cache, which would mean we end up with two * entries in the cache for the same page. */ - result = get_page_from_cache(volume->page_cache, physical_page, - probe_type, &page); + result = get_page_from_cache(volume->page_cache, + physical_page, + &page); if (result != UDS_SUCCESS) { /* * In non-success cases (anything not UDS_SUCCESS, @@ -546,8 +538,10 @@ int get_volume_page_protected(struct volume *volume, return result; } - // If we found the page now, we can release the mutex and - // proceed as if this were the fast case. + /* + * If we found the page now, we can release the mutex and + * proceed as if this were the fast case. + */ if (page != NULL) { /* * If we found a page (*page_ptr != NULL and return @@ -566,7 +560,7 @@ int get_volume_page_protected(struct volume *volume, if (page == NULL) { result = read_page_locked(volume, request, physical_page, - false, &page); + &page); if (result != UDS_SUCCESS) { /* * This code path is used frequently in the UDS_QUEUED @@ -581,13 +575,13 @@ int get_volume_page_protected(struct volume *volume, return result; } - // See above re: ordering requirement. + /* See above re: ordering requirement. */ begin_pending_search(volume->page_cache, physical_page, zone_number); uds_unlock_mutex(&volume->read_threads_mutex); } else { if (get_zone_number(request) == 0) { - // Only 1 zone is responsible for updating LRU + /* Only 1 zone is responsible for updating LRU */ make_page_most_recent(volume->page_cache, page); } } @@ -596,11 +590,9 @@ int get_volume_page_protected(struct volume *volume, return UDS_SUCCESS; } -/**********************************************************************/ int get_volume_page(struct volume *volume, unsigned int chapter, unsigned int page_number, - enum cache_probe_type probe_type, byte **data_ptr, struct delta_index_page **index_page_ptr) { @@ -610,8 +602,7 @@ int get_volume_page(struct volume *volume, map_to_physical_page(volume->geometry, chapter, page_number); uds_lock_mutex(&volume->read_threads_mutex); - result = get_volume_page_locked(volume, NULL, physical_page, - probe_type, &page); + result = get_volume_page_locked(volume, physical_page, &page); uds_unlock_mutex(&volume->read_threads_mutex); if (data_ptr != NULL) { @@ -667,7 +658,6 @@ static int search_cached_index_page(struct volume *volume, result = get_volume_page_protected(volume, request, physical_page, - cache_probe_type(request, true), &page); if (result != UDS_SUCCESS) { end_pending_search(volume->page_cache, zone_number); @@ -689,7 +679,6 @@ static int search_cached_index_page(struct volume *volume, return result; } -/**********************************************************************/ int search_cached_record_page(struct volume *volume, struct uds_request *request, const struct uds_chunk_name *name, @@ -706,7 +695,7 @@ int search_cached_record_page(struct volume *volume, *found = false; if (record_page_number == NO_CHAPTER_INDEX_ENTRY) { - // No record for that name can exist in the chapter. + /* No record for that name can exist in the chapter. 
*/ return UDS_SUCCESS; } @@ -738,7 +727,6 @@ int search_cached_record_page(struct volume *volume, result = get_volume_page_protected(volume, request, physical_page, - cache_probe_type(request, false), &record_page); if (result != UDS_SUCCESS) { end_pending_search(volume->page_cache, zone_number); @@ -755,7 +743,6 @@ int search_cached_record_page(struct volume *volume, return UDS_SUCCESS; } -/**********************************************************************/ int read_chapter_index_from_volume(const struct volume *volume, uint64_t virtual_chapter, struct volume_page volume_pages[], @@ -773,7 +760,8 @@ int read_chapter_index_from_volume(const struct volume *volume, physical_page, geometry->index_pages_per_chapter); - result = initialize_volume_page(geometry, &volume_page); + result = initialize_volume_page(geometry->bytes_per_page, + &volume_page); for (i = 0; i < geometry->index_pages_per_chapter; i++) { byte *index_page; int result = read_volume_page(&volume->volume_store, @@ -796,7 +784,6 @@ int read_chapter_index_from_volume(const struct volume *volume, return result; } -/**********************************************************************/ int search_volume_page_cache(struct volume *volume, struct uds_request *request, const struct uds_chunk_name *name, @@ -804,41 +791,41 @@ int search_volume_page_cache(struct volume *volume, struct uds_chunk_data *metadata, bool *found) { + int result; unsigned int physical_chapter = map_to_physical_chapter(volume->geometry, virtual_chapter); unsigned int index_page_number; int record_page_number; - int result = find_index_page_number(volume->index_page_map, - name, - physical_chapter, - &index_page_number); - if (result != UDS_SUCCESS) { - return result; - } - result = search_cached_index_page(volume, - request, - name, - physical_chapter, - index_page_number, - &record_page_number); - if (result == UDS_SUCCESS) { - result = search_cached_record_page(volume, - request, + index_page_number = find_index_page_number(volume->index_page_map, name, - physical_chapter, - record_page_number, - metadata, - found); + physical_chapter); + + if ((request != NULL) && + (request->location == UDS_LOCATION_INDEX_PAGE_LOOKUP)) { + record_page_number = *((int *) &request->old_metadata); + } else { + result = search_cached_index_page(volume, + request, + name, + physical_chapter, + index_page_number, + &record_page_number); + if (result != UDS_SUCCESS) { + return result; + } } - return result; + return search_cached_record_page(volume, + request, + name, + physical_chapter, + record_page_number, + metadata, + found); } -/**********************************************************************/ -int forget_chapter(struct volume *volume, - uint64_t virtual_chapter, - enum invalidation_reason reason) +int forget_chapter(struct volume *volume, uint64_t virtual_chapter) { int result; unsigned int physical_chapter = @@ -848,8 +835,7 @@ int forget_chapter(struct volume *volume, uds_lock_mutex(&volume->read_threads_mutex); result = invalidate_page_cache_for_chapter(volume->page_cache, physical_chapter, - volume->geometry->pages_per_chapter, - reason); + volume->geometry->pages_per_chapter); uds_unlock_mutex(&volume->read_threads_mutex); return result; } @@ -873,14 +859,15 @@ static int donate_index_page_locked(struct volume *volume, physical_chapter, index_page_number); - // Find a place to put the page. + /* Find a place to put the page. 
*/ struct cached_page *page = NULL; int result = select_victim_in_cache(volume->page_cache, &page); + if (result != UDS_SUCCESS) { return result; } - // Exchange the scratch page with the cache page + /* Exchange the scratch page with the cache page */ swap_volume_pages(&page->cp_page_data, scratch_page); result = init_chapter_index_page(volume, @@ -904,7 +891,6 @@ static int donate_index_page_locked(struct volume *volume, return UDS_SUCCESS; } -/**********************************************************************/ int write_index_pages(struct volume *volume, int physical_page, struct open_chapter_index *chapter_index, @@ -917,6 +903,7 @@ int write_index_pages(struct volume *volume, unsigned int delta_list_number = 0; unsigned int index_page_number; + for (index_page_number = 0; index_page_number < geometry->index_pages_per_chapter; index_page_number++) { @@ -932,7 +919,7 @@ int write_index_pages(struct volume *volume, "failed to prepare index page"); } - // Pack as many delta lists into the index page as will fit. + /* Pack as many delta lists into the index page as will fit. */ last_page = ((index_page_number + 1) == geometry->index_pages_per_chapter); result = pack_open_chapter_index_page(chapter_index, @@ -959,8 +946,10 @@ int write_index_pages(struct volume *volume, geometry->bytes_per_page); } - // Tell the index page map the list number of the last delta - // list that was packed into the index page. + /* + * Tell the index page map the list number of the last delta + * list that was packed into the index page. + */ if (lists_packed == 0) { uds_log_debug("no delta lists packed on chapter %u page %u", physical_chapter_number, @@ -968,17 +957,14 @@ int write_index_pages(struct volume *volume, } else { delta_list_number += lists_packed; } - result = update_index_page_map(volume->index_page_map, - chapter_index->virtual_chapter_number, - physical_chapter_number, - index_page_number, - delta_list_number - 1); - if (result != UDS_SUCCESS) { - return uds_log_error_strerror(result, - "failed to update index page map"); - } - // Donate the page data for the index page to the page cache. + update_index_page_map(volume->index_page_map, + chapter_index->virtual_chapter_number, + physical_chapter_number, + index_page_number, + delta_list_number - 1); + + /* Donate the page data for the index page to the page cache. */ uds_lock_mutex(&volume->read_threads_mutex); result = donate_index_page_locked(volume, physical_chapter_number, @@ -992,7 +978,6 @@ int write_index_pages(struct volume *volume, return UDS_SUCCESS; } -/**********************************************************************/ int write_record_pages(struct volume *volume, int physical_page, const struct uds_chunk_record records[], @@ -1000,9 +985,9 @@ int write_record_pages(struct volume *volume, { unsigned int record_page_number; struct geometry *geometry = volume->geometry; - // The record array from the open chapter is 1-based. + /* The record array from the open chapter is 1-based. */ const struct uds_chunk_record *next_record = &records[1]; - // Skip over the index pages, which come before the record pages + /* Skip over the index pages, which come before the record pages */ physical_page += geometry->index_pages_per_chapter; for (record_page_number = 0; @@ -1018,8 +1003,10 @@ int write_record_pages(struct volume *volume, "failed to prepare record page"); } - // Sort the next page of records and copy them to the record - // page as a binary tree stored in heap order. 
+ /* + * Sort the next page of records and copy them to the record + * page as a binary tree stored in heap order. + */ result = encode_record_page(volume, next_record, get_page_data(&volume->scratch_page)); if (result != UDS_SUCCESS) { @@ -1046,12 +1033,11 @@ int write_record_pages(struct volume *volume, return UDS_SUCCESS; } -/**********************************************************************/ int write_chapter(struct volume *volume, struct open_chapter_index *chapter_index, const struct uds_chunk_record records[]) { - // Determine the position of the virtual chapter in the volume file. + /* Determine the position of the virtual chapter in the volume file. */ struct geometry *geometry = volume->geometry; unsigned int physical_chapter_number = map_to_physical_chapter(geometry, @@ -1059,33 +1045,32 @@ int write_chapter(struct volume *volume, int physical_page = map_to_physical_page(geometry, physical_chapter_number, 0); - // Pack and write the delta chapter index pages to the volume. + /* Pack and write the delta chapter index pages to the volume. */ int result = write_index_pages(volume, physical_page, chapter_index, NULL); if (result != UDS_SUCCESS) { return result; } - // Sort and write the record pages to the volume. + /* Sort and write the record pages to the volume. */ result = write_record_pages(volume, physical_page, records, NULL); if (result != UDS_SUCCESS) { return result; } release_volume_page(&volume->scratch_page); - // Flush the data to permanent storage. + /* Flush the data to permanent storage. */ return sync_volume_store(&volume->volume_store); } -/**********************************************************************/ size_t get_cache_size(struct volume *volume) { size_t size = get_page_cache_size(volume->page_cache); - if (is_sparse(volume->geometry)) { + + if (is_sparse_geometry(volume->geometry)) { size += get_sparse_cache_memory_size(volume->sparse_cache); } return size; } -/**********************************************************************/ static int probe_chapter(struct volume *volume, unsigned int chapter_number, uint64_t *virtual_chapter_number) @@ -1104,7 +1089,6 @@ static int probe_chapter(struct volume *volume, int result = get_volume_page(volume, chapter_number, i, - CACHE_PROBE_INDEX_FIRST, NULL, &page); if (result != UDS_SUCCESS) { @@ -1120,14 +1104,14 @@ static int probe_chapter(struct volume *volume, i, (unsigned long long) last_vcn, (unsigned long long) vcn); - return UDS_CORRUPT_COMPONENT; + return UDS_CORRUPT_DATA; } if (expected_list_number != page->lowest_list_number) { uds_log_error("inconsistent chapter %u index page %u: expected list number %u, got list number %u", chapter_number, i, expected_list_number, page->lowest_list_number); - return UDS_CORRUPT_COMPONENT; + return UDS_CORRUPT_DATA; } expected_list_number = page->highest_list_number + 1; @@ -1140,20 +1124,19 @@ static int probe_chapter(struct volume *volume, if (last_vcn == UINT64_MAX) { uds_log_error("no chapter %u virtual chapter number determined", chapter_number); - return UDS_CORRUPT_COMPONENT; + return UDS_CORRUPT_DATA; } if (chapter_number != map_to_physical_chapter(geometry, last_vcn)) { uds_log_error("chapter %u vcn %llu is out of phase (%u)", chapter_number, (unsigned long long) last_vcn, geometry->chapters_per_volume); - return UDS_CORRUPT_COMPONENT; + return UDS_CORRUPT_DATA; } *virtual_chapter_number = last_vcn; return UDS_SUCCESS; } -/**********************************************************************/ static int probe_wrapper(void *aux, unsigned int 
chapter_number, uint64_t *virtual_chapter_number) @@ -1161,15 +1144,13 @@ static int probe_wrapper(void *aux, struct volume *volume = aux; int result = probe_chapter(volume, chapter_number, virtual_chapter_number); - if ((result == UDS_CORRUPT_COMPONENT) || - (result == UDS_CORRUPT_DATA)) { + if (result == UDS_CORRUPT_DATA) { *virtual_chapter_number = UINT64_MAX; return UDS_SUCCESS; } return result; } -/**********************************************************************/ static int find_real_end_of_volume(struct volume *volume, unsigned int limit, unsigned int *limit_ptr) @@ -1182,17 +1163,19 @@ static int find_real_end_of_volume(struct volume *volume, */ unsigned int span = 1; unsigned int tries = 0; + while (limit > 0) { unsigned int chapter = (span > limit) ? 0 : limit - span; uint64_t vcn = 0; int result = probe_chapter(volume, chapter, &vcn); + if (result == UDS_SUCCESS) { if (span == 1) { break; } span /= 2; tries = 0; - } else if (result == UDS_CORRUPT_COMPONENT) { + } else if (result == UDS_CORRUPT_DATA) { limit = chapter; if (++tries > 1) { span *= 2; @@ -1209,7 +1192,6 @@ static int find_real_end_of_volume(struct volume *volume, return UDS_SUCCESS; } -/**********************************************************************/ int find_volume_chapter_boundaries(struct volume *volume, uint64_t *lowest_vcn, uint64_t *highest_vcn, @@ -1241,7 +1223,6 @@ int find_volume_chapter_boundaries(struct volume *volume, volume); } -/**********************************************************************/ int find_volume_chapter_boundaries_impl(unsigned int chapter_limit, unsigned int max_bad_chapters, uint64_t *lowest_vcn, @@ -1274,7 +1255,7 @@ int find_volume_chapter_boundaries_impl(unsigned int chapter_limit, * immediately preceeds the lowest one. */ - // It doesn't matter if this results in a bad spot (UINT64_MAX). + /* It doesn't matter if this results in a bad spot (UINT64_MAX). */ result = (*probe_func)(aux, 0, &zero_vcn); if (result != UDS_SUCCESS) { return result; @@ -1293,6 +1274,7 @@ int find_volume_chapter_boundaries_impl(unsigned int chapter_limit, */ if (geometry->remapped_physical > 0) { uint64_t remapped_vcn; + result = (*probe_func)(aux, geometry->remapped_physical, &remapped_vcn); @@ -1311,6 +1293,7 @@ int find_volume_chapter_boundaries_impl(unsigned int chapter_limit, while (left_chapter < right_chapter) { uint64_t probe_vcn; unsigned int chapter = (left_chapter + right_chapter) / 2; + if (chapter == moved_chapter) { chapter--; } @@ -1334,17 +1317,19 @@ int find_volume_chapter_boundaries_impl(unsigned int chapter_limit, return result; } - left_chapter %= chapter_limit; // in case we're at the end + left_chapter %= chapter_limit; /* in case we're at the end */ - // At this point, left_chapter is the chapter with the lowest virtual - // chapter number. + /* + * At this point, left_chapter is the chapter with the lowest virtual + * chapter number. + */ result = (*probe_func)(aux, left_chapter, &lowest); if (result != UDS_SUCCESS) { return result; } - // The moved chapter might be the lowest in the range. + /* The moved chapter might be the lowest in the range. */ if ((moved_chapter != UINT64_MAX) && (lowest == geometry->remapped_virtual + 1)) { lowest = geometry->remapped_virtual; @@ -1355,9 +1340,11 @@ int find_volume_chapter_boundaries_impl(unsigned int chapter_limit, return result; } - // We now circularly scan backwards, moving over any bad chapters until - // we find the chapter with the highest vcn (the first good chapter we - // encounter). 
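/*
 * The binary search above is, at heart, looking for the rotation point of
 * a circular log: the physical chapters hold monotonically increasing
 * virtual chapter numbers that wrap somewhere in the middle, the lowest
 * VCN sits at the wrap, and the highest VCN sits just before it (allowing
 * for bad chapters and the remapped chapter, which this sketch omits). A
 * minimal sketch with a uint64_t array standing in for probe_chapter();
 * find_rotation_point is an illustrative name and assumes count > 0 and
 * distinct values:
 */
#include <stddef.h>
#include <stdint.h>

/* Return the index of the smallest value in a rotated ascending array,
 * e.g. {7, 8, 9, 3, 4, 5, 6} -> 3, or 0 if the array never wraps. */
static size_t find_rotation_point(const uint64_t *vcns, size_t count)
{
	size_t left = 0;
	size_t right = count;

	while (left < right) {
		size_t middle = (left + right) / 2;

		if (vcns[middle] > vcns[0]) {
			/* Still in the run that begins at slot 0. */
			left = middle + 1;
		} else {
			/* At or past the wrap point. */
			right = middle;
		}
	}
	return left % count;	/* left == count means slot 0 is lowest */
}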
+ /* + * We now circularly scan backwards, moving over any bad chapters until + * we find the chapter with the highest vcn (the first good chapter we + * encounter). + */ while (highest == UINT64_MAX) { right_chapter = @@ -1372,7 +1359,7 @@ int find_volume_chapter_boundaries_impl(unsigned int chapter_limit, if (bad_chapters++ >= max_bad_chapters) { uds_log_error("too many bad chapters in volume: %u", bad_chapters); - return UDS_CORRUPT_COMPONENT; + return UDS_CORRUPT_DATA; } } @@ -1384,28 +1371,25 @@ int find_volume_chapter_boundaries_impl(unsigned int chapter_limit, /** * Allocate a volume. * - * @param config The configuration to use - * @param layout The index layout - * @param read_queue_max_size The maximum size of the read queue - * @param zone_count The number of zones to use - * @param new_volume A pointer to hold the new volume + * @param config The configuration to use + * @param layout The index layout + * @param new_volume A pointer to hold the new volume * * @return UDS_SUCCESS or an error code **/ static int __must_check allocate_volume(const struct configuration *config, struct index_layout *layout, - unsigned int read_queue_max_size, - unsigned int zone_count, struct volume **new_volume) { struct volume *volume; + struct geometry *geometry; unsigned int reserved_buffers; int result = UDS_ALLOCATE(1, struct volume, "volume", &volume); + if (result != UDS_SUCCESS) { return result; } volume->nonce = get_uds_volume_nonce(layout); - // It is safe to call free_volume now to clean up and close the volume result = copy_geometry(config->geometry, &volume->geometry); if (result != UDS_SUCCESS) { @@ -1413,41 +1397,42 @@ static int __must_check allocate_volume(const struct configuration *config, return uds_log_warning_strerror(result, "failed to allocate geometry: error"); } + geometry = volume->geometry; - // Need a buffer for each entry in the page cache - reserved_buffers = config->cache_chapters * - config->geometry->record_pages_per_chapter; - // And a buffer for the chapter writer + /* Need a buffer for each entry in the page cache */ + reserved_buffers = + config->cache_chapters * geometry->record_pages_per_chapter; + /* And a buffer for the chapter writer */ reserved_buffers += 1; - // And a buffer for each entry in the sparse cache - if (is_sparse(volume->geometry)) { + /* And a buffer for each entry in the sparse cache */ + if (is_sparse_geometry(geometry)) { reserved_buffers += (config->cache_chapters * - config->geometry->index_pages_per_chapter); + geometry->index_pages_per_chapter); } volume->reserved_buffers = reserved_buffers; result = open_volume_store(&volume->volume_store, layout, volume->reserved_buffers, - config->geometry->bytes_per_page); + geometry->bytes_per_page); if (result != UDS_SUCCESS) { free_volume(volume); return result; } - result = initialize_volume_page(config->geometry, + result = initialize_volume_page(geometry->bytes_per_page, &volume->scratch_page); if (result != UDS_SUCCESS) { free_volume(volume); return result; } - result = make_radix_sorter(config->geometry->records_per_page, + result = make_radix_sorter(geometry->records_per_page, &volume->radix_sorter); if (result != UDS_SUCCESS) { free_volume(volume); return result; } - result = UDS_ALLOCATE(config->geometry->records_per_page, + result = UDS_ALLOCATE(geometry->records_per_page, const struct uds_chunk_record *, "record pointers", &volume->record_pointers); @@ -1456,27 +1441,25 @@ static int __must_check allocate_volume(const struct configuration *config, return result; } - if 
(is_sparse(volume->geometry)) { - result = make_sparse_cache(volume->geometry, + if (is_sparse_geometry(geometry)) { + result = make_sparse_cache(geometry, config->cache_chapters, - zone_count, + config->zone_count, &volume->sparse_cache); if (result != UDS_SUCCESS) { free_volume(volume); return result; } } - result = make_page_cache(volume->geometry, + result = make_page_cache(geometry, config->cache_chapters, - read_queue_max_size, - zone_count, + config->zone_count, &volume->page_cache); if (result != UDS_SUCCESS) { free_volume(volume); return result; } - result = - make_index_page_map(volume->geometry, &volume->index_page_map); + result = make_index_page_map(geometry, &volume->index_page_map); if (result != UDS_SUCCESS) { free_volume(volume); return result; @@ -1486,7 +1469,6 @@ static int __must_check allocate_volume(const struct configuration *config, return UDS_SUCCESS; } -/**********************************************************************/ int __must_check replace_volume_storage(struct volume *volume, struct index_layout *layout, const char *name) @@ -1510,26 +1492,15 @@ int __must_check replace_volume_storage(struct volume *volume, volume->geometry->bytes_per_page); } -/**********************************************************************/ int make_volume(const struct configuration *config, struct index_layout *layout, - const struct uds_parameters *user_params, - unsigned int read_queue_max_size, - unsigned int zone_count, struct volume **new_volume) { unsigned int i; - unsigned int volume_read_threads = get_read_threads(user_params); struct volume *volume = NULL; int result; - if (read_queue_max_size <= volume_read_threads) { - uds_log_error("Number of read threads must be smaller than read queue"); - return UDS_INVALID_ARGUMENT; - } - - result = allocate_volume(config, layout, read_queue_max_size, - zone_count, &volume); + result = allocate_volume(config, layout, &volume); if (result != UDS_SUCCESS) { return result; } @@ -1549,9 +1520,11 @@ int make_volume(const struct configuration *config, return result; } - // Start the reader threads. If this allocation succeeds, free_volume - // knows that it needs to try and stop those threads. - result = UDS_ALLOCATE(volume_read_threads, + /* + * Start the reader threads. If this allocation succeeds, free_volume + * knows that it needs to try and stop those threads. + */ + result = UDS_ALLOCATE(config->read_threads, struct thread *, "reader threads", &volume->reader_threads); @@ -1559,7 +1532,7 @@ int make_volume(const struct configuration *config, free_volume(volume); return result; } - for (i = 0; i < volume_read_threads; i++) { + for (i = 0; i < config->read_threads; i++) { result = uds_create_thread(read_thread_function, (void *) volume, "reader", @@ -1568,7 +1541,7 @@ int make_volume(const struct configuration *config, free_volume(volume); return result; } - // We only stop as many threads as actually got started. + /* We only stop as many threads as actually got started. */ volume->num_read_threads = i + 1; } @@ -1576,19 +1549,22 @@ int make_volume(const struct configuration *config, return UDS_SUCCESS; } -/**********************************************************************/ void free_volume(struct volume *volume) { if (volume == NULL) { return; } - // If reader_threads is NULL, then we haven't set up the reader - // threads. + /* + * If reader_threads is NULL, then we haven't set up the reader + * threads. + */ if (volume->reader_threads != NULL) { unsigned int i; - // Stop the reader threads. 
It is ok if there aren't any of - // them. + /* + * Stop the reader threads. It is ok if there aren't any of + * them. + */ uds_lock_mutex(&volume->read_threads_mutex); volume->reader_state |= READER_STATE_EXIT; uds_broadcast_cond(&volume->read_threads_cond); @@ -1600,8 +1576,10 @@ void free_volume(struct volume *volume) volume->reader_threads = NULL; } - // Must close the volume store AFTER freeing the scratch page and the - // caches + /* + * Must close the volume store AFTER freeing the scratch page and the + * caches + */ destroy_volume_page(&volume->scratch_page); free_page_cache(volume->page_cache); free_sparse_cache(volume->sparse_cache); diff --git a/uds/volume.h b/vdo/volume.h similarity index 82% rename from uds/volume.h rename to vdo/volume.h index f9e3bf62..3a9a8d5e 100644 --- a/uds/volume.h +++ b/vdo/volume.h @@ -1,39 +1,22 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ /* * Copyright Red Hat - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/uds-releases/krusty/src/uds/volume.h#29 $ */ #ifndef VOLUME_H #define VOLUME_H -#include "cacheCounters.h" #include "common.h" -#include "chapterIndex.h" -#include "indexConfig.h" -#include "indexLayout.h" -#include "indexPageMap.h" -#include "pageCache.h" -#include "request.h" -#include "sparseCache.h" +#include "config.h" +#include "chapter-index.h" +#include "index-layout.h" +#include "index-page-map.h" +#include "page-cache.h" +#include "radix-sort.h" +#include "sparse-cache.h" #include "uds.h" -#include "util/radixSort.h" -#include "volumeStore.h" +#include "uds-threads.h" +#include "volume-store.h" enum reader_state { READER_STATE_RUN = 1, @@ -44,26 +27,13 @@ enum reader_state { enum index_lookup_mode { /* Always do lookups in all chapters normally. */ LOOKUP_NORMAL, - /* - * Don't do lookups in closed chapters; assume records not in the - * open chapter are always new. You don't want this normally; it's - * for programs like albfill. (Even then, with multiple runs using - * the same tag, we may actually duplicate older records, but if - * it's in a separate chapter it won't really matter.) - */ - LOOKUP_CURRENT_CHAPTER_ONLY, - /* - * Only do a subset of lookups needed when rebuilding an index. - * This cannot be set externally. - */ - LOOKUP_FOR_REBUILD + /* Only do a subset of lookups needed when rebuilding an index. */ + LOOKUP_FOR_REBUILD, }; struct volume { /* The layout of the volume */ struct geometry *geometry; - /* The configuration of the volume */ - struct configuration *config; /* The access to the volume's backing store */ struct volume_store volume_store; /* A single page used for writing to the volume */ @@ -103,21 +73,14 @@ struct volume { /** * Create a volume. * - * @param config The configuration to use. - * @param layout The index layout - * @param user_params The index session parameters. 
If NULL, the - * default session parameters will be used. - * @param read_queue_max_size The maximum size of the read queue. - * @param zone_count The number of zones to use. - * @param new_volume A pointer to hold a pointer to the new volume. + * @param config The configuration to use. + * @param layout The index layout + * @param new_volume A pointer to hold a pointer to the new volume. * * @return UDS_SUCCESS or an error code **/ int __must_check make_volume(const struct configuration *config, struct index_layout *layout, - const struct uds_parameters *user_params, - unsigned int read_queue_max_size, - unsigned int zone_count, struct volume **new_volume); /** @@ -240,13 +203,10 @@ int __must_check search_cached_record_page(struct volume *volume, * * @param volume the volume containing the chapter * @param chapter the virtual chapter number - * @param reason the reason for invalidation * * @return UDS_SUCCESS or an error code **/ -int __must_check forget_chapter(struct volume *volume, - uint64_t chapter, - enum invalidation_reason reason); +int __must_check forget_chapter(struct volume *volume, uint64_t chapter); /** * Write a chapter's worth of index pages to a volume @@ -326,17 +286,13 @@ read_chapter_index_from_volume(const struct volume *volume, * This function is only exposed for the use of unit tests. * * @param volume The volume containing the page - * @param request The request originating the search * @param physical_page The physical page number - * @param probe_type The type of cache access being done * @param entry_ptr A pointer to hold the retrieved cached entry * * @return UDS_SUCCESS or an error code **/ int __must_check get_volume_page_locked(struct volume *volume, - struct uds_request *request, unsigned int physical_page, - enum cache_probe_type probe_type, struct cached_page **entry_ptr); /** @@ -359,7 +315,6 @@ int __must_check get_volume_page_locked(struct volume *volume, * @param volume The volume containing the page * @param request The request originating the search * @param physical_page The physical page number - * @param probe_type The type of cache access being done * @param entry_ptr A pointer to hold the retrieved cached entry * * @return UDS_SUCCESS or an error code @@ -367,7 +322,6 @@ int __must_check get_volume_page_locked(struct volume *volume, int __must_check get_volume_page_protected(struct volume *volume, struct uds_request *request, unsigned int physical_page, - enum cache_probe_type probe_type, struct cached_page **entry_ptr); /** @@ -389,7 +343,6 @@ int __must_check get_volume_page_protected(struct volume *volume, * @param volume The volume containing the page * @param chapter The number of the chapter containing the page * @param page_number The number of the page - * @param probe_type The type of cache access being done * @param data_ptr Pointer to hold the retrieved page, NULL if not * wanted * @param index_page_ptr Pointer to hold the retrieved chapter index page, or @@ -400,14 +353,11 @@ int __must_check get_volume_page_protected(struct volume *volume, int __must_check get_volume_page(struct volume *volume, unsigned int chapter, unsigned int page_number, - enum cache_probe_type probe_type, byte **data_ptr, struct delta_index_page **index_page_ptr); -/**********************************************************************/ size_t __must_check get_cache_size(struct volume *volume); -/**********************************************************************/ int __must_check find_volume_chapter_boundaries_impl(unsigned int chapter_limit, unsigned int 
max_bad_chapters, diff --git a/vdo/volumeGeometry.h b/vdo/volumeGeometry.h deleted file mode 100644 index 3da51332..00000000 --- a/vdo/volumeGeometry.h +++ /dev/null @@ -1,161 +0,0 @@ -/* - * Copyright Red Hat - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/sulfur/src/c++/vdo/base/volumeGeometry.h#14 $ - */ - -#ifndef VOLUME_GEOMETRY_H -#define VOLUME_GEOMETRY_H - - -#include -#include - -#include "uds.h" - -#include "types.h" - -enum { - GEOMETRY_BLOCK_LOCATION = 0, -}; - -struct index_config { - uint32_t mem; - uint32_t checkpoint_frequency; - bool sparse; -} __packed; - -enum volume_region_id { - INDEX_REGION = 0, - DATA_REGION = 1, - VOLUME_REGION_COUNT, -}; - -struct volume_region { - /** The ID of the region */ - enum volume_region_id id; - /** - * The absolute starting offset on the device. The region continues - * until the next region begins. - */ - physical_block_number_t start_block; -} __packed; - -struct volume_geometry { - /** The release version number of this volume */ - release_version_number_t release_version; - /** The nonce of this volume */ - nonce_t nonce; - /** The uuid of this volume */ - uuid_t uuid; - /** The block offset to be applied to bios */ - block_count_t bio_offset; - /** The regions in ID order */ - struct volume_region regions[VOLUME_REGION_COUNT]; - /** The index config */ - struct index_config index_config; -} __packed; - -/** This volume geometry struct is used for sizing only */ -struct volume_geometry_4_0 { - /** The release version number of this volume */ - release_version_number_t release_version; - /** The nonce of this volume */ - nonce_t nonce; - /** The uuid of this volume */ - uuid_t uuid; - /** The regions in ID order */ - struct volume_region regions[VOLUME_REGION_COUNT]; - /** The index config */ - struct index_config index_config; -} __packed; - -/** - * Get the start of the index region from a geometry. - * - * @param geometry The geometry - * - * @return The start of the index region - **/ -static inline physical_block_number_t __must_check -vdo_get_index_region_start(struct volume_geometry geometry) -{ - return geometry.regions[INDEX_REGION].start_block; -} - -/** - * Get the start of the data region from a geometry. - * - * @param geometry The geometry - * - * @return The start of the data region - **/ -static inline physical_block_number_t __must_check -vdo_get_data_region_start(struct volume_geometry geometry) -{ - return geometry.regions[DATA_REGION].start_block; -} - -/** - * Get the size of the index region from a geometry. 
- * - * @param geometry The geometry - * - * @return the size of the index region - **/ -static inline physical_block_number_t __must_check -vdo_get_index_region_size(struct volume_geometry geometry) -{ - return vdo_get_data_region_start(geometry) - - vdo_get_index_region_start(geometry); -} - -/** - * Synchronously read a geometry block from a block device. - * - * @param bdev The block device containing the block to read - * @param geometry A volume_geometry to read into - * - * @return VDO_SUCCESS or an error code - **/ -int __must_check -vdo_read_geometry_block(struct block_device *bdev, - struct volume_geometry *geometry); - -/** - * Convert an index config to a UDS configuration, which can be used by UDS. - * - * @param index_config The index config to convert - * @param uds_config_ptr A pointer to return the UDS configuration - * - * @return VDO_SUCCESS or an error - **/ -int __must_check -vdo_index_config_to_uds_configuration(const struct index_config *index_config, - struct uds_configuration **uds_config_ptr); - -/** - * Modify the uds_parameters to match the requested index config. - * - * @param index_config The index config to convert - * @param user_params The uds_parameters to modify - **/ -void vdo_index_config_to_uds_parameters(const struct index_config *index_config, - struct uds_parameters *user_params); - -#endif // VOLUME_GEOMETRY_H diff --git a/vdo/wait-queue.c b/vdo/wait-queue.c new file mode 100644 index 00000000..3aab243b --- /dev/null +++ b/vdo/wait-queue.c @@ -0,0 +1,263 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright Red Hat + */ + +#include "wait-queue.h" + +#include "permassert.h" + +#include "status-codes.h" + +/** + * enqueue_waiter() - Add a waiter to the tail end of a wait queue. + * @queue: The queue to which to add the waiter. + * @waiter: The waiter to add to the queue. + * + * The waiter must not already be waiting in a queue. + * + * Return: VDO_SUCCESS or an error code. + */ +int enqueue_waiter(struct wait_queue *queue, struct waiter *waiter) +{ + int result = ASSERT((waiter->next_waiter == NULL), + "new waiter must not already be in a waiter queue"); + if (result != VDO_SUCCESS) { + return result; + } + + if (queue->last_waiter == NULL) { + /* + * The queue is empty, so form the initial circular list by + * self-linking the initial waiter. + */ + waiter->next_waiter = waiter; + } else { + /* Splice the new waiter in at the end of the queue. */ + waiter->next_waiter = queue->last_waiter->next_waiter; + queue->last_waiter->next_waiter = waiter; + } + /* + * In both cases, the waiter we added to the ring becomes the last + * waiter. + */ + queue->last_waiter = waiter; + queue->queue_length += 1; + return VDO_SUCCESS; +} + +/** + * transfer_all_waiters() - Transfer all waiters from one wait queue to a + * second queue, emptying the first queue. + * @from_queue: The queue containing the waiters to move. + * @to_queue: The queue that will receive the waiters from the first queue. + */ +void transfer_all_waiters(struct wait_queue *from_queue, + struct wait_queue *to_queue) +{ + /* If the source queue is empty, there's nothing to do. */ + if (!has_waiters(from_queue)) { + return; + } + + if (has_waiters(to_queue)) { + /* + * Both queues are non-empty. Splice the two circular lists + * together by swapping the next (head) pointers in the list + * tails. 
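	 *
	 * For illustration, with from = {A, B} and to = {C}:
	 *
	 *   from.last_waiter -> B -> A -> B -> [...]
	 *   to.last_waiter   -> C -> C -> [...]
	 *
	 * Swapping the two tails' next pointers produces the single ring
	 * C -> A -> B -> C; taking from's tail as the new tail then gives
	 *
	 *   to.last_waiter -> B -> C -> A -> B -> [...]
	 *
	 * so C, already in 'to', remains the oldest waiter and A and B are
	 * appended behind it in their original order.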
+ */ + struct waiter *from_head = from_queue->last_waiter->next_waiter; + struct waiter *to_head = to_queue->last_waiter->next_waiter; + + to_queue->last_waiter->next_waiter = from_head; + from_queue->last_waiter->next_waiter = to_head; + } + + to_queue->last_waiter = from_queue->last_waiter; + to_queue->queue_length += from_queue->queue_length; + initialize_wait_queue(from_queue); +} + +/** + * notify_all_waiters() - Notify all the entries waiting in a queue. + * @queue: The wait queue containing the waiters to notify. + * @callback: The function to call to notify each waiter, or NULL to invoke + * the callback field registered in each waiter. + * @context: The context to pass to the callback function. + * + * Notifies all the entries waiting in a queue to continue execution by + * invoking a callback function on each of them in turn. The queue is copied + * and emptied before invoking any callbacks, and only the waiters that were + * in the queue at the start of the call will be notified. + */ +void notify_all_waiters(struct wait_queue *queue, waiter_callback *callback, + void *context) +{ + /* + * Copy and empty the queue first, avoiding the possibility of an + * infinite loop if entries are returned to the queue by the callback + * function. + */ + struct wait_queue waiters; + + initialize_wait_queue(&waiters); + transfer_all_waiters(queue, &waiters); + + /* Drain the copied queue, invoking the callback on every entry. */ + while (notify_next_waiter(&waiters, callback, context)) { + /* All the work is done by the loop condition. */ + } +} + +/** + * get_first_waiter() - Return the waiter that is at the head end of a wait + * queue. + * @queue: The queue from which to get the first waiter. + * + * Return: The first (oldest) waiter in the queue, or NULL if the queue is + * empty. + */ +struct waiter *get_first_waiter(const struct wait_queue *queue) +{ + struct waiter *last_waiter = queue->last_waiter; + + if (last_waiter == NULL) { + /* There are no waiters, so we're done. */ + return NULL; + } + + /* + * The queue is circular, so the last entry links to the head of the + * queue. + */ + return last_waiter->next_waiter; +} + +/** + * dequeue_matching_waiters() - Remove all waiters that match based on the + * specified matching method and append them to a + * wait_queue. + * @queue: The wait queue to process. + * @match_method: The method to determine matching. + * @match_context: Contextual info for the match method. + * @matched_queue: A wait_queue to store matches. + * + * Return: VDO_SUCCESS or an error code. + */ +int dequeue_matching_waiters(struct wait_queue *queue, + waiter_match *match_method, + void *match_context, + struct wait_queue *matched_queue) +{ + struct wait_queue matched_waiters, iteration_queue; + + initialize_wait_queue(&matched_waiters); + + initialize_wait_queue(&iteration_queue); + transfer_all_waiters(queue, &iteration_queue); + while (has_waiters(&iteration_queue)) { + struct waiter *waiter = dequeue_next_waiter(&iteration_queue); + int result = VDO_SUCCESS; + + if (!match_method(waiter, match_context)) { + result = enqueue_waiter(queue, waiter); + } else { + result = enqueue_waiter(&matched_waiters, waiter); + } + if (result != VDO_SUCCESS) { + transfer_all_waiters(&matched_waiters, queue); + transfer_all_waiters(&iteration_queue, queue); + return result; + } + } + + transfer_all_waiters(&matched_waiters, matched_queue); + return VDO_SUCCESS; +} + +/** + * dequeue_next_waiter() - Remove the first waiter from the head end of a wait + * queue. 
+ * @queue: The wait queue from which to remove the first entry. + * + * The caller will be responsible for waking the waiter by invoking the + * correct callback function to resume its execution. + * + * Return: The first (oldest) waiter in the queue, or NULL if the queue is + * empty. + */ +struct waiter *dequeue_next_waiter(struct wait_queue *queue) +{ + struct waiter *first_waiter = get_first_waiter(queue); + struct waiter *last_waiter = queue->last_waiter; + + if (first_waiter == NULL) { + return NULL; + } + + if (first_waiter == last_waiter) { + /* + * The queue has a single entry, so just empty it out by nulling + * the tail. + */ + queue->last_waiter = NULL; + } else { + /* + * The queue has more than one entry, so splice the first waiter + * out of the circular queue. + */ + last_waiter->next_waiter = first_waiter->next_waiter; + } + + /* The waiter is no longer in a wait queue. */ + first_waiter->next_waiter = NULL; + queue->queue_length -= 1; + return first_waiter; +} + +/** + * notify_next_waiter() - Notify the next entry waiting in a queue. + * @queue: The wait queue containing the waiter to notify. + * @callback: The function to call to notify the waiter, or NULL to invoke the + * callback field registered in the waiter. + * @context: The context to pass to the callback function. + * + * Notifies the next entry waiting in a queue to continue execution by + * invoking a callback function on it after removing it from the queue. + * + * Return: true if there was a waiter in the queue. + */ +bool notify_next_waiter(struct wait_queue *queue, waiter_callback *callback, + void *context) +{ + struct waiter *waiter = dequeue_next_waiter(queue); + + if (waiter == NULL) { + return false; + } + + if (callback == NULL) { + callback = waiter->callback; + } + (*callback)(waiter, context); + return true; +} + +/** + * get_next_waiter() - Get the waiter after this one, for debug iteration. + * @queue: The wait queue. + * @waiter: A waiter. + * + * Return: The next waiter, or NULL. + */ +const struct waiter *get_next_waiter(const struct wait_queue *queue, + const struct waiter *waiter) +{ + struct waiter *first_waiter = get_first_waiter(queue); + + if (waiter == NULL) { + return first_waiter; + } + return ((waiter->next_waiter != first_waiter) ? waiter->next_waiter + : NULL); +} diff --git a/vdo/wait-queue.h b/vdo/wait-queue.h new file mode 100644 index 00000000..a3a394cb --- /dev/null +++ b/vdo/wait-queue.h @@ -0,0 +1,139 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright Red Hat + */ + +#ifndef WAIT_QUEUE_H +#define WAIT_QUEUE_H + +#include "compiler.h" +#include "type-defs.h" + +/** + * DOC: Wait queues. + * + * A wait queue is a circular list of entries waiting to be notified of a + * change in a condition. Keeping a circular list allows the queue structure + * to simply be a pointer to the tail (newest) entry in the queue, supporting + * constant-time enqueue and dequeue operations. A null pointer is an empty + * queue. + * + * An empty queue: + * queue0.last_waiter -> NULL + * + * A singleton queue: + * queue1.last_waiter -> entry1 -> entry1 -> [...] + * + * A three-element queue: + * queue2.last_waiter -> entry3 -> entry1 -> entry2 -> entry3 -> [...] 
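 *
 * A minimal usage sketch (struct pending_flush, flush_done(), and
 * resume_flush() are hypothetical callers, not part of this interface):
 *
 *	struct pending_flush {
 *		struct waiter waiter;
 *		void *payload;
 *	};
 *
 *	static void flush_done(struct waiter *waiter, void *context)
 *	{
 *		struct pending_flush *flush =
 *			container_of(waiter, struct pending_flush, waiter);
 *		resume_flush(flush, context);
 *	}
 *
 *	flush->waiter.callback = flush_done;
 *	result = enqueue_waiter(&queue, &flush->waiter);
 *	...
 *	notify_all_waiters(&queue, NULL, context);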
+ */ + +struct waiter; + +struct wait_queue { + /* The tail of the queue, the last (most recently added) entry */ + struct waiter *last_waiter; + /* The number of waiters currently in the queue */ + size_t queue_length; +}; + +/** + * typedef waiter_callback - Callback type for functions which will be called + * to resume processing of a waiter after it has + * been removed from its wait queue. + */ +typedef void waiter_callback(struct waiter *waiter, void *context); + +/** + * typedef waiter_match - Method type for waiter matching methods. + * + * A waiter_match method returns false if the waiter does not match. + */ +typedef bool waiter_match(struct waiter *waiter, void *context); + +/* + * The queue entry structure for entries in a wait_queue. + */ +struct waiter { + /* + * The next waiter in the queue. If this entry is the last waiter, then + * this is actually a pointer back to the head of the queue. + */ + struct waiter *next_waiter; + + /* + * Optional waiter-specific callback to invoke when waking this waiter. + */ + waiter_callback *callback; +}; + +/** + * is_waiting() -Check whether a waiter is waiting. + * @waiter: The waiter to check. + * + * Return: true if the waiter is on some wait_queue. + */ +static inline bool is_waiting(struct waiter *waiter) +{ + return (waiter->next_waiter != NULL); +} + +/** + * initialize_wait_queue() - Initialize a wait queue. + * @queue: The queue to initialize. + */ +static inline void initialize_wait_queue(struct wait_queue *queue) +{ + *queue = (struct wait_queue) { + .last_waiter = NULL, + .queue_length = 0, + }; +} + +/** + * has_waiters() - Check whether a wait queue has any entries waiting in it. + * @queue: The queue to query. + * + * Return: true if there are any waiters in the queue. + */ +static inline bool __must_check has_waiters(const struct wait_queue *queue) +{ + return (queue->last_waiter != NULL); +} + +int __must_check +enqueue_waiter(struct wait_queue *queue, struct waiter *waiter); + +void notify_all_waiters(struct wait_queue *queue, waiter_callback *callback, + void *context); + +bool notify_next_waiter(struct wait_queue *queue, waiter_callback *callback, + void *context); + +void transfer_all_waiters(struct wait_queue *from_queue, + struct wait_queue *to_queue); + +struct waiter *get_first_waiter(const struct wait_queue *queue); + +int dequeue_matching_waiters(struct wait_queue *queue, + waiter_match *match_method, + void *match_context, + struct wait_queue *matched_queue); + +struct waiter *dequeue_next_waiter(struct wait_queue *queue); + +/** + * count_waiters() - Count the number of waiters in a wait queue. + * @queue: The wait queue to query. + * + * Return: The number of waiters in the queue. + */ +static inline size_t __must_check count_waiters(const struct wait_queue *queue) +{ + return queue->queue_length; +} + +const struct waiter * __must_check +get_next_waiter(const struct wait_queue *queue, const struct waiter *waiter); + +#endif /* WAIT_QUEUE_H */ diff --git a/vdo/waitQueue.c b/vdo/waitQueue.c deleted file mode 100644 index a77bd306..00000000 --- a/vdo/waitQueue.c +++ /dev/null @@ -1,189 +0,0 @@ -/* - * Copyright Red Hat - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. 
- * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/sulfur/src/c++/vdo/base/waitQueue.c#7 $ - */ - -#include "waitQueue.h" - -#include "permassert.h" - -#include "statusCodes.h" - -/**********************************************************************/ -int enqueue_waiter(struct wait_queue *queue, struct waiter *waiter) -{ - int result = ASSERT((waiter->next_waiter == NULL), - "new waiter must not already be in a waiter queue"); - if (result != VDO_SUCCESS) { - return result; - } - - if (queue->last_waiter == NULL) { - // The queue is empty, so form the initial circular list by - // self-linking the initial waiter. - waiter->next_waiter = waiter; - } else { - // Splice the new waiter in at the end of the queue. - waiter->next_waiter = queue->last_waiter->next_waiter; - queue->last_waiter->next_waiter = waiter; - } - // In both cases, the waiter we added to the ring becomes the last - // waiter. - queue->last_waiter = waiter; - queue->queue_length += 1; - return VDO_SUCCESS; -} - -/**********************************************************************/ -void transfer_all_waiters(struct wait_queue *from_queue, - struct wait_queue *to_queue) -{ - // If the source queue is empty, there's nothing to do. - if (!has_waiters(from_queue)) { - return; - } - - if (has_waiters(to_queue)) { - // Both queues are non-empty. Splice the two circular lists - // together by swapping the next (head) pointers in the list - // tails. - struct waiter *from_head = from_queue->last_waiter->next_waiter; - struct waiter *to_head = to_queue->last_waiter->next_waiter; - to_queue->last_waiter->next_waiter = from_head; - from_queue->last_waiter->next_waiter = to_head; - } - - to_queue->last_waiter = from_queue->last_waiter; - to_queue->queue_length += from_queue->queue_length; - initialize_wait_queue(from_queue); -} - -/**********************************************************************/ -void notify_all_waiters(struct wait_queue *queue, waiter_callback *callback, - void *context) -{ - // Copy and empty the queue first, avoiding the possibility of an - // infinite loop if entries are returned to the queue by the callback - // function. - struct wait_queue waiters; - initialize_wait_queue(&waiters); - transfer_all_waiters(queue, &waiters); - - // Drain the copied queue, invoking the callback on every entry. - while (notify_next_waiter(&waiters, callback, context)) { - // All the work is done by the loop condition. - } -} - -/**********************************************************************/ -struct waiter *get_first_waiter(const struct wait_queue *queue) -{ - struct waiter *last_waiter = queue->last_waiter; - if (last_waiter == NULL) { - // There are no waiters, so we're done. - return NULL; - } - - // The queue is circular, so the last entry links to the head of the - // queue. 
- return last_waiter->next_waiter; -} - -/**********************************************************************/ -int dequeue_matching_waiters(struct wait_queue *queue, - waiter_match *match_method, - void *match_context, - struct wait_queue *matched_queue) -{ - struct wait_queue matched_waiters, iteration_queue; - initialize_wait_queue(&matched_waiters); - - initialize_wait_queue(&iteration_queue); - transfer_all_waiters(queue, &iteration_queue); - while (has_waiters(&iteration_queue)) { - struct waiter *waiter = dequeue_next_waiter(&iteration_queue); - int result = VDO_SUCCESS; - if (!match_method(waiter, match_context)) { - result = enqueue_waiter(queue, waiter); - } else { - result = enqueue_waiter(&matched_waiters, waiter); - } - if (result != VDO_SUCCESS) { - transfer_all_waiters(&matched_waiters, queue); - transfer_all_waiters(&iteration_queue, queue); - return result; - } - } - - transfer_all_waiters(&matched_waiters, matched_queue); - return VDO_SUCCESS; -} - -/**********************************************************************/ -struct waiter *dequeue_next_waiter(struct wait_queue *queue) -{ - struct waiter *first_waiter = get_first_waiter(queue); - struct waiter *last_waiter = queue->last_waiter; - if (first_waiter == NULL) { - return NULL; - } - - if (first_waiter == last_waiter) { - // The queue has a single entry, so just empty it out by nulling - // the tail. - queue->last_waiter = NULL; - } else { - // The queue has more than one entry, so splice the first waiter - // out of the circular queue. - last_waiter->next_waiter = first_waiter->next_waiter; - } - - // The waiter is no longer in a wait queue. - first_waiter->next_waiter = NULL; - queue->queue_length -= 1; - return first_waiter; -} - -/**********************************************************************/ -bool notify_next_waiter(struct wait_queue *queue, waiter_callback *callback, - void *context) -{ - struct waiter *waiter = dequeue_next_waiter(queue); - if (waiter == NULL) { - return false; - } - - if (callback == NULL) { - callback = waiter->callback; - } - (*callback)(waiter, context); - return true; -} - -/**********************************************************************/ -const struct waiter *get_next_waiter(const struct wait_queue *queue, - const struct waiter *waiter) -{ - struct waiter *first_waiter = get_first_waiter(queue); - if (waiter == NULL) { - return first_waiter; - } - return ((waiter->next_waiter != first_waiter) ? waiter->next_waiter - : NULL); -} diff --git a/vdo/waitQueue.h b/vdo/waitQueue.h deleted file mode 100644 index 12f148ee..00000000 --- a/vdo/waitQueue.h +++ /dev/null @@ -1,231 +0,0 @@ -/* - * Copyright Red Hat - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. 
- * - * $Id: //eng/vdo-releases/sulfur/src/c++/vdo/base/waitQueue.h#8 $ - */ - -#ifndef WAIT_QUEUE_H -#define WAIT_QUEUE_H - -#include "common.h" - -/** - * A wait queue is a circular list of entries waiting to be notified of a - * change in a condition. Keeping a circular list allows the queue structure - * to simply be a pointer to the tail (newest) entry in the queue, supporting - * constant-time enqueue and dequeue operations. A null pointer is an empty - * queue. - * - * An empty queue: - * queue0.last_waiter -> NULL - * - * A singleton queue: - * queue1.last_waiter -> entry1 -> entry1 -> [...] - * - * A three-element queue: - * queue2.last_waiter -> entry3 -> entry1 -> entry2 -> entry3 -> [...] - **/ - -struct waiter; - -struct wait_queue { - /** The tail of the queue, the last (most recently added) entry */ - struct waiter *last_waiter; - /** The number of waiters currently in the queue */ - size_t queue_length; -}; - -/** - * Callback type for functions which will be called to resume processing of a - * waiter after it has been removed from its wait queue. - **/ -typedef void waiter_callback(struct waiter *waiter, void *context); - -/** - * Method type for waiter matching methods. - * - * A waiter_match method returns false if the waiter does not match. - **/ -typedef bool waiter_match(struct waiter *waiter, void *context); - -/** - * The queue entry structure for entries in a wait_queue. - **/ -struct waiter { - /** - * The next waiter in the queue. If this entry is the last waiter, then - * this is actually a pointer back to the head of the queue. - **/ - struct waiter *next_waiter; - - /** - * Optional waiter-specific callback to invoke when waking this waiter. - */ - waiter_callback *callback; -}; - -/** - * Check whether a waiter is waiting. - * - * @param waiter The waiter to check - * - * @return true if the waiter is on some wait_queue - **/ -static inline bool is_waiting(struct waiter *waiter) -{ - return (waiter->next_waiter != NULL); -} - -/** - * Initialize a wait queue. - * - * @param queue The queue to initialize - **/ -static inline void initialize_wait_queue(struct wait_queue *queue) -{ - *queue = (struct wait_queue) { - .last_waiter = NULL, - .queue_length = 0, - }; -} - -/** - * Check whether a wait queue has any entries waiting in it. - * - * @param queue The queue to query - * - * @return true if there are any waiters in the queue - **/ -static inline bool __must_check has_waiters(const struct wait_queue *queue) -{ - return (queue->last_waiter != NULL); -} - -/** - * Add a waiter to the tail end of a wait queue. The waiter must not already - * be waiting in a queue. - * - * @param queue The queue to which to add the waiter - * @param waiter The waiter to add to the queue - * - * @return VDO_SUCCESS or an error code - **/ -int __must_check -enqueue_waiter(struct wait_queue *queue, struct waiter *waiter); - -/** - * Notify all the entries waiting in a queue to continue execution by invoking - * a callback function on each of them in turn. The queue is copied and - * emptied before invoking any callbacks, and only the waiters that were in - * the queue at the start of the call will be notified. 
- * - * @param queue The wait queue containing the waiters to notify - * @param callback The function to call to notify each waiter, or NULL - * to invoke the callback field registered in each waiter - * @param context The context to pass to the callback function - **/ -void notify_all_waiters(struct wait_queue *queue, waiter_callback *callback, - void *context); - -/** - * Notify the next entry waiting in a queue to continue execution by invoking - * a callback function on it after removing it from the queue. - * - * @param queue The wait queue containing the waiter to notify - * @param callback The function to call to notify the waiter, or NULL - * to invoke the callback field registered in the waiter - * @param context The context to pass to the callback function - * - * @return true if there was a waiter in the queue - **/ -bool notify_next_waiter(struct wait_queue *queue, waiter_callback *callback, - void *context); - -/** - * Transfer all waiters from one wait queue to a second queue, emptying the - * first queue. - * - * @param from_queue The queue containing the waiters to move - * @param to_queue The queue that will receive the waiters from the - * the first queue - **/ -void transfer_all_waiters(struct wait_queue *from_queue, - struct wait_queue *to_queue); - -/** - * Return the waiter that is at the head end of a wait queue. - * - * @param queue The queue from which to get the first waiter - * - * @return The first (oldest) waiter in the queue, or NULL if - * the queue is empty - **/ -struct waiter *get_first_waiter(const struct wait_queue *queue); - -/** - * Remove all waiters that match based on the specified matching method and - * append them to a wait_queue. - * - * @param queue The wait queue to process - * @param match_method The method to determine matching - * @param match_context Contextual info for the match method - * @param matched_queue A wait_queue to store matches - * - * @return VDO_SUCCESS or an error code - **/ -int dequeue_matching_waiters(struct wait_queue *queue, - waiter_match *match_method, - void *match_context, - struct wait_queue *matched_queue); - -/** - * Remove the first waiter from the head end of a wait queue. The caller will - * be responsible for waking the waiter by invoking the correct callback - * function to resume its execution. - * - * @param queue The wait queue from which to remove the first entry - * - * @return The first (oldest) waiter in the queue, or NULL if - * the queue is empty - **/ -struct waiter *dequeue_next_waiter(struct wait_queue *queue); - -/** - * Count the number of waiters in a wait queue. - * - * @param queue The wait queue to query - * - * @return the number of waiters in the queue - **/ -static inline size_t __must_check count_waiters(const struct wait_queue *queue) -{ - return queue->queue_length; -} - -/** - * Get the waiter after this one, for debug iteration. 
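/*
 * Editorial usage sketch (hypothetical client code, not part of the patch):
 * a struct waiter is embedded in the structure that needs to wait, and the
 * callback recovers the enclosing structure with container_of() when the
 * queue is notified. The struct example_request and its fields are invented
 * for illustration.
 */
struct example_request {
	struct waiter waiter;
	int result;
};

static void example_wake_request(struct waiter *waiter, void *context)
{
	struct example_request *request =
		container_of(waiter, struct example_request, waiter);

	request->result = *((int *) context);
	/* ... resume whatever processing was blocked on the resource ... */
}

static int example_wait_for_resource(struct wait_queue *queue,
				     struct example_request *request)
{
	request->waiter.callback = example_wake_request;
	return enqueue_waiter(queue, &request->waiter);
}

static void example_release_resource(struct wait_queue *queue, int status)
{
	/*
	 * Passing NULL invokes each waiter's registered callback; &status is
	 * handed to each callback as its context.
	 */
	notify_all_waiters(queue, NULL, &status);
}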
- * - * @param queue The wait queue - * @param waiter A waiter - * - * @return the next waiter, or NULL - **/ -const struct waiter * __must_check -get_next_waiter(const struct wait_queue *queue, const struct waiter *waiter); - -#endif // WAIT_QUEUE_H diff --git a/vdo/workItemStats.c b/vdo/workItemStats.c deleted file mode 100644 index 50b63285..00000000 --- a/vdo/workItemStats.c +++ /dev/null @@ -1,449 +0,0 @@ -/* - * Copyright Red Hat - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/sulfur/src/c++/vdo/kernel/workItemStats.c#7 $ - */ - -#include "workItemStats.h" - -#include -#include "logger.h" - -/**********************************************************************/ -void initialize_vdo_work_item_stats(struct vdo_work_item_stats *stats) -{ - spin_lock_init(&stats->function_table.lock); - - if (VDO_ENABLE_PER_FUNCTION_TIMING_STATS) { - int i; - for (i = 0; i < NUM_VDO_WORK_QUEUE_ITEM_STATS + 1; i++) { - initialize_vdo_simple_stats(&stats->times[i]); - } - } -} - -/**********************************************************************/ -uint64_t count_vdo_work_items_processed(const struct vdo_work_item_stats *stats) -{ - uint64_t total_processed = 0; - int i; - - for (i = 0; i < NUM_VDO_WORK_QUEUE_ITEM_STATS + 1; i++) { - total_processed += READ_ONCE(stats->times[i].count); - } - - return total_processed; -} - -/**********************************************************************/ -unsigned int count_vdo_work_items_pending(const struct vdo_work_item_stats *stats) -{ - long long pending = 0; - int i; - - for (i = 0; i < NUM_VDO_WORK_QUEUE_ITEM_STATS + 1; i++) { - pending += atomic64_read(&stats->enqueued[i]); - pending -= READ_ONCE(stats->times[i].count); - } - - // If we fetched numbers that were changing, we can get negative - // results. Returning one is an indication that there's some activity. - return (pending < 0) ? 1 : pending; -} - -/** - * Scan the work queue stats table for the provided work function and - * priority value. If it's not found, see if an empty slot is - * available. - * - * @param table The work queue's function table - * @param work The function we want to record stats for - * @param priority The priority of the work item - * - * @return The index of the slot to use (matching or empty), or - * NUM_VDO_WORK_QUEUE_ITEM_STATS if the table is full of - * non-matching entries. - **/ -static inline unsigned int -scan_stat_table(const struct vdo_work_function_table *table, - vdo_work_function work, - unsigned int priority) -{ - unsigned int i; - /* - * See comments in get_stat_table_index regarding order of memory - * accesses. Work function first, then a barrier, then priority. 
- */ - for (i = 0; i < NUM_VDO_WORK_QUEUE_ITEM_STATS; i++) { - if (table->functions[i] == NULL) { - return i; - } else if (table->functions[i] == work) { - smp_rmb(); - if (table->priorities[i] == priority) { - return i; - } - } - } - return NUM_VDO_WORK_QUEUE_ITEM_STATS; -} - -/** - * Scan the work queue stats table for the provided work function and - * priority value. Assign an empty slot if necessary. - * - * @param stats The stats structure - * @param work The function we want to record stats for - * @param priority The priority of the work item - * - * @return The index of the matching slot, or NUM_VDO_WORK_QUEUE_ITEM_STATS - * if the table is full of non-matching entries. - **/ -static unsigned int get_stat_table_index(struct vdo_work_item_stats *stats, - vdo_work_function work, - unsigned int priority) -{ - struct vdo_work_function_table *function_table = - &stats->function_table; - unsigned int index = scan_stat_table(function_table, work, priority); - - unsigned long flags = 0; - - if (unlikely(index == NUM_VDO_WORK_QUEUE_ITEM_STATS) || - likely(function_table->functions[index] != NULL)) { - return index; - } - - spin_lock_irqsave(&function_table->lock, flags); - // Recheck now that we've got the lock... - index = scan_stat_table(function_table, work, priority); - if ((index == NUM_VDO_WORK_QUEUE_ITEM_STATS) || - (function_table->functions[index] != NULL)) { - spin_unlock_irqrestore(&function_table->lock, flags); - return index; - } - - /* - * An uninitialized priority is indistinguishable from a zero - * priority. So store the priority first, and enforce the ordering, - * so that a non-null work function pointer indicates we've finished - * filling in the value. (And, to make this work, we have to read - * the work function first and priority second, when comparing.) - */ - function_table->priorities[index] = priority; - smp_wmb(); - function_table->functions[index] = work; - spin_unlock_irqrestore(&function_table->lock, flags); - return index; -} - -/** - * Get counters on work items, identified by index into the internal - * array. - * - * @param [in] stats The collected statistics - * @param [in] index The index - * @param [out] enqueued_ptr The total work items enqueued - * @param [out] processed_ptr The number of work items processed - * @param [out] pending_ptr The number of work items still pending - **/ -static void -get_work_item_counts_by_item(const struct vdo_work_item_stats *stats, - unsigned int index, - uint64_t *enqueued_ptr, - uint64_t *processed_ptr, - unsigned int *pending_ptr) -{ - uint64_t enqueued = atomic64_read(&stats->enqueued[index]); - uint64_t processed = READ_ONCE(stats->times[index].count); - unsigned int pending; - - if (enqueued < processed) { - // Probably just out of sync. - pending = 1; - } else { - pending = enqueued - processed; - // Pedantic paranoia: Check for overflow of the 32-bit - // "pending". - if ((pending + processed) < enqueued) { - pending = UINT_MAX; - } - } - *enqueued_ptr = enqueued; - *processed_ptr = processed; - *pending_ptr = pending; -} - -/** - * Get counters on work items not covered by any index value. 
- * - * @param [in] stats The collected statistics - * @param [out] enqueued_ptr The total work items enqueued - * @param [out] processed_ptr The number of work items processed - **/ -static void get_other_work_item_counts(const struct vdo_work_item_stats *stats, - uint64_t *enqueued_ptr, - uint64_t *processed_ptr) -{ - unsigned int pending; - - get_work_item_counts_by_item(stats, - NUM_VDO_WORK_QUEUE_ITEM_STATS, - enqueued_ptr, - processed_ptr, - &pending); -} - -/** - * Get timing summary stats on work items. - * - * @param [in] stats The collected statistics - * @param [out] min The minimum execution time - * @param [out] mean The mean execution time - * @param [out] max The maximum execution time - **/ -static void -summarize_work_item_times(const struct simple_stats *stats, - uint64_t *min, - uint64_t *mean, - uint64_t *max) -{ - uint64_t sum = READ_ONCE(stats->sum); - uint64_t count = READ_ONCE(stats->count); - uint64_t slop = count / 2; - uint64_t sample_average = (sum + slop) / count; - - *min = READ_ONCE(stats->min); - *mean = sample_average; - *max = READ_ONCE(stats->max); -} - -/**********************************************************************/ -void update_vdo_work_item_stats_for_enqueue(struct vdo_work_item_stats *stats, - struct vdo_work_item *item, - int priority) -{ - item->stat_table_index = get_stat_table_index(stats, - item->stats_function, - priority); - atomic64_inc(&stats->enqueued[item->stat_table_index]); -} - -/**********************************************************************/ -void vdo_get_function_name(void *pointer, char *buffer, size_t buffer_length) -{ - if (pointer == NULL) { - /* - * Format "%ps" logs a null pointer as "(null)" with a bunch of - * leading spaces. We sometimes use this when logging lots of - * data; don't be so verbose. - */ - strncpy(buffer, "-", buffer_length); - } else { - /* - * Use a non-const array instead of a string literal below to - * defeat gcc's format checking, which doesn't understand that - * "%ps" actually does support a precision spec in Linux kernel - * code. - */ - static char truncated_function_name_format_string[] = "%.*ps"; - char *space; - - snprintf(buffer, - buffer_length, - truncated_function_name_format_string, - buffer_length - 1, - pointer); - - space = strchr(buffer, ' '); - - if (space != NULL) { - *space = '\0'; - } - } -} - -/**********************************************************************/ -size_t format_vdo_work_item_stats(const struct vdo_work_item_stats *stats, - char *buffer, - size_t length) -{ - const struct vdo_work_function_table *function_ids = - &stats->function_table; - size_t current_offset = 0; - int i; - - for (i = 0; i < NUM_VDO_WORK_QUEUE_ITEM_STATS; i++) { - uint64_t enqueued, processed; - unsigned int pending; - - if (function_ids->functions[i] == NULL) { - break; - } - if (atomic64_read(&stats->enqueued[i]) == 0) { - continue; - } - /* - * The reporting of all of "pending", "enqueued" and - * "processed" here seems redundant, but "pending" is limited - * to zero in the case where "processed" exceeds "enqueued", - * either through current activity and a lack of - * synchronization when fetching stats, or a coding bug. This - * report is intended largely for debugging, so we'll go ahead - * and print the not-necessarily-redundant values. 
- */ - - get_work_item_counts_by_item(stats, - i, - &enqueued, - &processed, - &pending); - - // Format: fn prio enq proc timeo [ min max mean ] - if (VDO_ENABLE_PER_FUNCTION_TIMING_STATS) { - uint64_t min, mean, max; - - summarize_work_item_times(&stats->times[i], - &min, - &mean, - &max); - current_offset += - scnprintf(buffer + current_offset, - length - current_offset, - "%-36ps %d %10llu %10llu %10llu %10llu %10llu\n", - function_ids->functions[i], - function_ids->priorities[i], - enqueued, - processed, - min, - max, - mean); - } else { - current_offset += - scnprintf(buffer + current_offset, - length - current_offset, - "%-36ps %d %10llu %10llu\n", - function_ids->functions[i], - function_ids->priorities[i], - enqueued, - processed); - } - if (current_offset >= length) { - break; - } - } - if ((i == NUM_VDO_WORK_QUEUE_ITEM_STATS) && (current_offset < length)) { - uint64_t enqueued, processed; - - get_other_work_item_counts(stats, &enqueued, &processed); - if (enqueued > 0) { - current_offset += scnprintf(buffer + current_offset, - length - current_offset, - "%-36s %d %10llu %10llu\n", - "OTHER", - 0, - enqueued, - processed); - } - } - return current_offset; -} - -/**********************************************************************/ -void log_vdo_work_item_stats(const struct vdo_work_item_stats *stats) -{ - uint64_t total_enqueued = 0; - uint64_t total_processed = 0; - - const struct vdo_work_function_table *function_ids = - &stats->function_table; - - int i; - - for (i = 0; i < NUM_VDO_WORK_QUEUE_ITEM_STATS; i++) { - uint64_t enqueued, processed; - unsigned int pending; - char function_name[96]; // arbitrary size - - if (function_ids->functions[i] == NULL) { - break; - } - if (atomic64_read(&stats->enqueued[i]) == 0) { - continue; - } - /* - * The reporting of all of "pending", "enqueued" and - * "processed" here seems redundant, but "pending" is limited - * to zero in the case where "processed" exceeds "enqueued", - * either through current activity and a lack of - * synchronization when fetching stats, or a coding bug. This - * report is intended largely for debugging, so we'll go ahead - * and print the not-necessarily-redundant values. - */ - get_work_item_counts_by_item(stats, - i, - &enqueued, - &processed, - &pending); - total_enqueued += enqueued; - total_processed += processed; - - vdo_get_function_name(function_ids->functions[i], - function_name, - sizeof(function_name)); - - if (VDO_ENABLE_PER_FUNCTION_TIMING_STATS) { - uint64_t min, mean, max; - - summarize_work_item_times(&stats->times[i], - &min, - &mean, - &max); - uds_log_info(" priority %d: %u pending %llu enqueued %llu processed %s times %llu/%llu/%lluns", - function_ids->priorities[i], - pending, - enqueued, - processed, - function_name, - min, - mean, - max); - } else { - uds_log_info(" priority %d: %u pending %llu enqueued %llu processed %s", - function_ids->priorities[i], - pending, - enqueued, - processed, - function_name); - } - } - if (i == NUM_VDO_WORK_QUEUE_ITEM_STATS) { - uint64_t enqueued, processed; - - get_other_work_item_counts(stats, &enqueued, &processed); - if (enqueued > 0) { - total_enqueued += enqueued; - total_processed += processed; - uds_log_info(" ... 
others: %llu enqueued %llu processed", - enqueued, - processed); - } - } - uds_log_info(" total: %llu enqueued %llu processed", - total_enqueued, - total_processed); -} diff --git a/vdo/workItemStats.h b/vdo/workItemStats.h deleted file mode 100644 index d1f01e22..00000000 --- a/vdo/workItemStats.h +++ /dev/null @@ -1,246 +0,0 @@ -/* - * Copyright Red Hat - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/sulfur/src/c++/vdo/kernel/workItemStats.h#4 $ - */ - -#ifndef WORK_ITEM_STATS_H -#define WORK_ITEM_STATS_H - -#include "timeUtils.h" - -#include "workQueue.h" - -enum { - // Whether to enable tracking of per-work-function run-time stats. - VDO_ENABLE_PER_FUNCTION_TIMING_STATS = 0, - // How many work function/priority pairs to track call stats for - NUM_VDO_WORK_QUEUE_ITEM_STATS = 18, -}; - -struct simple_stats { - uint64_t count; - uint64_t sum; - uint64_t min; - uint64_t max; -}; - -/* - * We track numbers of work items handled (and optionally the - * wall-clock time to run the work functions), broken down by - * individual work functions (or alternate functions that the caller - * wants recorded, like the VIO completion callback function if we're - * just enqueueing a work function that invokes that indirectly) and - * priority. - * - * The first part of this structure manages the function/priority - * pairs, and is read frequently but updated rarely (once for each - * pair, plus possibly spin lock contention). - * - * The second part holds counters, and is updated often; different - * parts are updated by various threads as described below. The last - * element of each array, index NUM_VDO_WORK_QUEUE_ITEM_STATS, is updated - * only if we have filled the arrays and can't add the current work - * function/priority. See how the stat_table_index field is set in - * workItemStats.c. - * - * All fields may additionally be read when reporting statistics - * (including optionally reporting stats when the worker thread shuts - * down), but that's rare and shouldn't significantly affect cache - * contention issues. - * - * There is no "pending" count per work function here. For reporting - * statistics, it can be approximated by looking at the other fields. - * Do not rely on them being precise and synchronized, though. - */ -struct vdo_work_function_table { - /* - * The spin lock is used to protect .functions and .priorities - * during updates. All three are modified by producers (enqueueing - * threads) but only rarely. The .functions and .priorities arrays - * are read by producers very frequently. - */ - spinlock_t lock; - vdo_work_function functions[NUM_VDO_WORK_QUEUE_ITEM_STATS]; - uint8_t priorities[NUM_VDO_WORK_QUEUE_ITEM_STATS]; -}; - -struct vdo_work_item_stats { - /* - * Table of functions and priorities, for determining the index to - * use into the counter arrays below. 
- * - * This table is read by producers (usually multiple entries) for - * every work item enqueued, and when reporting stats. It is updated - * by producers, and only the first time a new (work-function, - * priority) combination is seen. - */ - struct vdo_work_function_table function_table; - // Skip to (somewhere on) the next cache line - char pad[CACHE_LINE_BYTES - sizeof(atomic64_t)]; - /* - * The .enqueued field is updated by producers only, once per work - * item processed; __sync operations are used to update these - * values. - */ - atomic64_t enqueued[NUM_VDO_WORK_QUEUE_ITEM_STATS + 1]; - // Skip to (somewhere on) the next cache line - char pad2[CACHE_LINE_BYTES - sizeof(atomic64_t)]; - /* - * These values are updated only by the consumer (worker thread). We - * overload the .times[].count field as a count of items processed, - * so if we're not doing the optional processing-time tracking - * (controlled via an option in workQueue.c), we need to explicitly - * update the count. - * - * Since only one thread can ever update these values, no - * synchronization is used. - */ - struct simple_stats times[NUM_VDO_WORK_QUEUE_ITEM_STATS + 1]; -}; - -/** - * Initialize a statistics structure for tracking sample values. Assumes the - * storage was already zeroed out at allocation time. - * - * @param stats The statistics structure - **/ -static inline void initialize_vdo_simple_stats(struct simple_stats *stats) -{ - // Assume other fields are initialized to zero at allocation. - stats->min = UINT64_MAX; -} - -/** - * Update the statistics being tracked for a new sample value. - * - * @param stats The statistics structure - * @param value The new value to be folded in - **/ -static inline void add_vdo_simple_stats_sample(struct simple_stats *stats, - uint64_t value) -{ - stats->count++; - stats->sum += value; - if (stats->min > value) { - stats->min = value; - } - if (stats->max < value) { - stats->max = value; - } -} - -/** - * Initialize a statistics structure for tracking work queue items. Assumes - * the storage was already zeroed out at allocation time. - * - * @param stats The statistics structure - **/ -void initialize_vdo_work_item_stats(struct vdo_work_item_stats *stats); - -/** - * Sum and return the total number of work items that have been processed. - * - * @param stats The statistics structure - * - * @return the total number of work items processed - **/ -uint64_t count_vdo_work_items_processed(const struct vdo_work_item_stats *stats); - -/** - * Compute an approximate indication of the number of pending work items. - * - * No synchronization is used, so it's guaranteed to be correct only if there - * is no activity. - * - * @param stats The statistics structure - * - * @return the estimate of the number of pending work items - **/ -unsigned int count_vdo_work_items_pending(const struct vdo_work_item_stats *stats); - -/** - * Update all work queue statistics (work-item and otherwise) after - * enqueueing a work item. - * - * @param stats The statistics structure - * @param item The work item enqueued - * @param priority The work item's priority - **/ -void update_vdo_work_item_stats_for_enqueue(struct vdo_work_item_stats *stats, - struct vdo_work_item *item, - int priority); - -/** - * Update the work queue statistics with the wall-clock time for - * processing a work item, if timing stats are enabled and if we - * haven't run out of room for recording stats in the table. - * If timing stats aren't enabled, only increments the count of - * items processed. 
- * - * @param stats The statistics structure - * @param index The work item's index into the internal array - * @param start_time The time when the item was dequeued for processing - **/ -static inline void -update_vdo_work_item_stats_for_work_time(struct vdo_work_item_stats *stats, - unsigned int index, - uint64_t start_time) -{ - if (VDO_ENABLE_PER_FUNCTION_TIMING_STATS) { - add_vdo_simple_stats_sample(&stats->times[index], - ktime_get_ns() - start_time); - } else { - // The times[].count field is used as a count of items - // processed even when functions aren't being timed. - stats->times[index].count++; - } -} - -/** - * Convert the pointer into a string representation, using a function - * name if available. - * - * @param pointer The pointer to be converted - * @param buffer The output buffer - * @param buffer_length The size of the output buffer - **/ -void vdo_get_function_name(void *pointer, char *buffer, size_t buffer_length); - -/** - * Dump statistics broken down by work function and priority into the - * kernel log. - * - * @param stats The statistics structure - **/ -void log_vdo_work_item_stats(const struct vdo_work_item_stats *stats); - -/** - * Format counters for per-work-function stats for reporting via /sys. - * - * @param [in] stats The statistics structure - * @param [out] buffer The output buffer - * @param [in] length The size of the output buffer - * - * @return The size of the string actually written - **/ -size_t format_vdo_work_item_stats(const struct vdo_work_item_stats *stats, - char *buffer, - size_t length); - -#endif // WORK_ITEM_STATS_H diff --git a/vdo/workQueue.c b/vdo/workQueue.c index 0fee1c26..c824de1c 100644 --- a/vdo/workQueue.c +++ b/vdo/workQueue.c @@ -1,22 +1,6 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * Copyright Red Hat - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/sulfur/src/c++/vdo/kernel/workQueue.c#30 $ */ #include "workQueue.h" @@ -26,133 +10,178 @@ #include #include "logger.h" -#include "memoryAlloc.h" +#include "memory-alloc.h" #include "numeric.h" #include "permassert.h" -#include "stringUtils.h" +#include "string-utils.h" -#include "statusCodes.h" - -#include "workItemStats.h" -#include "workQueueInternals.h" -#include "workQueueStats.h" -#include "workQueueSysfs.h" +#include "completion.h" +#include "status-codes.h" static DEFINE_PER_CPU(unsigned int, service_queue_rotor); -static void free_simple_work_queue(struct simple_work_queue *queue); -static void finish_simple_work_queue(struct simple_work_queue *queue); - -// Finding the simple_work_queue to actually operate on. - /** - * Pick the subordinate service queue to use, distributing the work evenly - * across them. - * - * @param queue The round-robin-type work queue + * DOC: Work queue definition. 
* - * @return A subordinate work queue - **/ -static inline struct simple_work_queue * -next_service_queue(struct round_robin_work_queue *queue) -{ + * There are two types of work queues: simple, with one worker thread, and + * round-robin, which uses a group of the former to do the work, and assigns + * work to them in round-robin fashion (roughly). Externally, both are + * represented via the same common sub-structure, though there's actually not a + * great deal of overlap between the two types internally. + */ +struct vdo_work_queue { + /* Name of just the work queue (e.g., "cpuQ12") */ + char *name; + bool round_robin_mode; + struct vdo_thread *owner; + /* Life cycle functions, etc */ + const struct vdo_work_queue_type *type; +}; + +struct simple_work_queue { + struct vdo_work_queue common; + /* A copy of .thread->pid, for safety in the sysfs support */ + pid_t thread_pid; + /* + * Number of priorities actually used, so we don't keep re-checking + * unused funnel queues. + */ + unsigned int num_priority_lists; + + struct funnel_queue *priority_lists[VDO_WORK_Q_MAX_PRIORITY + 1]; + struct task_struct *thread; + void *private; + /* In a subordinate work queue, a link back to the round-robin parent */ + struct vdo_work_queue *parent_queue; + /* Padding for cache line separation */ + char pad[CACHE_LINE_BYTES - sizeof(struct vdo_work_queue *)]; + /* Lock protecting priority_map, num_priority_lists, started */ + spinlock_t lock; + /* Any (0 or 1) worker threads waiting for new work to do */ + wait_queue_head_t waiting_worker_threads; /* - * It shouldn't be a big deal if the same rotor gets used for multiple - * work queues. Any patterns that might develop are likely to be - * disrupted by random ordering of multiple work items and migration - * between cores, unless the load is so light as to be regular in - * ordering of tasks and the threads are confined to individual cores; - * with a load that light we won't care. + * Hack to reduce wakeup calls if the worker thread is running. See + * comments in workQueue.c. + * + * FIXME: There is a lot of redundancy with "first_wakeup", though, and + * the pair should be re-examined. */ - unsigned int rotor = this_cpu_inc_return(service_queue_rotor); - unsigned int index = rotor % queue->num_service_queues; + atomic_t idle; + /* Wait list for synchronization during worker thread startup */ + wait_queue_head_t start_waiters; + bool started; - return queue->service_queues[index]; -} + /* + * Timestamp (ns) from the submitting thread that decided to wake us + * up; also used as a flag to indicate whether a wakeup is needed. + * + * Written by submitting threads with atomic64_cmpxchg, and by the + * worker thread setting to 0. + * + * If the value is 0, the worker is probably asleep; the submitting + * thread stores a non-zero value and becomes responsible for calling + * wake_up on the worker thread. If the value is non-zero, either the + * worker is running or another thread has the responsibility for + * issuing the wakeup. + * + * The "sleep" mode has periodic wakeups and the worker thread may + * happen to wake up while a completion is being enqueued. If that + * happens, the wakeup may be unneeded but will be attempted anyway. + * + * So the return value from cmpxchg(first_wakeup,0,nonzero) can always + * be done, and will tell the submitting thread whether to issue the + * wakeup or not; cmpxchg is atomic, so no other synchronization is + * needed. 
+ * + * A timestamp is used rather than, say, 1, so that the worker thread + * could record stats on how long it takes to actually get the worker + * thread running. + * + * There is some redundancy between this and "idle" above. + */ + atomic64_t first_wakeup; + /* More padding for cache line separation */ + char pad2[CACHE_LINE_BYTES - sizeof(atomic64_t)]; + /* Last wakeup, in ns. */ + uint64_t most_recent_wakeup; +}; + +struct round_robin_work_queue { + struct vdo_work_queue common; + struct simple_work_queue **service_queues; + unsigned int num_service_queues; +}; -/** - * Find a simple work queue on which to operate. - * - * If the argument is already a simple work queue, use it. If it's a - * round-robin work queue, pick the next subordinate service queue and use it. - * - * @param queue a work queue (round-robin or simple) - * - * @return a simple work queue - **/ static inline struct simple_work_queue * -pick_simple_queue(struct vdo_work_queue *queue) +as_simple_work_queue(struct vdo_work_queue *queue) { - return (queue->round_robin_mode ? - next_service_queue(as_round_robin_work_queue(queue)) : - as_simple_work_queue(queue)); + return ((queue == NULL) ? + NULL : + container_of(queue, struct simple_work_queue, common)); } -// Processing normal work items. +static inline struct round_robin_work_queue * +as_round_robin_work_queue(struct vdo_work_queue *queue) +{ + return ((queue == NULL) ? + NULL : + container_of(queue, struct round_robin_work_queue, common)); +} -/** - * Scan the work queue's work item lists, and dequeue and return the next - * waiting work item, if any. +/* Processing normal completions. */ + +/* + * Dequeue and return the next waiting completion, if any. * * We scan the funnel queues from highest priority to lowest, once; there is - * therefore a race condition where a high-priority work item can be enqueued + * therefore a race condition where a high-priority completion can be enqueued * followed by a lower-priority one, and we'll grab the latter (but we'll catch * the high-priority item on the next call). If strict enforcement of * priorities becomes necessary, this function will need fixing. - * - * @param queue the work queue - * - * @return a work item pointer, or NULL - **/ -static struct vdo_work_item * -poll_for_work_item(struct simple_work_queue *queue) + */ +static struct vdo_completion * +poll_for_completion(struct simple_work_queue *queue) { - struct vdo_work_item *item = NULL; + struct vdo_completion *completion = NULL; int i; - for (i = READ_ONCE(queue->num_priority_lists) - 1; i >= 0; i--) { + for (i = queue->num_priority_lists - 1; i >= 0; i--) { struct funnel_queue_entry *link = funnel_queue_poll(queue->priority_lists[i]); if (link != NULL) { - item = container_of(link, - struct vdo_work_item, - work_queue_entry_link); + completion = container_of(link, + struct vdo_completion, + work_queue_entry_link); break; } } - return item; + return completion; } -/** - * Add a work item into the queue and wake the worker thread if it is waiting. 
- * - * @param queue The work queue - * @param item The work item to add - **/ -static void enqueue_work_queue_item(struct simple_work_queue *queue, - struct vdo_work_item *item) +static void enqueue_work_queue_completion(struct simple_work_queue *queue, + struct vdo_completion *completion) { - unsigned int priority; - - ASSERT_LOG_ONLY(item->my_queue == NULL, - "item %px (fn %px/%px) to enqueue (%px) is not already queued (%px)", - item, item->work, item->stats_function, queue, - item->my_queue); - if (ASSERT(item->action < WORK_QUEUE_ACTION_COUNT, - "action is in range for queue") != VDO_SUCCESS) { - item->action = 0; + ASSERT_LOG_ONLY(completion->my_queue == NULL, + "completion %px (fn %px) to enqueue (%px) is not already queued (%px)", + completion, + completion->callback, + queue, + completion->my_queue); + if (completion->priority == VDO_WORK_Q_DEFAULT_PRIORITY) { + completion->priority = queue->common.type->default_priority; } - priority = READ_ONCE(queue->priority_map[item->action]); - // Update statistics. - update_stats_for_enqueue(&queue->stats, item, priority); + if (ASSERT(completion->priority < queue->num_priority_lists, + "priority is in range for queue") != VDO_SUCCESS) { + completion->priority = 0; + } - item->my_queue = &queue->common; + completion->my_queue = &queue->common; - // Funnel queue handles the synchronization for the put. - funnel_queue_put(queue->priority_lists[priority], - &item->work_queue_entry_link); + /* Funnel queue handles the synchronization for the put. */ + funnel_queue_put(queue->priority_lists[completion->priority], + &completion->work_queue_entry_link); /* * Due to how funnel queue synchronization is handled (just atomic @@ -182,56 +211,40 @@ static void enqueue_work_queue_item(struct simple_work_queue *queue, atomic64_cmpxchg(&queue->first_wakeup, 0, ktime_get_ns()); - // Despite the name, there's a maximum of one thread in this list. + /* There's a maximum of one thread in this list. */ wake_up(&queue->waiting_worker_threads); } -/** - * Run any start hook that may be defined for the work queue. - * - * @param queue The work queue - **/ static void run_start_hook(struct simple_work_queue *queue) { - if (queue->type->start != NULL) { - queue->type->start(queue->private); + if (queue->common.type->start != NULL) { + queue->common.type->start(queue->private); } } -/** - * Run any finish hook that may be defined for the work queue. - * - * @param queue The work queue - **/ static void run_finish_hook(struct simple_work_queue *queue) { - if (queue->type->finish != NULL) { - queue->type->finish(queue->private); + if (queue->common.type->finish != NULL) { + queue->common.type->finish(queue->private); } } -/** - * Wait for the next work item to process, or until kthread_should_stop +/* + * Wait for the next completion to process, or until kthread_should_stop * indicates that it's time for us to shut down. * - * If kthread_should_stop says it's time to stop but we have pending work - * items, return a work item. + * If kthread_should_stop says it's time to stop but we have pending + * completions return a completion. * - * Update statistics relating to scheduler interactions. - * - * @param queue The work queue to wait on - * - * @return the next work item, or NULL to indicate shutdown is requested - **/ -static struct vdo_work_item * -wait_for_next_work_item(struct simple_work_queue *queue) + * Also update statistics relating to scheduler interactions. 
+ */ +static struct vdo_completion * +wait_for_next_completion(struct simple_work_queue *queue) { - struct vdo_work_item *item; + struct vdo_completion *completion; DEFINE_WAIT(wait); while (true) { - uint64_t time_before_schedule, schedule_time_ns, run_time_ns; - atomic64_set(&queue->first_wakeup, 0); prepare_to_wait(&queue->waiting_worker_threads, &wait, @@ -247,10 +260,10 @@ wait_for_next_work_item(struct simple_work_queue *queue) * little.) */ atomic_set(&queue->idle, 1); - smp_mb(); // store-load barrier between "idle" and funnel queue + smp_mb(); /* store-load barrier between "idle" and funnel queue */ - item = poll_for_work_item(queue); - if (item != NULL) { + completion = poll_for_completion(queue); + if (completion != NULL) { break; } @@ -264,151 +277,64 @@ wait_for_next_work_item(struct simple_work_queue *queue) break; } - time_before_schedule = ktime_get_ns(); - run_time_ns = time_before_schedule - queue->most_recent_wakeup; - // These stats are read from other threads, but are only - // written by this thread. - WRITE_ONCE(queue->stats.waits, queue->stats.waits + 1); - WRITE_ONCE(queue->stats.run_time, - queue->stats.run_time + run_time_ns); - schedule(); - queue->most_recent_wakeup = ktime_get_ns(); - schedule_time_ns = (queue->most_recent_wakeup - - time_before_schedule); - enter_histogram_sample(queue->stats.schedule_time_histogram, - schedule_time_ns / 1000); - /* * Check again before resetting first_wakeup for more accurate * stats. If it was a spurious wakeup, continue looping. */ - item = poll_for_work_item(queue); - if (item != NULL) { + completion = poll_for_completion(queue); + if (completion != NULL) { break; } } - if (item != NULL) { - uint64_t first_wakeup = atomic64_read(&queue->first_wakeup); - /* - * We sometimes register negative wakeup latencies without this - * fencing. Whether it's forcing full serialization between the - * read of first_wakeup and the "rdtsc" that might be used - * depending on the clock source that helps, or some extra - * nanoseconds of delay covering for high-resolution clocks not - * being quite in sync between CPUs, is not yet clear. - */ - smp_rmb(); - if (first_wakeup != 0) { - enter_histogram_sample( - queue->stats.wakeup_latency_histogram, - (ktime_get_ns() - first_wakeup) / 1000); - enter_histogram_sample( - queue->stats.wakeup_queue_length_histogram, - count_vdo_work_items_pending( - &queue->stats.work_item_stats)); - } - } finish_wait(&queue->waiting_worker_threads, &wait); atomic_set(&queue->idle, 0); - return item; + return completion; } -/** - * Execute a work item from a work queue, and do associated bookkeeping. - * - * @param queue the work queue the item is from - * @param item the work item to run - **/ -static void process_work_item(struct simple_work_queue *queue, - struct vdo_work_item *item) +static void process_completion(struct simple_work_queue *queue, + struct vdo_completion *completion) { - uint64_t dequeue_time = update_stats_for_dequeue(&queue->stats, item); - // Save the index, so we can use it after the work function. 
- unsigned int index = item->stat_table_index; - - if (ASSERT(item->my_queue == &queue->common, - "item %px from queue %px marked as being in this queue (%px)", - item, queue, item->my_queue) == UDS_SUCCESS) { - item->my_queue = NULL; + if (ASSERT(completion->my_queue == &queue->common, + "completion %px from queue %px marked as being in this queue (%px)", + completion, + queue, + completion->my_queue) == UDS_SUCCESS) { + completion->my_queue = NULL; } - item->work(item); - // We just surrendered control of the work item; no more access. - item = NULL; - - update_vdo_work_item_stats_for_work_time(&queue->stats.work_item_stats, - index, - dequeue_time); + vdo_run_completion_callback(completion); } -/** - * Yield the CPU to the scheduler and update queue statistics accordingly. - * - * @param queue The active queue - **/ static void yield_to_scheduler(struct simple_work_queue *queue) { - unsigned int queue_length; - uint64_t run_time_ns, reschedule_time_ns; - uint64_t time_before_reschedule, time_after_reschedule; - struct vdo_work_queue_stats *stats = &queue->stats; - - /* - * Record the queue length we have *before* rescheduling. - * N.B.: We compute the pending count info here without any - * synchronization, but it's for stats reporting only, so being - * imprecise isn't too big a deal. - */ - queue_length = count_vdo_work_items_pending(&stats->work_item_stats); - - time_before_reschedule = ktime_get_ns(); cond_resched(); - time_after_reschedule = ktime_get_ns(); - - enter_histogram_sample(stats->reschedule_queue_length_histogram, - queue_length); - - run_time_ns = time_before_reschedule - queue->most_recent_wakeup; - enter_histogram_sample(stats->run_time_before_reschedule_histogram, - run_time_ns / 1000); - WRITE_ONCE(stats->run_time, stats->run_time + run_time_ns); - - reschedule_time_ns = time_after_reschedule - time_before_reschedule; - enter_histogram_sample(stats->reschedule_time_histogram, - reschedule_time_ns / 1000); - WRITE_ONCE(stats->reschedule_time, - stats->reschedule_time + reschedule_time_ns); - - queue->most_recent_wakeup = time_after_reschedule; + queue->most_recent_wakeup = ktime_get_ns(); } -/** - * Main loop of the work queue worker thread. - * - * Waits for work items and runs them, until told to stop. - * - * @param queue The work queue to run - **/ static void service_work_queue(struct simple_work_queue *queue) { run_start_hook(queue); while (true) { - struct vdo_work_item *item = poll_for_work_item(queue); - if (item == NULL) { - item = wait_for_next_work_item(queue); + struct vdo_completion *completion = poll_for_completion(queue); + + if (completion == NULL) { + completion = wait_for_next_completion(queue); } - if (item == NULL) { - // No work items but kthread_should_stop was triggered. + if (completion == NULL) { + /* + * No completions but kthread_should_stop() was + * triggered. + */ break; } - process_work_item(queue, item); + process_completion(queue, completion); /* * Be friendly to a CPU that has other work to do, if the @@ -423,20 +349,12 @@ static void service_work_queue(struct simple_work_queue *queue) run_finish_hook(queue); } -/** - * Initialize per-thread data for a new worker thread and run the work queue. - * Called in a new thread created by kthread_run(). - * - * @param ptr A pointer to the vdo_work_queue to run. 
- * - * @return 0 (indicating success to kthread_run()) - **/ static int work_queue_runner(void *ptr) { struct simple_work_queue *queue = ptr; unsigned long flags; - queue->stats.start_time = queue->most_recent_wakeup = ktime_get_ns(); + queue->most_recent_wakeup = ktime_get_ns(); spin_lock_irqsave(&queue->lock, flags); queue->started = true; @@ -448,27 +366,50 @@ static int work_queue_runner(void *ptr) return 0; } -// Preparing work items +/* Creation & teardown */ + +static void free_simple_work_queue(struct simple_work_queue *queue) +{ + unsigned int i; + + for (i = 0; i <= VDO_WORK_Q_MAX_PRIORITY; i++) { + free_funnel_queue(queue->priority_lists[i]); + } + UDS_FREE(queue->common.name); + UDS_FREE(queue); +} -/**********************************************************************/ -void setup_work_item(struct vdo_work_item *item, - vdo_work_function work, - void *stats_function, - unsigned int action) +static void free_round_robin_work_queue(struct round_robin_work_queue *queue) { - ASSERT_LOG_ONLY(item->my_queue == NULL, - "setup_work_item not called on enqueued work item"); - item->work = work; - item->stats_function = - ((stats_function == NULL) ? work : stats_function); - item->stat_table_index = 0; - item->action = action; - item->my_queue = NULL; + struct simple_work_queue **queue_table = queue->service_queues; + unsigned int count = queue->num_service_queues; + unsigned int i; + + queue->service_queues = NULL; + + for (i = 0; i < count; i++) { + free_simple_work_queue(queue_table[i]); + } + UDS_FREE(queue_table); + UDS_FREE(queue->common.name); + UDS_FREE(queue); } -// Creation & teardown +void free_work_queue(struct vdo_work_queue *queue) +{ + if (queue == NULL) { + return; + } + + finish_work_queue(queue); + + if (queue->round_robin_mode) { + free_round_robin_work_queue(as_round_robin_work_queue(queue)); + } else { + free_simple_work_queue(as_simple_work_queue(queue)); + } +} -/**********************************************************************/ static bool queue_started(struct simple_work_queue *queue) { unsigned long flags; @@ -481,78 +422,35 @@ static bool queue_started(struct simple_work_queue *queue) return started; } -/** - * Create a simple work queue with a worker thread. 
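/*
 * Editorial sketch (not part of the patch): what a queue type definition
 * using the fields referenced in this file might look like. The member
 * names .start, .finish, .max_priority, and .default_priority come from
 * the code in workQueue.c; the hook bodies and priority values are
 * invented for illustration.
 */
static void example_thread_start(void *context)
{
	/* Per-thread setup; receives the queue's private pointer. */
}

static void example_thread_finish(void *context)
{
	/* Per-thread teardown, run just before the worker thread exits. */
}

static const struct vdo_work_queue_type example_queue_type = {
	.start = example_thread_start,
	.finish = example_thread_finish,
	.max_priority = 2,
	.default_priority = 1,
};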
- * - * @param [in] thread_name_prefix The per-device prefix to use in - * thread names - * @param [in] name The queue name - * @param [in] owner The VDO owning the work queue - * @param [in] private Private data of the queue for use by work - * items or other queue-specific functions - * @param [in] type The work queue type defining the lifecycle - * functions, queue actions, priorities, and - * timeout behavior - * @param [out] queue_ptr Where to store the queue handle - * - * @return VDO_SUCCESS or an error code - **/ static int make_simple_work_queue(const char *thread_name_prefix, const char *name, - struct vdo *owner, + struct vdo_thread *owner, void *private, const struct vdo_work_queue_type *type, struct simple_work_queue **queue_ptr) { struct simple_work_queue *queue; - unsigned int num_priority_lists = 1; int i; struct task_struct *thread = NULL; + int result; - int result = UDS_ALLOCATE(1, - struct simple_work_queue, - "simple work queue", - &queue); + ASSERT_LOG_ONLY((type->max_priority <= VDO_WORK_Q_MAX_PRIORITY), + "queue priority count %u within limit %u", + type->max_priority, + VDO_WORK_Q_MAX_PRIORITY); + + result = UDS_ALLOCATE(1, + struct simple_work_queue, + "simple work queue", + &queue); if (result != UDS_SUCCESS) { return result; } - queue->type = type; queue->private = private; + queue->common.type = type; queue->common.owner = owner; - for (i = 0; i < WORK_QUEUE_ACTION_COUNT; i++) { - const struct vdo_work_queue_action *action = - &queue->type->action_table[i]; - unsigned int code, priority; - if (action->name == NULL) { - break; - } - code = action->code; - priority = action->priority; - - result = ASSERT( - code < WORK_QUEUE_ACTION_COUNT, - "invalid action code %u in work queue initialization", - code); - if (result != VDO_SUCCESS) { - UDS_FREE(queue); - return result; - } - result = ASSERT( - priority < WORK_QUEUE_PRIORITY_COUNT, - "invalid action priority %u in work queue initialization", - priority); - if (result != VDO_SUCCESS) { - UDS_FREE(queue); - return result; - } - queue->priority_map[code] = priority; - if (num_priority_lists <= priority) { - num_priority_lists = priority + 1; - } - } - result = uds_duplicate_string(name, "queue name", &queue->common.name); if (result != VDO_SUCCESS) { UDS_FREE(queue); @@ -563,24 +461,16 @@ static int make_simple_work_queue(const char *thread_name_prefix, init_waitqueue_head(&queue->start_waiters); spin_lock_init(&queue->lock); - queue->num_priority_lists = num_priority_lists; - for (i = 0; i < WORK_QUEUE_PRIORITY_COUNT; i++) { + queue->num_priority_lists = type->max_priority + 1; + for (i = 0; i < queue->num_priority_lists; i++) { result = make_funnel_queue(&queue->priority_lists[i]); if (result != UDS_SUCCESS) { free_simple_work_queue(queue); return result; } } - result = initialize_work_queue_stats(&queue->stats, NULL); - if (result != 0) { - uds_log_error("Cannot initialize statistics tracking: %d", - result); - free_simple_work_queue(queue); - return result; - } queue->started = false; - thread = kthread_run(work_queue_runner, queue, "%s:%s", @@ -608,11 +498,18 @@ static int make_simple_work_queue(const char *thread_name_prefix, return UDS_SUCCESS; } -/**********************************************************************/ +/** + * Create a work queue; if multiple threads are requested, completions will be + * distributed to them in round-robin fashion. + * + * Each queue is associated with a struct vdo_thread which has a single vdo + * thread id. 
Regardless of the actual number of queues and threads allocated + * here, code outside of the queue implementation will treat this as a single + * zone. + */ int make_work_queue(const char *thread_name_prefix, const char *name, - struct vdo *owner, - void *private, + struct vdo_thread *owner, const struct vdo_work_queue_type *type, unsigned int thread_count, void *thread_privates[], @@ -625,14 +522,15 @@ int make_work_queue(const char *thread_name_prefix, if (thread_count == 1) { struct simple_work_queue *simple_queue; - void *context = (thread_privates != NULL) ? thread_privates[0] : - private; + void *context = ((thread_privates != NULL) + ? thread_privates[0] + : NULL); result = make_simple_work_queue(thread_name_prefix, - name, - owner, - context, - type, - &simple_queue); + name, + owner, + context, + type, + &simple_queue); if (result == VDO_SUCCESS) { *queue_ptr = &simple_queue->common; } @@ -668,8 +566,9 @@ int make_work_queue(const char *thread_name_prefix, *queue_ptr = &queue->common; for (i = 0; i < thread_count; i++) { - void *context = (thread_privates != NULL) ? thread_privates[i] : - private; + void *context = ((thread_privates != NULL) + ? thread_privates[i] + : NULL); snprintf(thread_name, sizeof(thread_name), "%s%u", name, i); result = make_simple_work_queue(thread_name_prefix, thread_name, @@ -679,7 +578,7 @@ int make_work_queue(const char *thread_name_prefix, &queue->service_queues[i]); if (result != VDO_SUCCESS) { queue->num_service_queues = i; - // Destroy previously created subordinates. + /* Destroy previously created subordinates. */ free_work_queue(UDS_FORGET(*queue_ptr)); return result; } @@ -689,36 +588,27 @@ int make_work_queue(const char *thread_name_prefix, return VDO_SUCCESS; } -/** - * Shut down a simple work queue's worker thread. - * - * @param queue The work queue to shut down - **/ static void finish_simple_work_queue(struct simple_work_queue *queue) { if (queue->thread == NULL) { return; } - // Reduces (but does not eliminate) the chance of the sysfs support - // reporting the pid even after the thread is gone. + /* + * Reduces (but does not eliminate) the chance of the sysfs support + * reporting the pid even after the thread is gone. + */ WRITE_ONCE(queue->thread_pid, 0); - // Tells the worker thread to shut down and waits for it to exit. + /* Tells the worker thread to shut down and waits for it to exit. */ kthread_stop(queue->thread); queue->thread = NULL; } -/** - * Shut down a round-robin work queue's service queues. - * - * @param queue The work queue to shut down - **/ static void finish_round_robin_work_queue(struct round_robin_work_queue *queue) { struct simple_work_queue **queue_table = queue->service_queues; unsigned int count = queue->num_service_queues; - unsigned int i; for (i = 0; i < count; i++) { @@ -726,7 +616,9 @@ static void finish_round_robin_work_queue(struct round_robin_work_queue *queue) } } -/**********************************************************************/ +/* + * No enqueueing of completions should be done once this function is called. 
+ */ void finish_work_queue(struct vdo_work_queue *queue) { if (queue == NULL) { @@ -734,71 +626,16 @@ void finish_work_queue(struct vdo_work_queue *queue) } if (queue->round_robin_mode) { - finish_round_robin_work_queue(as_round_robin_work_queue(queue)); + struct round_robin_work_queue *rrqueue + = as_round_robin_work_queue(queue); + finish_round_robin_work_queue(rrqueue); } else { finish_simple_work_queue(as_simple_work_queue(queue)); } } -/** - * Tear down a simple work queue, and decrement the kobject reference - * count on it. - * - * @param queue The work queue - **/ -static void free_simple_work_queue(struct simple_work_queue *queue) -{ - unsigned int i; - - for (i = 0; i < WORK_QUEUE_PRIORITY_COUNT; i++) { - free_funnel_queue(queue->priority_lists[i]); - } - cleanup_work_queue_stats(&queue->stats); - UDS_FREE(queue->common.name); - UDS_FREE(queue); -} - -/** - * Tear down a round-robin work queue and its service queues, and - * decrement the kobject reference count on it. - * - * @param queue The work queue - **/ -static void free_round_robin_work_queue(struct round_robin_work_queue *queue) -{ - struct simple_work_queue **queue_table = queue->service_queues; - unsigned int count = queue->num_service_queues; - unsigned int i; - - queue->service_queues = NULL; - - for (i = 0; i < count; i++) { - free_simple_work_queue(queue_table[i]); - } - UDS_FREE(queue_table); - UDS_FREE(queue->common.name); - UDS_FREE(queue); -} - -/**********************************************************************/ -void free_work_queue(struct vdo_work_queue *queue) -{ - if (queue == NULL) { - return; - } - - finish_work_queue(queue); +/* Debugging dumps */ - if (queue->round_robin_mode) { - free_round_robin_work_queue(as_round_robin_work_queue(queue)); - } else { - free_simple_work_queue(as_simple_work_queue(queue)); - } -} - -// Debugging dumps - -/**********************************************************************/ static void dump_simple_work_queue(struct simple_work_queue *queue) { const char *thread_status = "no threads"; @@ -809,22 +646,23 @@ static void dump_simple_work_queue(struct simple_work_queue *queue) thread_status = atomic_read(&queue->idle) ? "idle" : "running"; } - uds_log_info("workQ %px (%s) %u entries %llu waits, %s (%c)", + uds_log_info("workQ %px (%s) %s (%c)", &queue->common, queue->common.name, - count_vdo_work_items_pending(&queue->stats.work_item_stats), - READ_ONCE(queue->stats.waits), thread_status, task_state_report); - log_vdo_work_item_stats(&queue->stats.work_item_stats); - log_work_queue_stats(queue); - - // ->lock spin lock status? - // ->waiting_worker_threads wait queue status? anyone waiting? + /* + * ->lock spin lock status? + * ->waiting_worker_threads wait queue status? anyone waiting? + */ } -/**********************************************************************/ +/** + * Write to the buffer some info about the completion, for logging. Since the + * common use case is dumping info about a lot of completions to syslog all at + * once, the format favors brevity over readability. 
+ */ void dump_work_queue(struct vdo_work_queue *queue) { if (queue->round_robin_mode) { @@ -841,78 +679,159 @@ void dump_work_queue(struct vdo_work_queue *queue) } } -/**********************************************************************/ -void dump_work_item_to_buffer(struct vdo_work_item *item, +static void get_function_name(void *pointer, char *buffer, - size_t length) + size_t buffer_length) +{ + if (pointer == NULL) { + /* + * Format "%ps" logs a null pointer as "(null)" with a bunch of + * leading spaces. We sometimes use this when logging lots of + * data; don't be so verbose. + */ + strncpy(buffer, "-", buffer_length); + } else { + /* + * Use a non-const array instead of a string literal below to + * defeat gcc's format checking, which doesn't understand that + * "%ps" actually does support a precision spec in Linux kernel + * code. + */ + static char truncated_function_name_format_string[] = "%.*ps"; + char *space; + + snprintf(buffer, + buffer_length, + truncated_function_name_format_string, + buffer_length - 1, + pointer); + + space = strchr(buffer, ' '); + + if (space != NULL) { + *space = '\0'; + } + } +} + +void dump_completion_to_buffer(struct vdo_completion *completion, + char *buffer, + size_t length) { size_t current_length = scnprintf(buffer, length, "%.*s/", TASK_COMM_LEN, - item->my_queue == NULL ? "-" : item->my_queue->name); + (completion->my_queue == NULL ? + "-" : + completion->my_queue->name)); if (current_length < length) { - vdo_get_function_name(item->stats_function, - buffer + current_length, - length - current_length); + get_function_name((void *) completion->callback, + buffer + current_length, + length - current_length); } } -// Work submission - -/**********************************************************************/ +/* Completion submission */ +/* + * If the completion has a timeout that has already passed, the timeout handler + * function may be invoked by this function. + */ void enqueue_work_queue(struct vdo_work_queue *queue, - struct vdo_work_item *item) + struct vdo_completion *completion) { - enqueue_work_queue_item(pick_simple_queue(queue), item); -} + /* + * Convert the provided generic vdo_work_queue to the simple_work_queue + * to actually queue on. + */ + struct simple_work_queue *simple_queue = NULL; -// Misc + if (!queue->round_robin_mode) { + simple_queue = as_simple_work_queue(queue); + } else { + struct round_robin_work_queue *round_robin + = as_round_robin_work_queue(queue); + /* + * It shouldn't be a big deal if the same rotor gets used for + * multiple work queues. Any patterns that might develop are + * likely to be disrupted by random ordering of multiple + * completions and migration between cores, unless the load is + * so light as to be regular in ordering of tasks and the + * threads are confined to individual cores; with a load that + * light we won't care. + */ + unsigned int rotor = this_cpu_inc_return(service_queue_rotor); + unsigned int index = rotor % round_robin->num_service_queues; -/** - * Return the work queue pointer recorded at initialization time in - * the work-queue stack handle initialized on the stack of the current - * thread, if any. - * - * @return the work queue pointer, or NULL - **/ + simple_queue = round_robin->service_queues[index]; + } + + enqueue_work_queue_completion(simple_queue, completion); +} + +/* Misc */ + +/* + * Return the work queue pointer recorded at initialization time in the + * work-queue stack handle initialized on the stack of the current thread, if + * any. 
+ */ static struct simple_work_queue *get_current_thread_work_queue(void) { /* - * The kthreadd process has the PF_KTHREAD flag set but a null - * "struct kthread" pointer, which breaks the (initial) - * implementation of kthread_func, which assumes the pointer - * is always non-null. This matters if memory reclamation is - * triggered and causes calls into VDO that get - * here. [VDO-5194] + * In interrupt context, if a vdo thread is what got interrupted, the + * calls below will find the queue for the thread which was + * interrupted. However, the interrupted thread may have been + * processing a completion, in which case starting to process another + * would violate our concurrency assumptions. + */ + if (in_interrupt()) { + return NULL; + } + +#if LINUX_VERSION_CODE < KERNEL_VERSION(5,13,0) + /* + * The kthreadd process has the PF_KTHREAD flag set but a null "struct + * kthread" pointer, which breaks the (initial) implementation of + * kthread_func, which assumes the pointer is always non-null. This + * matters if memory reclamation is triggered and causes calls into VDO + * that get here. [VDO-5194] * * There might also be a similar reclamation issue in the - * usermodehelper code path before exec is called, and/or - * kthread setup when allocating the kthread struct itself. + * usermodehelper code path before exec is called, and/or kthread setup + * when allocating the kthread struct itself. * - * So we check for the null pointer first. The kthread code - * overloads the set_child_tid field to use for its pointer in - * PF_KTHREAD processes. (If PF_KTHREAD is clear, kthread_func - * will return null anyway so we needn't worry about that + * So we check for the null pointer first, on older kernels. The + * kthread code initially overloaded the set_child_tid field to use for + * its pointer in PF_KTHREAD processes. (If PF_KTHREAD is clear, + * kthread_func will return null anyway so we needn't worry about that * case.) * - * FIXME: When submitting upstream, make sure kthread_func is - * fixed instead, and drop this check. + * This bug was fixed in the 5.13 kernel release, but the 5.17 kernel + * release changed the task structure field used such that this + * workaround will break things on newer kernels. It shows up as a null + * pointer returned for the current work queue even when running in the + * work queue thread. + * + * Any backports of the 5.13 fix to custom pre-5.13 kernels should have + * no problem with this. Backports of the 5.17 change to 5.13 and later + * should be okay with this #if check; backports to pre-5.13 will need + * further protection. */ if (current->set_child_tid == NULL) { return NULL; } +#endif if (kthread_func(current) != work_queue_runner) { - // Not a VDO workQueue thread. + /* Not a VDO work queue thread. */ return NULL; } return kthread_data(current); } -/**********************************************************************/ struct vdo_work_queue *get_current_work_queue(void) { struct simple_work_queue *queue = get_current_thread_work_queue(); @@ -920,16 +839,23 @@ struct vdo_work_queue *get_current_work_queue(void) return (queue == NULL) ? 
NULL : &queue->common; } -/**********************************************************************/ -struct vdo *get_work_queue_owner(struct vdo_work_queue *queue) +struct vdo_thread *get_work_queue_owner(struct vdo_work_queue *queue) { return queue->owner; } -/**********************************************************************/ +/** + * Returns the private data for the current thread's work queue, or NULL if + * none or if the current thread is not a work queue thread. + */ void *get_work_queue_private_data(void) { struct simple_work_queue *queue = get_current_thread_work_queue(); return (queue != NULL) ? queue->private : NULL; } + +bool vdo_work_queue_type_is(struct vdo_work_queue *queue, + const struct vdo_work_queue_type *type) { + return (queue->type == type); +} diff --git a/vdo/workQueue.h b/vdo/workQueue.h index 4c229bab..3d38e196 100644 --- a/vdo/workQueue.h +++ b/vdo/workQueue.h @@ -1,272 +1,56 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ /* * Copyright Red Hat - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/sulfur/src/c++/vdo/kernel/workQueue.h#17 $ */ #ifndef VDO_WORK_QUEUE_H #define VDO_WORK_QUEUE_H -#include #include /* for TASK_COMM_LEN */ -#include "kernelTypes.h" -#include "util/funnelQueue.h" +#include "funnel-queue.h" -enum { - MAX_QUEUE_NAME_LEN = TASK_COMM_LEN, - /** Maximum number of action definitions per work queue type */ - WORK_QUEUE_ACTION_COUNT = 8, - /** Number of priority values available */ - WORK_QUEUE_PRIORITY_COUNT = 4, -}; - -struct vdo_work_item { - /** Entry link for lock-free work queue */ - struct funnel_queue_entry work_queue_entry_link; - /** Function to be called */ - vdo_work_function work; - /** Optional alternate function for display in queue stats */ - void *stats_function; - /** - * An index into the statistics table; filled in by workQueueStats code - */ - unsigned int stat_table_index; - /** - * The action code given to setup_work_item, from which a priority will - * be determined. - **/ - unsigned int action; - /** - * The work queue in which the item is enqueued, or NULL if not - * enqueued. - */ - struct vdo_work_queue *my_queue; - /** - * Time of enqueueing, in ns, for recording queue (waiting) time stats - */ - uint64_t enqueue_time; -}; +#include "kernel-types.h" +#include "types.h" -/** - * Table entries defining an action. - * - * Actions are intended to distinguish general classes of activity for - * prioritization purposes, but not necessarily to indicate specific work - * functions. They are indicated to setup_work_item numerically, using an - * enumerator defined per kind of work queue -- bio submission work queue - * actions use bio_q_action, cpu actions use cpu_q_action, etc. 
For example, - * for the CPU work queues, data compression can be prioritized separately - * from final cleanup processing of a vio or from dedupe verification; base - * code threads prioritize all VIO callback invocation the same, but separate - * from sync or heartbeat operations. The bio acknowledgement work queue, on - * the other hand, only does one thing, so it only defines one action code. - * - * Action codes values must be small integers, 0 through - * WORK_QUEUE_ACTION_COUNT-1, and should not be duplicated for a queue type. - * - * A table of vdo_work_queue_action entries embedded in struct - * vdo_work_queue_type specifies the name, code, and priority for each type - * of action in the work queue. The table can have at most - * WORK_QUEUE_ACTION_COUNT entries, but a NULL name indicates an earlier end - * to the table. - * - * Priorities may be specified as values from 0 through - * WORK_QUEUE_PRIORITY_COUNT-1, higher values indicating higher priority. - * Priorities are just strong suggestions; it's possible for a lower-priority - * work item scheduled right after a high-priority one to be run first, if the - * worker thread happens to be scanning its queues at just the wrong moment, - * but the high-priority item will be picked up next. - * - * Internally, the priorities in this table are used to initialize another - * table in the constructed work queue object, and in internal builds, - * device-mapper messages can be sent to change the priority for an action, - * identified by name, in a running VDO device. Doing so does not affect the - * priorities for other devices, or for future VDO device creation. - **/ -struct vdo_work_queue_action { - /** Name of the action */ - char *name; - - /** The action code (per-type enum) */ - unsigned int code; - - /** The initial priority for this action */ - unsigned int priority; +enum { + MAX_VDO_WORK_QUEUE_NAME_LEN = TASK_COMM_LEN, }; -/** - * Static attributes of a work queue that are fixed at compile time - * for a given call site. (Attributes that may be computed at run time - * are passed as separate arguments.) - **/ struct vdo_work_queue_type { - /** A function to call in the new thread before servicing requests */ void (*start)(void *); - - /** A function to call in the new thread when shutting down */ void (*finish)(void *); - - /** Table of actions for this work queue */ - struct vdo_work_queue_action action_table[WORK_QUEUE_ACTION_COUNT]; + enum vdo_completion_priority max_priority; + enum vdo_completion_priority default_priority; }; -/** - * Create a work queue. - * - * If multiple threads are requested, work items will be distributed to them in - * round-robin fashion. 
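/*
 * Illustrative sketch, not part of the patch: a userspace model of the
 * "priorities are just strong suggestions" behavior described in the removed
 * comment above. A worker drains one FIFO per priority level, always scanning
 * from the highest priority downward, so a lower-priority item can only slip
 * ahead once before the high-priority item is picked up. The real per-priority
 * lists are lock-free funnel queues; plain arrays stand in for them here, and
 * capacity checking is omitted for brevity.
 */
#include <stdio.h>

#define PRIORITY_COUNT 4
#define QUEUE_CAPACITY 16

/* One FIFO of work-item ids per priority level. */
static int lists[PRIORITY_COUNT][QUEUE_CAPACITY];
static int heads[PRIORITY_COUNT], tails[PRIORITY_COUNT];

static void enqueue(unsigned int priority, int item)
{
	lists[priority][tails[priority]++] = item;
}

/* Scan from the highest priority down; return -1 if every list is empty. */
static int poll_next_item(void)
{
	int priority;

	for (priority = PRIORITY_COUNT - 1; priority >= 0; priority--) {
		if (heads[priority] < tails[priority])
			return lists[priority][heads[priority]++];
	}
	return -1;
}

int main(void)
{
	int item;

	enqueue(0, 100);	/* low priority */
	enqueue(3, 200);	/* high priority */
	enqueue(1, 300);

	while ((item = poll_next_item()) != -1)
		printf("ran item %d\n", item);
	return 0;
}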
- * - * @param [in] thread_name_prefix The per-device prefix to use in thread - * names - * @param [in] name The queue name - * @param [in] owner The VDO owning the work queue - * @param [in] private Private data of the queue for use by work - * items or other queue-specific functions - * @param [in] thread_privates If non-NULL, an array of separate private - * data pointers, one for each service thread, - * to use instead of sharing 'private' - * @param [in] type The work queue type defining the lifecycle - * functions, queue actions, priorities, and - * timeout behavior - * @param [in] thread_count Number of service threads to set up - * @param [out] queue_ptr Where to store the queue handle - * - * @return VDO_SUCCESS or an error code - **/ int make_work_queue(const char *thread_name_prefix, const char *name, - struct vdo *owner, - void *private, + struct vdo_thread *owner, const struct vdo_work_queue_type *type, unsigned int thread_count, void *thread_privates[], struct vdo_work_queue **queue_ptr); -/** - * Set up the fields of a work queue item. - * - * Before the first setup call (setup_work_item), the work item must - * have been initialized to all-zero. Resetting a previously-used work - * item does not require another memset. - * - * The action code is typically defined in a work-queue-type-specific - * enumeration; see the description of struct vdo_work_queue_action. - * - * @param item The work item to initialize - * @param work The function pointer to execute - * @param stats_function A function pointer to record for stats, or NULL - * @param action Action code, for determination of priority - **/ -void setup_work_item(struct vdo_work_item *item, - vdo_work_function work, - void *stats_function, - unsigned int action); - -/** - * Add a work item to a work queue. - * - * If the work item has a timeout that has already passed, the timeout - * handler function may be invoked at this time. - * - * @param queue The queue handle - * @param item The work item to be processed - **/ void enqueue_work_queue(struct vdo_work_queue *queue, - struct vdo_work_item *item); + struct vdo_completion *completion); -/** - * Shut down a work queue's worker thread. - * - * Alerts the worker thread that it should shut down, and then waits - * for it to do so. - * - * There should not be any new enqueueing of work items done once this - * function is called. - * - * @param queue The work queue to shut down (may be NULL) - **/ void finish_work_queue(struct vdo_work_queue *queue); -/** - * Free a work queue. - * - * @param queue The work queue to free - **/ void free_work_queue(struct vdo_work_queue *queue); -/** - * Print work queue state and statistics to the kernel log. - * - * @param queue The work queue to examine - **/ void dump_work_queue(struct vdo_work_queue *queue); -/** - * Write to the buffer some info about the work item, for logging. - * Since the common use case is dumping info about a lot of work items - * to syslog all at once, the format favors brevity over readability. 
- * - * @param item The work item - * @param buffer The message buffer to fill in - * @param length The length of the message buffer - **/ -void dump_work_item_to_buffer(struct vdo_work_item *item, - char *buffer, - size_t length); - +void dump_completion_to_buffer(struct vdo_completion *completion, + char *buffer, + size_t length); -/** - * Checks whether two work items have the same action codes - * - * @param item1 The first item - * @param item2 The second item - * - * @return TRUE if the actions are the same, FALSE otherwise - */ -static inline bool are_work_item_actions_equal(struct vdo_work_item *item1, - struct vdo_work_item *item2) -{ - return item1->action == item2->action; -} - -/** - * Returns the private data for the current thread's work queue. - * - * @return The private data pointer, or NULL if none or if the current - * thread is not a work queue thread. - **/ void *get_work_queue_private_data(void); - -/** - * Returns the work queue pointer for the current thread, if any. - * - * @return The work queue pointer or NULL - **/ struct vdo_work_queue *get_current_work_queue(void); +struct vdo_thread *get_work_queue_owner(struct vdo_work_queue *queue); -/** - * Returns the VDO that owns the work queue. - * - * @param queue The work queue - * - * @return The owner pointer supplied at work queue creation - **/ -struct vdo *get_work_queue_owner(struct vdo_work_queue *queue); +bool __must_check +vdo_work_queue_type_is(struct vdo_work_queue *queue, + const struct vdo_work_queue_type *type); #endif /* VDO_WORK_QUEUE_H */ diff --git a/vdo/workQueueInternals.h b/vdo/workQueueInternals.h deleted file mode 100644 index 98765515..00000000 --- a/vdo/workQueueInternals.h +++ /dev/null @@ -1,180 +0,0 @@ -/* - * Copyright Red Hat - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/sulfur/src/c++/vdo/kernel/workQueueInternals.h#8 $ - */ - -#ifndef WORK_QUEUE_INTERNALS_H -#define WORK_QUEUE_INTERNALS_H - -#include -#include -#include -#include -#include - -#include "workItemStats.h" -#include "workQueueStats.h" - -struct vdo_work_item_list { - struct vdo_work_item *tail; -}; - -/** - * Work queue definition. - * - * There are two types of work queues: simple, with one worker thread, and - * round-robin, which uses a group of the former to do the work, and assigns - * work to them in round-robin fashion (roughly). Externally, both are - * represented via the same common sub-structure, though there's actually not a - * great deal of overlap between the two types internally. - **/ -struct vdo_work_queue { - /** Name of just the work queue (e.g., "cpuQ12") */ - char *name; - /** - * Whether this is a round-robin work queue or a simple (one-thread) - * work queue. 
- **/ - bool round_robin_mode; - /** A handle to a sysfs tree for reporting stats and other info */ - struct kobject kobj; - /** The VDO owning this work queue */ - struct vdo *owner; -}; - -struct simple_work_queue { - /** Common work queue bits */ - struct vdo_work_queue common; - /** A copy of .thread->pid, for safety in the sysfs support */ - pid_t thread_pid; - /** - * Number of priorities actually used, so we don't keep re-checking - * unused funnel queues. - **/ - unsigned int num_priority_lists; - /** - * Map from action codes to priorities. - * - * This mapping can be changed at run time in internal builds, for - * tuning purposes. - **/ - uint8_t priority_map[WORK_QUEUE_ACTION_COUNT]; - /** The funnel queues */ - struct funnel_queue *priority_lists[WORK_QUEUE_PRIORITY_COUNT]; - /** The kernel thread */ - struct task_struct *thread; - /** Life cycle functions, etc */ - const struct vdo_work_queue_type *type; - /** Opaque private data pointer, defined by higher level code */ - void *private; - /** In a subordinate work queue, a link back to the round-robin parent - */ - struct vdo_work_queue *parent_queue; - /** Padding for cache line separation */ - char pad[CACHE_LINE_BYTES - sizeof(struct vdo_work_queue *)]; - /** - * Lock protecting priority_map, num_priority_lists, started - */ - spinlock_t lock; - /** Any worker threads (zero or one) waiting for new work to do */ - wait_queue_head_t waiting_worker_threads; - /** - * Hack to reduce wakeup calls if the worker thread is running. See - * comments in workQueue.c. - * - * There is a lot of redundancy with "first_wakeup", though, and the - * pair should be re-examined. - **/ - atomic_t idle; - /** Wait list for synchronization during worker thread startup */ - wait_queue_head_t start_waiters; - /** Worker thread status (boolean) */ - bool started; - - /** - * Timestamp (ns) from the submitting thread that decided to wake us - * up; also used as a flag to indicate whether a wakeup is needed. - * - * Written by submitting threads with atomic64_cmpxchg, and by the - * worker thread setting to 0. - * - * If the value is 0, the worker is probably asleep; the submitting - * thread stores a non-zero value and becomes responsible for calling - * wake_up on the worker thread. If the value is non-zero, either the - * worker is running or another thread has the responsibility for - * issuing the wakeup. - * - * The "sleep" mode has periodic wakeups and the worker thread may - * happen to wake up while a work item is being enqueued. If that - * happens, the wakeup may be unneeded but will be attempted anyway. - * - * So the return value from cmpxchg(first_wakeup,0,nonzero) can always - * be done, and will tell the submitting thread whether to issue the - * wakeup or not; cmpxchg is atomic, so no other synchronization is - * needed. - * - * A timestamp is used rather than, say, 1, so that the worker thread - * can record stats on how long it takes to actually get the worker - * thread running. - * - * There is some redundancy between this and "idle" above. 
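/*
 * Illustrative sketch, not part of the patch: a userspace model of the
 * first_wakeup handshake described in the removed comment above. The
 * submitter that swaps a timestamp into the zeroed slot becomes responsible
 * for issuing the wake-up; later submitters see a non-zero value and skip it.
 * The worker clears the slot when it runs and can use the stored timestamp to
 * measure wake-up latency. C11 atomics and clock_gettime() stand in for
 * atomic64_cmpxchg and ktime_get_ns().
 */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <time.h>

static _Atomic uint64_t first_wakeup;

/* A stand-in for ktime_get_ns(). */
static uint64_t now_ns(void)
{
	struct timespec ts;

	clock_gettime(CLOCK_MONOTONIC, &ts);
	return (uint64_t)ts.tv_sec * 1000000000u + ts.tv_nsec;
}

/* Submitter side: true only for the thread that changed the slot from 0. */
static bool submitter_should_wake_worker(void)
{
	uint64_t expected = 0;

	return atomic_compare_exchange_strong(&first_wakeup, &expected,
					      now_ns());
}

/* Worker side: clear the slot and report how long the wake-up took. */
static void worker_record_wakeup(void)
{
	uint64_t requested = atomic_exchange(&first_wakeup, 0);

	if (requested != 0)
		printf("wakeup latency: %llu ns\n",
		       (unsigned long long)(now_ns() - requested));
}

int main(void)
{
	printf("first submitter wakes worker: %d\n",
	       submitter_should_wake_worker());
	printf("second submitter wakes worker: %d\n",
	       submitter_should_wake_worker());
	worker_record_wakeup();
	return 0;
}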
- **/ - atomic64_t first_wakeup; - /** Padding for cache line separation */ - char pad2[CACHE_LINE_BYTES - sizeof(atomic64_t)]; - /** Scheduling and work-function statistics */ - struct vdo_work_queue_stats stats; - /** Last time (ns) the scheduler actually woke us up */ - uint64_t most_recent_wakeup; -}; - -struct round_robin_work_queue { - /** Common work queue bits */ - struct vdo_work_queue common; - /** Simple work queues, for actually getting stuff done */ - struct simple_work_queue **service_queues; - /** Number of subordinate work queues */ - unsigned int num_service_queues; -}; - -static inline struct simple_work_queue * -as_simple_work_queue(struct vdo_work_queue *queue) -{ - return ((queue == NULL) ? - NULL : - container_of(queue, struct simple_work_queue, common)); -} - -static inline const struct simple_work_queue * -as_const_simple_work_queue(const struct vdo_work_queue *queue) -{ - return ((queue == NULL) ? - NULL : - container_of(queue, struct simple_work_queue, common)); -} - -static inline struct round_robin_work_queue * -as_round_robin_work_queue(struct vdo_work_queue *queue) -{ - return ((queue == NULL) ? - NULL : - container_of(queue, struct round_robin_work_queue, common)); -} - -#endif // WORK_QUEUE_INTERNALS_H diff --git a/vdo/workQueueStats.c b/vdo/workQueueStats.c deleted file mode 100644 index df7da77c..00000000 --- a/vdo/workQueueStats.c +++ /dev/null @@ -1,160 +0,0 @@ -/* - * Copyright Red Hat - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. 
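/*
 * Illustrative sketch, not part of the patch: a userspace rendering of the
 * container_of() downcast that as_simple_work_queue() and the other helpers
 * above rely on. Because struct vdo_work_queue is embedded as the 'common'
 * member, subtracting that member's offset from a pointer to it recovers the
 * enclosing simple or round-robin queue. The two-field structs here are
 * simplified stand-ins for the real ones.
 */
#include <stddef.h>
#include <stdio.h>

#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

struct vdo_work_queue {
	const char *name;
	int round_robin_mode;
};

struct simple_work_queue {
	struct vdo_work_queue common;	/* embedded common part */
	void *private;
};

static struct simple_work_queue *
as_simple_work_queue(struct vdo_work_queue *queue)
{
	return (queue == NULL) ?
		NULL :
		container_of(queue, struct simple_work_queue, common);
}

int main(void)
{
	struct simple_work_queue simple = {
		.common = { .name = "cpuQ0", .round_robin_mode = 0 },
		.private = NULL,
	};
	struct vdo_work_queue *common = &simple.common;

	printf("recovered outer struct: %s\n",
	       (as_simple_work_queue(common) == &simple) ? "yes" : "no");
	return 0;
}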
- * - * $Id: //eng/vdo-releases/sulfur/src/c++/vdo/kernel/workQueueStats.c#7 $ - */ - -#include "workQueueStats.h" - -#include "memoryAlloc.h" - -#include "logger.h" -#include "workItemStats.h" -#include "workQueueInternals.h" - -/**********************************************************************/ -int initialize_work_queue_stats(struct vdo_work_queue_stats *stats, - struct kobject *queue_kobject) -{ - initialize_vdo_work_item_stats(&stats->work_item_stats); - - if (queue_kobject == NULL) { - return 0; - } - - stats->queue_time_histogram = - make_logarithmic_histogram(queue_kobject, "queue_time", - "Queue Time", "work items", - "wait time", "microseconds", 9); - if (stats->queue_time_histogram == NULL) { - return -ENOMEM; - } - - stats->reschedule_queue_length_histogram = - make_logarithmic_histogram(queue_kobject, - "reschedule_queue_length", - "Reschedule Queue Length", "calls", - "queued work items", NULL, 4); - if (stats->reschedule_queue_length_histogram == NULL) { - return -ENOMEM; - } - - stats->reschedule_time_histogram = - make_logarithmic_histogram(queue_kobject, "reschedule_time", - "Reschedule Time", "calls", - "sleep interval", "microseconds", - 9); - if (stats->reschedule_time_histogram == NULL) { - return -ENOMEM; - } - - stats->run_time_before_reschedule_histogram = - make_logarithmic_histogram(queue_kobject, - "run_time_before_reschedule", - "Run Time Before Reschedule", - "calls", "run time", "microseconds", - 9); - if (stats->run_time_before_reschedule_histogram == NULL) { - return -ENOMEM; - } - - stats->schedule_time_histogram = - make_logarithmic_histogram(queue_kobject, "schedule_time", - "Schedule Time", "calls", - "sleep interval", "microseconds", - 9); - if (stats->schedule_time_histogram == NULL) { - return -ENOMEM; - } - - stats->wakeup_latency_histogram = - make_logarithmic_histogram(queue_kobject, "wakeup_latency", - "Wakeup Latency", "wakeups", - "latency", "microseconds", 9); - if (stats->wakeup_latency_histogram == NULL) { - return -ENOMEM; - } - - stats->wakeup_queue_length_histogram = - make_logarithmic_histogram(queue_kobject, - "wakeup_queue_length", - "Wakeup Queue Length", "wakeups", - "queued work items", NULL, 4); - if (stats->wakeup_queue_length_histogram == NULL) { - return -ENOMEM; - } - - return 0; -} - -/**********************************************************************/ -void cleanup_work_queue_stats(struct vdo_work_queue_stats *stats) -{ - free_histogram(UDS_FORGET(stats->queue_time_histogram)); - free_histogram(UDS_FORGET(stats->reschedule_queue_length_histogram)); - free_histogram(UDS_FORGET(stats->reschedule_time_histogram)); - free_histogram(UDS_FORGET(stats->run_time_before_reschedule_histogram)); - free_histogram(UDS_FORGET(stats->schedule_time_histogram)); - free_histogram(UDS_FORGET(stats->wakeup_latency_histogram)); - free_histogram(UDS_FORGET(stats->wakeup_queue_length_histogram)); -} - -/**********************************************************************/ -void log_work_queue_stats(const struct simple_work_queue *queue) -{ - uint64_t total_processed, runtime_ns = 0; - unsigned long runtime_ms, ns_per_work_item = 0; - - if (queue->thread != NULL) { - runtime_ns = READ_ONCE(queue->thread->se.sum_exec_runtime); - } - runtime_ms = runtime_ns / 1000; - - total_processed = - count_vdo_work_items_processed(&queue->stats.work_item_stats); - if (total_processed > 0) { - ns_per_work_item = runtime_ns / total_processed; - } - - uds_log_info("workQ %px (%s) thread cpu usage %lu.%06lus, %llu tasks, %lu.%03luus/task", - queue, - 
queue->common.name, - runtime_ms / 1000000, - runtime_ms % 1000000, - total_processed, - ns_per_work_item / 1000, - ns_per_work_item % 1000); -} - -/**********************************************************************/ -ssize_t format_run_time_stats(const struct vdo_work_queue_stats *stats, - char *buffer) -{ - // Get snapshots of all three at approximately the same time. - uint64_t start_time = stats->start_time; - uint64_t run_time = READ_ONCE(stats->run_time); - uint64_t reschedule_time = READ_ONCE(stats->reschedule_time); - uint64_t now, lifetime; - - smp_rmb(); // rdtsc barrier - now = ktime_get_ns(); - lifetime = now - start_time; - - return sprintf(buffer, "%llu %llu %llu\n", - lifetime, run_time, reschedule_time); -} diff --git a/vdo/workQueueStats.h b/vdo/workQueueStats.h deleted file mode 100644 index 31164149..00000000 --- a/vdo/workQueueStats.h +++ /dev/null @@ -1,166 +0,0 @@ -/* - * Copyright Red Hat - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/sulfur/src/c++/vdo/kernel/workQueueStats.h#5 $ - */ - -#ifndef WORK_QUEUE_STATS_H -#define WORK_QUEUE_STATS_H - -#include "workQueue.h" - -#include "timeUtils.h" - -#include "histogram.h" -#include "workItemStats.h" - -// Defined in workQueueInternals.h after inclusion of workQueueStats.h. -struct simple_work_queue; - -/* - * Tracking statistics. - * - * Cache line contention issues: - * - * In work_item_stats, there are read-only fields accessed mostly by - * work submitters, then fields updated by the work submitters (for - * which there will be contention), then fields rarely if ever updated - * (more than two cache lines' worth), then fields updated only by the - * worker thread. The trailing fields here are updated only by the - * worker thread. - */ -struct vdo_work_queue_stats { - // Per-work-function counters and optional nanosecond timing data - struct vdo_work_item_stats work_item_stats; - // How often we go to sleep waiting for work - uint64_t waits; - - // Run time data, for monitoring utilization levels. - - // Thread start time, from which we can compute lifetime thus far. - uint64_t start_time; - /* - * Time the thread has not been blocked waiting for a new work item, - * nor in cond_resched(). This will include time the thread has been - * blocked by some kernel function invoked by the work functions - * (e.g., waiting for socket buffer space). - * - * This is not redundant with run_time_before_reschedule_histogram, as - * the latter doesn't count run time not followed by a cond_resched - * call. - */ - uint64_t run_time; - // Time the thread has been suspended via cond_resched(). - // (Duplicates data hidden within reschedule_time_histogram.) 
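/*
 * Illustrative sketch, not part of the patch: how the three nanosecond
 * counters emitted by format_run_time_stats() above ("lifetime run_time
 * reschedule_time") might be turned into utilization figures by a consumer of
 * the sysfs "times" attribute. The sample values are made up.
 */
#include <stdint.h>
#include <stdio.h>

/* Values as read from a work queue's sysfs "times" attribute, in ns. */
static const uint64_t lifetime_ns = 120000000000ull;	   /* 120 s since start */
static const uint64_t run_time_ns = 30000000000ull;	   /* 30 s running work */
static const uint64_t reschedule_time_ns = 1500000000ull; /* 1.5 s in cond_resched */

int main(void)
{
	double busy = 100.0 * (double)run_time_ns / (double)lifetime_ns;
	double rescheduled =
		100.0 * (double)reschedule_time_ns / (double)lifetime_ns;
	double waiting = 100.0 - busy - rescheduled;

	printf("busy %.1f%%, in cond_resched %.1f%%, waiting for work %.1f%%\n",
	       busy, rescheduled, waiting);
	return 0;
}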
- uint64_t reschedule_time; - - // Histogram of the queue times of work items (microseconds) - struct histogram *queue_time_histogram; - // How busy we are when cond_resched is called - struct histogram *reschedule_queue_length_histogram; - // Histogram of the time cond_resched makes us sleep for (microseconds) - struct histogram *reschedule_time_histogram; - // Histogram of the run time between cond_resched calls (microseconds) - struct histogram *run_time_before_reschedule_histogram; - // Histogram of the time schedule_timeout lets us sleep for - // (microseconds) - struct histogram *schedule_time_histogram; - // How long from thread wakeup call to thread actually running - // (microseconds) - struct histogram *wakeup_latency_histogram; - // How much work is pending by the time we start running - struct histogram *wakeup_queue_length_histogram; -}; - -/** - * Initialize the work queue's statistics tracking. - * - * @param stats The statistics structure - * @param queue_kobject The sysfs directory kobject for the work queue - * - * @return 0 or a kernel error code - **/ -int __must_check -initialize_work_queue_stats(struct vdo_work_queue_stats *stats, - struct kobject *queue_kobject); - -/** - * Tear down any allocated storage or objects for statistics tracking. - * - * @param stats The statistics structure - **/ -void cleanup_work_queue_stats(struct vdo_work_queue_stats *stats); - -/** - * Update the work queue statistics tracking to note the enqueueing of - * a work item. - * - * @param stats The statistics structure - * @param item The work item being enqueued - * @param priority The priority of the work item - **/ -static inline void update_stats_for_enqueue(struct vdo_work_queue_stats *stats, - struct vdo_work_item *item, - int priority) -{ - update_vdo_work_item_stats_for_enqueue(&stats->work_item_stats, item, - priority); - item->enqueue_time = ktime_get_ns(); -} - -/** - * Update the work queue statistics tracking to note the dequeueing of - * a work item. - * - * @param stats The statistics structure - * @param item The work item being enqueued - * - * @return the current time when this was called - **/ -static inline uint64_t -update_stats_for_dequeue(struct vdo_work_queue_stats *stats, - struct vdo_work_item *item) -{ - uint64_t dequeue_time = ktime_get_ns(); - uint64_t elapsed = dequeue_time - item->enqueue_time; - enter_histogram_sample(stats->queue_time_histogram, elapsed / 1000); - item->enqueue_time = 0; - return dequeue_time; -} - -/** - * Write the work queue's accumulated statistics to the kernel log. - * - * The queue pointer is needed so that its address and name can be - * logged along with the statistics. - * - * @param queue The work queue - **/ -void log_work_queue_stats(const struct simple_work_queue *queue); - -/** - * Format the thread lifetime, run time, and suspend time into a - * supplied buffer for reporting via sysfs. 
- * - * @param [in] stats The stats structure containing the run-time info - * @param [out] buffer The buffer in which to report the info - **/ -ssize_t format_run_time_stats(const struct vdo_work_queue_stats *stats, - char *buffer); - -#endif // WORK_QUEUE_STATS_H diff --git a/vdo/workQueueSysfs.c b/vdo/workQueueSysfs.c deleted file mode 100644 index df452038..00000000 --- a/vdo/workQueueSysfs.c +++ /dev/null @@ -1,201 +0,0 @@ -/* - * Copyright Red Hat - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/sulfur/src/c++/vdo/kernel/workQueueSysfs.c#5 $ - */ - -#include "workQueueSysfs.h" - -#include - -#include "logger.h" -#include "memoryAlloc.h" - -#include "workQueueInternals.h" - -struct work_queue_attribute { - struct attribute attr; - ssize_t (*show)(const struct vdo_work_queue *queue, char *buf); - ssize_t (*store)(struct vdo_work_queue *queue, - const char *buf, - size_t length); -}; - -/**********************************************************************/ -static ssize_t name_show(const struct vdo_work_queue *queue, char *buf) -{ - return sprintf(buf, "%s\n", queue->name); -} - -/**********************************************************************/ -static ssize_t pid_show(const struct vdo_work_queue *queue, char *buf) -{ - const struct simple_work_queue *simple_queue = - as_const_simple_work_queue(queue); - return sprintf(buf, "%d\n", READ_ONCE(simple_queue->thread_pid)); -} - -/**********************************************************************/ -static ssize_t times_show(const struct vdo_work_queue *queue, char *buf) -{ - return format_run_time_stats(&as_const_simple_work_queue(queue)->stats, - buf); -} - -/**********************************************************************/ -static ssize_t type_show(const struct vdo_work_queue *queue, char *buf) -{ - strcpy(buf, queue->round_robin_mode ? 
"round-robin\n" : "simple\n"); - return strlen(buf); -} - -/**********************************************************************/ -static ssize_t work_functions_show(const struct vdo_work_queue *queue, - char *buf) -{ - const struct simple_work_queue *simple_queue = - as_const_simple_work_queue(queue); - return format_vdo_work_item_stats(&simple_queue->stats.work_item_stats, - buf, - PAGE_SIZE); -} - -/**********************************************************************/ -static struct work_queue_attribute name_attr = { - .attr = { - .name = "name", - .mode = 0444, - }, - .show = name_show, -}; - -/**********************************************************************/ -static struct work_queue_attribute pid_attr = { - .attr = { - .name = "pid", - .mode = 0444, - }, - .show = pid_show, -}; - -/**********************************************************************/ -static struct work_queue_attribute times_attr = { - .attr = { - .name = "times", - .mode = 0444 - }, - .show = times_show, -}; - -/**********************************************************************/ -static struct work_queue_attribute type_attr = { - .attr = { - .name = "type", - .mode = 0444, - }, - .show = type_show, -}; - -/**********************************************************************/ -static struct work_queue_attribute work_functions_attr = { - .attr = { - .name = "work_functions", - .mode = 0444, - }, - .show = work_functions_show, -}; - -/**********************************************************************/ -static struct attribute *simple_work_queue_attrs[] = { - &name_attr.attr, - &pid_attr.attr, - ×_attr.attr, - &type_attr.attr, - &work_functions_attr.attr, - NULL, -}; - -/**********************************************************************/ -static struct attribute *round_robin_work_queue_attrs[] = { - &name_attr.attr, - &type_attr.attr, - NULL, -}; - -/**********************************************************************/ -static ssize_t work_queue_attr_show(struct kobject *kobj, - struct attribute *attr, char *buf) -{ - struct work_queue_attribute *wq_attr = - container_of(attr, struct work_queue_attribute, attr); - struct vdo_work_queue *queue = container_of(kobj, - struct vdo_work_queue, - kobj); - if (wq_attr->show == NULL) { - return -EINVAL; - } - return wq_attr->show(queue, buf); -} - -/**********************************************************************/ -static ssize_t work_queue_attr_store(struct kobject *kobj, - struct attribute *attr, const char *buf, - size_t length) -{ - struct work_queue_attribute *wq_attr = - container_of(attr, struct work_queue_attribute, attr); - struct vdo_work_queue *queue = - container_of(kobj, struct vdo_work_queue, kobj); - if (wq_attr->store == NULL) { - return -EINVAL; - } - return wq_attr->store(queue, buf, length); -} - -/**********************************************************************/ -static struct sysfs_ops work_queue_sysfs_ops = { - .show = work_queue_attr_show, - .store = work_queue_attr_store, -}; - -/**********************************************************************/ -static void work_queue_release(struct kobject *kobj) -{ - struct vdo_work_queue *queue = - container_of(kobj, struct vdo_work_queue, kobj); - UDS_FREE(queue->name); - if (queue->round_robin_mode) { - UDS_FREE(as_round_robin_work_queue(queue)); - } else { - UDS_FREE(as_simple_work_queue(queue)); - } -} - -/**********************************************************************/ -struct kobj_type simple_work_queue_kobj_type = { - .default_attrs = simple_work_queue_attrs, 
- .release = work_queue_release, - .sysfs_ops = &work_queue_sysfs_ops, -}; - -/**********************************************************************/ -struct kobj_type round_robin_work_queue_kobj_type = { - .default_attrs = round_robin_work_queue_attrs, - .release = work_queue_release, - .sysfs_ops = &work_queue_sysfs_ops, -}; diff --git a/vdo/workQueueSysfs.h b/vdo/workQueueSysfs.h deleted file mode 100644 index 3dd37150..00000000 --- a/vdo/workQueueSysfs.h +++ /dev/null @@ -1,30 +0,0 @@ -/* - * Copyright Red Hat - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/sulfur/src/c++/vdo/kernel/workQueueSysfs.h#2 $ - */ - -#ifndef WORK_QUEUE_SYSFS_H -#define WORK_QUEUE_SYSFS_H - -#include - -extern struct kobj_type round_robin_work_queue_kobj_type; -extern struct kobj_type simple_work_queue_kobj_type; - -#endif // WORK_QUEUE_SYSFS_H