[807] Do not stop initialization if disk is not available #810

Open · wants to merge 6 commits into master
2 changes: 2 additions & 0 deletions CHANGELOG.md

@@ -17,6 +17,8 @@ Bob versions changelog
 - Make iouring optional (#567)

 #### Fixed
+- Do not propagate an initialization error when it is determined that the disk is not available (#807)
+- Fix crash in hardware metrics when disk is not available (#812)
 - Fix memory leak due to prometheus lib (#788)

 #### Updated
9 changes: 5 additions & 4 deletions bob-backend/src/pearl/disk_controller.rs

@@ -305,7 +305,7 @@ impl DiskController
             .cloned()
     }

-    async fn get_or_create_pearl(&self, operation: &Operation) -> BackendResult<Group> {
+    async fn get_or_create_alien_pearl(&self, operation: &Operation) -> BackendResult<Group> {
         trace!("try get alien pearl, operation {:?}", operation);
         // block for read lock: it should be dropped before write lock is acquired (otherwise - deadlock)
         {
@@ -361,14 +361,15 @@ impl DiskController
     }

     async fn groups_run_initialized(&self, pp: impl Hooks) -> AnyResult<()> {
-        let res = {
+        let mut res = {
             let _permit = self.run_sem.acquire().await.expect("Semaphore is closed");
             Self::run_groups(self.groups.clone(), pp).await
         };
         if let Err(e) = &res {
             error!("Can't run groups on disk {:?} (reason: {})", self.disk, e);
             if !Self::is_work_dir_available(self.disk.path()) {
                 self.change_state(GroupsState::NotReady).await;
+                res = Ok(());
             }
         } else {
             self.change_state(GroupsState::Ready).await;
@@ -384,7 +385,7 @@
         data: &BobData,
     ) -> Result<(), Error> {
         if *self.state.read().await == GroupsState::Ready {
-            let vdisk_group = self.get_or_create_pearl(&op).await;
+            let vdisk_group = self.get_or_create_alien_pearl(&op).await;
             match vdisk_group {
                 Ok(group) => match group.put(key, data, StartTimestampConfig::new(false)).await {
                     Err(e) => Err(self.process_error(e).await),
@@ -569,7 +570,7 @@ impl DiskController
             }
         } else {
             // we should create group only when force_delete == true
-            match self.get_or_create_pearl(&op).await {
+            match self.get_or_create_alien_pearl(&op).await {
                 Ok(group) => group,
                 Err(err) => {
                     error!(
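The behavioral core of the PR is the `groups_run_initialized` hunk: when `run_groups` fails and the work directory turns out to be unavailable, the controller records `GroupsState::NotReady` and replaces the error with `Ok(())`, so node initialization continues with the disk in a degraded state. Below is a minimal self-contained sketch of that pattern; `GroupsState`, `run_groups`, and `is_work_dir_available` here are simplified stand-ins, not the real Bob types.

```rust
use std::path::Path;

#[derive(Debug, PartialEq)]
enum GroupsState {
    NotReady,
    Ready,
}

// Stand-in for the real check; the production code probes the disk's work dir.
fn is_work_dir_available(path: &str) -> bool {
    Path::new(path).exists()
}

// Stand-in for Self::run_groups: here it always fails, as it would on a dead disk.
fn run_groups(disk_path: &str) -> Result<(), String> {
    Err(format!("cannot open pearl storage on {}", disk_path))
}

fn groups_run_initialized(disk_path: &str) -> Result<GroupsState, String> {
    let mut res = run_groups(disk_path);
    let mut state = GroupsState::Ready;
    if let Err(e) = &res {
        eprintln!("Can't run groups on disk {:?} (reason: {})", disk_path, e);
        if !is_work_dir_available(disk_path) {
            // The disk itself is gone: remember the degraded state and swallow
            // the error so the rest of node initialization can proceed.
            state = GroupsState::NotReady;
            res = Ok(());
        }
    }
    res.map(|()| state)
}

fn main() {
    // With a missing mount point, initialization degrades instead of aborting.
    assert_eq!(
        groups_run_initialized("/mnt/detached_disk"),
        Ok(GroupsState::NotReady)
    );
}
```

If the work directory is reachable but the groups still fail to start, the error propagates exactly as before; only the "disk not available" case is downgraded.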
4 changes: 2 additions & 2 deletions bob-backend/src/pearl/group.rs

@@ -71,7 +71,7 @@ impl Group
         let config = self.settings.config();
         let new_holders = config
             .try_multiple_times_async(
-                || self.read_vdisk_directory(),
+                || self.init_and_read_vdisk_directory(),
                 "can't create pearl holders",
                 self.settings.config().fail_retry_timeout(),
             )
@@ -567,7 +567,7 @@ impl Group
         self.create_pearl_holder(start_timestamp, &hash)
     }

-    pub async fn read_vdisk_directory(&self) -> BackendResult<Vec<Holder>> {
+    pub async fn init_and_read_vdisk_directory(&self) -> BackendResult<Vec<Holder>> {
        Utils::check_or_create_directory(&self.directory_path).await?;

        let mut holders = vec![];
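The rename makes explicit that this function both initializes the vdisk directory and reads it, and the holder-creation path retries it through `try_multiple_times_async`. That helper's real signature lives on the pearl config and is not shown in this diff; the sketch below is a hypothetical free-function version, given only to illustrate the retry pattern being relied on.

```rust
use std::future::Future;
use std::time::Duration;

// Hypothetical free-function version of the retry pattern; `attempts` must be
// at least 1. The real helper is a method on the pearl config.
async fn try_multiple_times_async<T, E, F, Fut>(
    mut f: F,
    description: &str,
    attempts: usize,
    retry_timeout: Duration,
) -> Result<T, E>
where
    F: FnMut() -> Fut,
    Fut: Future<Output = Result<T, E>>,
    E: std::fmt::Display,
{
    let mut last_err = None;
    for attempt in 1..=attempts {
        match f().await {
            Ok(value) => return Ok(value),
            Err(e) => {
                eprintln!("{} (attempt {}/{}): {}", description, attempt, attempts, e);
                last_err = Some(e);
                tokio::time::sleep(retry_timeout).await;
            }
        }
    }
    Err(last_err.expect("attempts must be >= 1"))
}

#[tokio::main]
async fn main() {
    // Always failing: after the retries are exhausted the last error is returned.
    let res: Result<(), String> = try_multiple_times_async(
        || async { Err("disk not ready".to_string()) },
        "can't create pearl holders",
        3,
        Duration::from_millis(10),
    )
    .await;
    assert!(res.is_err());
}
```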
13 changes: 5 additions & 8 deletions bob-backend/src/pearl/holder.rs

@@ -430,14 +430,11 @@ impl Holder
     }

     async fn init_holder(&self) -> AnyResult<Storage<Key>> {
-        let f = || Utils::check_or_create_directory(&self.inner.disk_path);
-        self.inner.config
-            .try_multiple_times_async(
-                f,
-                &format!("cannot check path: {:?}", self.inner.disk_path),
-                self.inner.config.fail_retry_timeout(),
-            )
-            .await?;
+        Utils::check_or_create_directory(&self.inner.disk_path).await
+            .map_err(|err| {
+                error!("cannot create holder dir: {:?}. Error: {:?}", self.inner.disk_path, err);
+                err
+            })?;

         self.inner.config
             .try_multiple_times_async(
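Directory creation in `init_holder` is deliberately taken out of the retry loop: if the holder directory cannot be created because the disk is gone, retrying only delays startup, so the new code makes a single attempt, logs, and propagates the error for the disk controller to classify. A synchronous sketch of that log-and-rethrow idiom, with `std::fs` and `io::Error` standing in for the crate's async `Utils` helper and `Error` type:

```rust
use std::fs;
use std::io;
use std::path::Path;

// Leaf-only creation: succeeds if the directory already exists, fails otherwise.
fn check_or_create_directory(path: &Path) -> io::Result<()> {
    match fs::create_dir(path) {
        Ok(()) => Ok(()),
        Err(e) if e.kind() == io::ErrorKind::AlreadyExists => Ok(()),
        Err(e) => Err(e),
    }
}

fn init_holder_dir(disk_path: &Path) -> io::Result<()> {
    // One attempt only: log the failure and hand the error up to the caller,
    // which decides whether the whole disk has disappeared.
    check_or_create_directory(disk_path).map_err(|err| {
        eprintln!("cannot create holder dir: {:?}. Error: {:?}", disk_path, err);
        err
    })
}

fn main() {
    // On a path whose parent is missing, this fails immediately instead of retrying.
    assert!(init_holder_dir(Path::new("/definitely/missing/holder")).is_err());
}
```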
4 changes: 1 addition & 3 deletions bob-backend/src/pearl/settings.rs

@@ -158,7 +158,7 @@ impl Settings
     ) -> BackendResult<Group> {
         let path = self.alien_path(vdisk_id, &remote_node_name);

-        Utils::check_or_create_directory(&path).await?;
+        Utils::check_or_create_all_directories(&path, Some(&self.alien_folder)).await?;

         let disk_name = self
             .config
@@ -178,8 +178,6 @@
     }

     pub async fn get_all_subdirectories(path: &Path) -> BackendResult<Vec<DirEntry>> {
-        Utils::check_or_create_directory(path).await?;
-
         let mut dir = read_dir(path).await.map_err(|e| {
             let msg = format!("couldn't process path: {:?}, error: {:?} ", path, e);
             error!("{}", msg);
2 changes: 2 additions & 0 deletions bob-backend/src/pearl/tests.rs

@@ -20,6 +20,8 @@ async fn create_backend(node_config: &str, cluster_config: &str) -> BackendResul
     let node = NodeConfig::get_from_string(node_config, &cluster).unwrap();
     debug!("node: {:?}", node);

+    let disk = node.disks().lock().expect("lock error")[0].path().to_owned();
+    std::fs::create_dir_all(&disk).expect("dir creation error");
     let mapper = Arc::new(Virtual::new(&node, &cluster));
     debug!("mapper: {:?}", mapper);
     PearlBackend::new(mapper, &node).await
77 changes: 63 additions & 14 deletions bob-backend/src/pearl/utils.rs

@@ -21,26 +21,75 @@ impl StartTimestampConfig {
 }

 impl Utils {
+    /// Creates directory. Parent directory should exist
     pub async fn check_or_create_directory(path: &Path) -> BackendResult<()> {
         if path.exists() {
-            trace!("directory: {:?} exists", path);
-        } else {
-            let dir = path
-                .to_str()
-                .ok_or_else(|| Error::storage("invalid some path, check vdisk or disk names"))?;
-
-            create_dir_all(&path)
-                .await
-                .map(|_| info!("create directory: {}", dir))
-                .map_err(|e| match e.kind() {
-                    IOErrorKind::PermissionDenied | IOErrorKind::Other => {
-                        Error::possible_disk_disconnection()
-                    }
-                    _ => Error::storage(format!("cannot create directory: {}, error: {}", dir, e)),
-                })?;
-            info!("dir created: {}", path.display());
-        }
-        Ok(())
-    }
+            trace!("directory: {} exists", path.display());
+            return Ok(());
+        }
+
+        let dir = path
+            .to_str()
+            .ok_or_else(|| Error::storage("invalid some path, check vdisk or disk names"))?;
+
+        match tokio::fs::create_dir(&path).await {
+            Ok(_) => {
+                info!("dir created: {}", path.display());
+                Ok(())
+            },
+            Err(err) => {
+                match err.kind() {
+                    IOErrorKind::AlreadyExists => {
+                        trace!("directory: {} exists", path.display());
+                        Ok(())
+                    },
+                    IOErrorKind::NotFound => {
+                        Err(Error::storage(format!("cannot create directory, because part of the path is not exist: '{}', error: {}", dir, err)))
+                    },
+                    IOErrorKind::PermissionDenied | IOErrorKind::Other => {
+                        Err(Error::possible_disk_disconnection())
+                    },
+                    _ => {
+                        Err(Error::storage(format!("cannot create directory: '{}', error: {}", dir, err)))
+                    }
+                }
+            }
+        }
+    }
+
+    /// Creates directory and all its parents. If `base_path` is specified, then it should exist before creation started
+    pub async fn check_or_create_all_directories(path: &Path, base_path: Option<&Path>) -> BackendResult<()> {
+        if path.exists() {
+            trace!("directory: {:?} exists", path);
+            return Ok(());
+        }
+
+        if let Some(base_path) = base_path {
+            if !base_path.exists() {
+                return Err(Error::storage(format!("cannot create directory, because base path is not exist. Dir: '{}', Base path: '{}'", path.display(), base_path.display())));
+            }
+        }
+
+        let dir = path
+            .to_str()
+            .ok_or_else(|| Error::storage("invalid some path, check vdisk or disk names"))?;
+
+        match tokio::fs::create_dir_all(&path).await {
+            Ok(_) => {
+                info!("dir created: {}", path.display());
+                Ok(())
+            },
+            Err(err) => {
+                match err.kind() {
+                    IOErrorKind::PermissionDenied | IOErrorKind::Other => {
+                        Err(Error::possible_disk_disconnection())
+                    },
+                    _ => {
+                        Err(Error::storage(format!("cannot create directory: {}, error: {}", dir, err)))
+                    }
+                }
+            }
+        }
     }

     pub async fn drop_pearl_lock_file(path: &Path) -> BackendResult<()> {
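The split into two helpers carries the main safety idea: `check_or_create_directory` now creates only the leaf (a missing parent surfaces as `NotFound`, while `PermissionDenied`/`Other` map to `possible_disk_disconnection`), whereas `check_or_create_all_directories` may build a whole chain but refuses when the supplied base path is absent, which stops Bob from silently recreating an alien tree under an unmounted mount point. A self-contained sketch of that base-path guard, with `std::fs` and `String` errors standing in for the crate's async I/O and `Error` type:

```rust
use std::fs;
use std::path::Path;

// Sketch of the base-path guard; String stands in for the crate's Error type.
fn check_or_create_all_directories(path: &Path, base_path: Option<&Path>) -> Result<(), String> {
    if path.exists() {
        return Ok(());
    }
    // Refuse to build the chain if its base is gone: on an unmounted disk this
    // turns "silently recreate the tree under the mount point" into an error.
    if let Some(base) = base_path {
        if !base.exists() {
            return Err(format!(
                "cannot create directory, base path does not exist: dir '{}', base '{}'",
                path.display(),
                base.display()
            ));
        }
    }
    fs::create_dir_all(path).map_err(|e| format!("cannot create '{}': {}", path.display(), e))
}

fn main() {
    let base = std::env::temp_dir().join("bob_alien_example"); // hypothetical base dir
    let _ = fs::remove_dir_all(&base); // start from a clean slate
    let nested = base.join("node2").join("vdisk_7");

    // Fails while the base folder is missing,
    assert!(check_or_create_all_directories(&nested, Some(&base)).is_err());
    // and succeeds once it exists.
    fs::create_dir_all(&base).expect("create base");
    assert!(check_or_create_all_directories(&nested, Some(&base)).is_ok());
}
```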
23 changes: 17 additions & 6 deletions bob/src/hw_metrics_collector.rs

@@ -37,19 +37,30 @@ impl HWMetricsCollector {
     }

     fn collect_used_disks(disks: &[DiskPath]) -> HashMap<PathBuf, DiskName> {
-        System::new_all()
+        System::new_with_specifics(RefreshKind::new().with_disks_list().with_disks())
             .disks()
             .iter()
             .filter_map(|d| {
                 let path = d.mount_point();
+                let path_metadata = match path.metadata() {
+                    Ok(meta) => meta,
+                    Err(err) => {
+                        warn!("Can't get metadata from OS for disk '{}' returned by 'sysinfo': {:?}", path.display(), err);
+                        return None;
+                    }
+                };
                 disks
                     .iter()
                     .find(move |dp| {
-                        let dp_md = Path::new(dp.path())
-                            .metadata()
-                            .expect("Can't get metadata from OS");
-                        let p_md = path.metadata().expect("Can't get metadata from OS");
-                        p_md.dev() == dp_md.dev()
+                        match Path::new(dp.path()).metadata() {
+                            Ok(dp_metadata) => {
+                                path_metadata.dev() == dp_metadata.dev()
+                            },
+                            Err(err) => {
+                                warn!("Can't get metadata from OS for disk '{}' specified in cluster config: {:?}", dp.path(), err);
+                                false
+                            }
+                        }
                     })
                     .map(|config_disk| {
                         let diskpath = path.to_str().expect("Not UTF-8").to_owned();
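The matching itself is by device id: a configured disk path and a sysinfo mount point refer to the same filesystem exactly when their metadata reports the same `st_dev`. The fix replaces the `expect` calls with warn-and-skip, so a detached disk can no longer panic the metrics collector. A Unix-only sketch of that comparison; `same_filesystem` is an illustrative name, not Bob's API:

```rust
use std::os::unix::fs::MetadataExt;
use std::path::Path;

// Two paths live on the same filesystem iff their st_dev values match.
// Returns None instead of panicking when either path cannot be stat'd,
// mirroring the warn-and-skip behaviour of the fixed collector.
fn same_filesystem(a: &Path, b: &Path) -> Option<bool> {
    let a_md = a.metadata().ok()?;
    let b_md = b.metadata().ok()?;
    Some(a_md.dev() == b_md.dev())
}

fn main() {
    // Both calls stay panic-free, even for a path that does not exist.
    println!("{:?}", same_filesystem(Path::new("/"), Path::new("/home")));
    println!("{:?}", same_filesystem(Path::new("/"), Path::new("/definitely/missing")));
}
```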