From 2a2d4d0f379f9a809a546bcfd77eb89a0fa1aeda Mon Sep 17 00:00:00 2001 From: Sergei Petrunia Date: Tue, 22 Dec 2020 14:46:20 -0800 Subject: [PATCH 01/25] Apply the changes from: PS-5501 : Re-license PerconaFT 'locktree' to Apache V2 (#7801) Summary: commit d5178f513c0b4144a5ac9358ec0f6a3b54a28e76 Author: George O. Lorch III Date: Tue Mar 19 12:18:40 2019 -0700 PS-5501 : Re-license PerconaFT 'locktree' to Apache V2 - Fixed some incomplete relicensed files from previous round. - Added missing license text to some. - Relicensed more files to Apache V2 that locktree depends on. Pull Request resolved: https://github.com/facebook/rocksdb/pull/7801 Reviewed By: jay-zhuang Differential Revision: D25682430 Pulled By: cheng-chang fbshipit-source-id: deb8a0de3e76f3638672997bfbd300e2fffbe5f5 --- .../lock/range/range_tree/lib/ft/comparator.h | 14 ++++++ .../lock/range/range_tree/lib/ft/ft-status.h | 14 ++++++ .../range/range_tree/lib/locktree/keyrange.cc | 1 + .../range/range_tree/lib/locktree/keyrange.h | 1 + .../range_tree/lib/locktree/lock_request.cc | 1 + .../range_tree/lib/locktree/lock_request.h | 1 + .../range/range_tree/lib/locktree/locktree.cc | 1 + .../range/range_tree/lib/locktree/locktree.h | 1 + .../range/range_tree/lib/locktree/manager.cc | 1 + .../range_tree/lib/locktree/range_buffer.cc | 1 + .../range_tree/lib/locktree/range_buffer.h | 1 + .../range/range_tree/lib/locktree/treenode.cc | 1 + .../range/range_tree/lib/locktree/treenode.h | 1 + .../range_tree/lib/locktree/txnid_set.cc | 1 + .../range/range_tree/lib/locktree/txnid_set.h | 1 + .../lock/range/range_tree/lib/locktree/wfg.cc | 1 + .../lock/range/range_tree/lib/locktree/wfg.h | 1 + .../range/range_tree/lib/portability/memory.h | 14 ++++++ .../range_tree/lib/portability/toku_atomic.h | 14 ++++++ .../lib/portability/toku_instrumentation.h | 46 +++++++++++++++++++ .../lib/portability/toku_portability.h | 14 ++++++ .../range_tree/lib/portability/toku_pthread.h | 14 ++++++ .../lib/portability/toku_race_tools.h 
| 14 ++++++ .../range_tree/lib/portability/toku_time.h | 14 ++++++ .../lock/range/range_tree/lib/util/dbt.h | 14 ++++++ .../range_tree/lib/util/growable_array.h | 1 + .../range/range_tree/lib/util/memarena.cc | 14 ++++++ .../lock/range/range_tree/lib/util/memarena.h | 14 ++++++ .../lock/range/range_tree/lib/util/omt.h | 1 + .../lock/range/range_tree/lib/util/omt_impl.h | 1 + .../range_tree/lib/util/partitioned_counter.h | 14 ++++++ .../lock/range/range_tree/lib/util/status.h | 14 ++++++ 32 files changed, 246 insertions(+) diff --git a/utilities/transactions/lock/range/range_tree/lib/ft/comparator.h b/utilities/transactions/lock/range/range_tree/lib/ft/comparator.h index 6f4a823db..718efc623 100644 --- a/utilities/transactions/lock/range/range_tree/lib/ft/comparator.h +++ b/utilities/transactions/lock/range/range_tree/lib/ft/comparator.h @@ -31,6 +31,20 @@ Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved. You should have received a copy of the GNU Affero General Public License along with PerconaFT. If not, see . + +---------------------------------------- + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
======= */ #ident \ diff --git a/utilities/transactions/lock/range/range_tree/lib/ft/ft-status.h b/utilities/transactions/lock/range/range_tree/lib/ft/ft-status.h index 095c044a0..1b4511172 100644 --- a/utilities/transactions/lock/range/range_tree/lib/ft/ft-status.h +++ b/utilities/transactions/lock/range/range_tree/lib/ft/ft-status.h @@ -32,6 +32,20 @@ Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved. You should have received a copy of the GNU Affero General Public License along with PerconaFT. If not, see . + +---------------------------------------- + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. ======= */ #ident \ diff --git a/utilities/transactions/lock/range/range_tree/lib/locktree/keyrange.cc b/utilities/transactions/lock/range/range_tree/lib/locktree/keyrange.cc index c4b2a1258..e50ace5a9 100644 --- a/utilities/transactions/lock/range/range_tree/lib/locktree/keyrange.cc +++ b/utilities/transactions/lock/range/range_tree/lib/locktree/keyrange.cc @@ -47,6 +47,7 @@ Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved. distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and + limitations under the License. 
======= */ #ident \ diff --git a/utilities/transactions/lock/range/range_tree/lib/locktree/keyrange.h b/utilities/transactions/lock/range/range_tree/lib/locktree/keyrange.h index 6b6c6cb9a..f9aeea0c4 100644 --- a/utilities/transactions/lock/range/range_tree/lib/locktree/keyrange.h +++ b/utilities/transactions/lock/range/range_tree/lib/locktree/keyrange.h @@ -45,6 +45,7 @@ Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved. distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and + limitations under the License. ======= */ #ident \ diff --git a/utilities/transactions/lock/range/range_tree/lib/locktree/lock_request.cc b/utilities/transactions/lock/range/range_tree/lib/locktree/lock_request.cc index 9df8f3cb3..fb14f98bd 100644 --- a/utilities/transactions/lock/range/range_tree/lib/locktree/lock_request.cc +++ b/utilities/transactions/lock/range/range_tree/lib/locktree/lock_request.cc @@ -47,6 +47,7 @@ Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved. distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and + limitations under the License. ======= */ #ident \ diff --git a/utilities/transactions/lock/range/range_tree/lib/locktree/lock_request.h b/utilities/transactions/lock/range/range_tree/lib/locktree/lock_request.h index 1dce4973f..ac47c3428 100644 --- a/utilities/transactions/lock/range/range_tree/lib/locktree/lock_request.h +++ b/utilities/transactions/lock/range/range_tree/lib/locktree/lock_request.h @@ -45,6 +45,7 @@ Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved. 
distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and + limitations under the License. ======= */ #ident \ diff --git a/utilities/transactions/lock/range/range_tree/lib/locktree/locktree.cc b/utilities/transactions/lock/range/range_tree/lib/locktree/locktree.cc index bcd3a0044..b8482a227 100644 --- a/utilities/transactions/lock/range/range_tree/lib/locktree/locktree.cc +++ b/utilities/transactions/lock/range/range_tree/lib/locktree/locktree.cc @@ -47,6 +47,7 @@ Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved. distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and + limitations under the License. ======= */ #ident \ diff --git a/utilities/transactions/lock/range/range_tree/lib/locktree/locktree.h b/utilities/transactions/lock/range/range_tree/lib/locktree/locktree.h index fd8223da4..3e438f502 100644 --- a/utilities/transactions/lock/range/range_tree/lib/locktree/locktree.h +++ b/utilities/transactions/lock/range/range_tree/lib/locktree/locktree.h @@ -45,6 +45,7 @@ Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved. distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and + limitations under the License. 
======= */ #ident \ diff --git a/utilities/transactions/lock/range/range_tree/lib/locktree/manager.cc b/utilities/transactions/lock/range/range_tree/lib/locktree/manager.cc index 775988d06..4186182be 100644 --- a/utilities/transactions/lock/range/range_tree/lib/locktree/manager.cc +++ b/utilities/transactions/lock/range/range_tree/lib/locktree/manager.cc @@ -47,6 +47,7 @@ Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved. distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and + limitations under the License. ======= */ #ident \ diff --git a/utilities/transactions/lock/range/range_tree/lib/locktree/range_buffer.cc b/utilities/transactions/lock/range/range_tree/lib/locktree/range_buffer.cc index 3759d750d..1e1d23ef8 100644 --- a/utilities/transactions/lock/range/range_tree/lib/locktree/range_buffer.cc +++ b/utilities/transactions/lock/range/range_tree/lib/locktree/range_buffer.cc @@ -47,6 +47,7 @@ Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved. distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and + limitations under the License. ======= */ #ident \ diff --git a/utilities/transactions/lock/range/range_tree/lib/locktree/range_buffer.h b/utilities/transactions/lock/range/range_tree/lib/locktree/range_buffer.h index 280ee9dd1..76e28d747 100644 --- a/utilities/transactions/lock/range/range_tree/lib/locktree/range_buffer.h +++ b/utilities/transactions/lock/range/range_tree/lib/locktree/range_buffer.h @@ -45,6 +45,7 @@ Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved. 
distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and + limitations under the License. ======= */ #ident \ diff --git a/utilities/transactions/lock/range/range_tree/lib/locktree/treenode.cc b/utilities/transactions/lock/range/range_tree/lib/locktree/treenode.cc index 5a6af141c..8997f634b 100644 --- a/utilities/transactions/lock/range/range_tree/lib/locktree/treenode.cc +++ b/utilities/transactions/lock/range/range_tree/lib/locktree/treenode.cc @@ -47,6 +47,7 @@ Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved. distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and + limitations under the License. ======= */ #ident \ diff --git a/utilities/transactions/lock/range/range_tree/lib/locktree/treenode.h b/utilities/transactions/lock/range/range_tree/lib/locktree/treenode.h index f87646b79..ec25a8c58 100644 --- a/utilities/transactions/lock/range/range_tree/lib/locktree/treenode.h +++ b/utilities/transactions/lock/range/range_tree/lib/locktree/treenode.h @@ -45,6 +45,7 @@ Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved. distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and + limitations under the License. 
======= */ #ident \ diff --git a/utilities/transactions/lock/range/range_tree/lib/locktree/txnid_set.cc b/utilities/transactions/lock/range/range_tree/lib/locktree/txnid_set.cc index 1e22f716b..4caf1e26f 100644 --- a/utilities/transactions/lock/range/range_tree/lib/locktree/txnid_set.cc +++ b/utilities/transactions/lock/range/range_tree/lib/locktree/txnid_set.cc @@ -47,6 +47,7 @@ Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved. distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and + limitations under the License. ======= */ #ident \ diff --git a/utilities/transactions/lock/range/range_tree/lib/locktree/txnid_set.h b/utilities/transactions/lock/range/range_tree/lib/locktree/txnid_set.h index 66e799e71..d79c24fb0 100644 --- a/utilities/transactions/lock/range/range_tree/lib/locktree/txnid_set.h +++ b/utilities/transactions/lock/range/range_tree/lib/locktree/txnid_set.h @@ -45,6 +45,7 @@ Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved. distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and + limitations under the License. ======= */ #ident \ diff --git a/utilities/transactions/lock/range/range_tree/lib/locktree/wfg.cc b/utilities/transactions/lock/range/range_tree/lib/locktree/wfg.cc index 5dfc94ad8..24536c88e 100644 --- a/utilities/transactions/lock/range/range_tree/lib/locktree/wfg.cc +++ b/utilities/transactions/lock/range/range_tree/lib/locktree/wfg.cc @@ -47,6 +47,7 @@ Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved. distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
See the License for the specific language governing permissions and + limitations under the License. ======= */ #ident \ diff --git a/utilities/transactions/lock/range/range_tree/lib/locktree/wfg.h b/utilities/transactions/lock/range/range_tree/lib/locktree/wfg.h index 8354e34a0..804202170 100644 --- a/utilities/transactions/lock/range/range_tree/lib/locktree/wfg.h +++ b/utilities/transactions/lock/range/range_tree/lib/locktree/wfg.h @@ -45,6 +45,7 @@ Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved. distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and + limitations under the License. ======= */ #ident \ diff --git a/utilities/transactions/lock/range/range_tree/lib/portability/memory.h b/utilities/transactions/lock/range/range_tree/lib/portability/memory.h index 4b340f6a8..0a621f8e0 100644 --- a/utilities/transactions/lock/range/range_tree/lib/portability/memory.h +++ b/utilities/transactions/lock/range/range_tree/lib/portability/memory.h @@ -32,6 +32,20 @@ Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved. You should have received a copy of the GNU Affero General Public License along with PerconaFT. If not, see . + +---------------------------------------- + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
======= */ #ident \ diff --git a/utilities/transactions/lock/range/range_tree/lib/portability/toku_atomic.h b/utilities/transactions/lock/range/range_tree/lib/portability/toku_atomic.h index c8f08eab7..aaa2298fa 100644 --- a/utilities/transactions/lock/range/range_tree/lib/portability/toku_atomic.h +++ b/utilities/transactions/lock/range/range_tree/lib/portability/toku_atomic.h @@ -32,6 +32,20 @@ Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved. You should have received a copy of the GNU Affero General Public License along with PerconaFT. If not, see . + +---------------------------------------- + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. ======= */ #ident \ diff --git a/utilities/transactions/lock/range/range_tree/lib/portability/toku_instrumentation.h b/utilities/transactions/lock/range/range_tree/lib/portability/toku_instrumentation.h index 1b85e2de1..c967e7177 100644 --- a/utilities/transactions/lock/range/range_tree/lib/portability/toku_instrumentation.h +++ b/utilities/transactions/lock/range/range_tree/lib/portability/toku_instrumentation.h @@ -1,3 +1,49 @@ +/*====== +This file is part of PerconaFT. + +Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved. + + PerconaFT is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License, version 2, + as published by the Free Software Foundation. 
+ + PerconaFT is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with PerconaFT. If not, see . + +---------------------------------------- + + PerconaFT is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License, version 3, + as published by the Free Software Foundation. + + PerconaFT is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with PerconaFT. If not, see . + +---------------------------------------- + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+======= */ + #pragma once #include // FILE diff --git a/utilities/transactions/lock/range/range_tree/lib/portability/toku_portability.h b/utilities/transactions/lock/range/range_tree/lib/portability/toku_portability.h index cb07e6516..9a95b38bd 100644 --- a/utilities/transactions/lock/range/range_tree/lib/portability/toku_portability.h +++ b/utilities/transactions/lock/range/range_tree/lib/portability/toku_portability.h @@ -32,6 +32,20 @@ Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved. You should have received a copy of the GNU Affero General Public License along with PerconaFT. If not, see . + +---------------------------------------- + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. ======= */ #ident \ diff --git a/utilities/transactions/lock/range/range_tree/lib/portability/toku_pthread.h b/utilities/transactions/lock/range/range_tree/lib/portability/toku_pthread.h index 0bd0a084a..bd1cc8e6c 100644 --- a/utilities/transactions/lock/range/range_tree/lib/portability/toku_pthread.h +++ b/utilities/transactions/lock/range/range_tree/lib/portability/toku_pthread.h @@ -32,6 +32,20 @@ Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved. You should have received a copy of the GNU Affero General Public License along with PerconaFT. If not, see . + +---------------------------------------- + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. 
+ You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. ======= */ #ident \ diff --git a/utilities/transactions/lock/range/range_tree/lib/portability/toku_race_tools.h b/utilities/transactions/lock/range/range_tree/lib/portability/toku_race_tools.h index 7ac9c00f2..3cb5b5790 100644 --- a/utilities/transactions/lock/range/range_tree/lib/portability/toku_race_tools.h +++ b/utilities/transactions/lock/range/range_tree/lib/portability/toku_race_tools.h @@ -32,6 +32,20 @@ Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved. You should have received a copy of the GNU Affero General Public License along with PerconaFT. If not, see . + +---------------------------------------- + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
======= */ #ident \ diff --git a/utilities/transactions/lock/range/range_tree/lib/portability/toku_time.h b/utilities/transactions/lock/range/range_tree/lib/portability/toku_time.h index 9e1dce27a..4425a4a2e 100644 --- a/utilities/transactions/lock/range/range_tree/lib/portability/toku_time.h +++ b/utilities/transactions/lock/range/range_tree/lib/portability/toku_time.h @@ -32,6 +32,20 @@ Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved. You should have received a copy of the GNU Affero General Public License along with PerconaFT. If not, see . + +---------------------------------------- + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. ======= */ #ident \ diff --git a/utilities/transactions/lock/range/range_tree/lib/util/dbt.h b/utilities/transactions/lock/range/range_tree/lib/util/dbt.h index 963ad7ccf..d86c440f8 100644 --- a/utilities/transactions/lock/range/range_tree/lib/util/dbt.h +++ b/utilities/transactions/lock/range/range_tree/lib/util/dbt.h @@ -32,6 +32,20 @@ Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved. You should have received a copy of the GNU Affero General Public License along with PerconaFT. If not, see . + +---------------------------------------- + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. 
+ You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. ======= */ #ident \ diff --git a/utilities/transactions/lock/range/range_tree/lib/util/growable_array.h b/utilities/transactions/lock/range/range_tree/lib/util/growable_array.h index ddd6a19a3..158750fdb 100644 --- a/utilities/transactions/lock/range/range_tree/lib/util/growable_array.h +++ b/utilities/transactions/lock/range/range_tree/lib/util/growable_array.h @@ -45,6 +45,7 @@ Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved. distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and + limitations under the License. ======= */ #ident \ diff --git a/utilities/transactions/lock/range/range_tree/lib/util/memarena.cc b/utilities/transactions/lock/range/range_tree/lib/util/memarena.cc index 9a68f76c0..0e7a9880b 100644 --- a/utilities/transactions/lock/range/range_tree/lib/util/memarena.cc +++ b/utilities/transactions/lock/range/range_tree/lib/util/memarena.cc @@ -34,6 +34,20 @@ Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved. You should have received a copy of the GNU Affero General Public License along with PerconaFT. If not, see . + +---------------------------------------- + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. 
+ You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. ======= */ #ident \ diff --git a/utilities/transactions/lock/range/range_tree/lib/util/memarena.h b/utilities/transactions/lock/range/range_tree/lib/util/memarena.h index 186c51995..ddcc1144f 100644 --- a/utilities/transactions/lock/range/range_tree/lib/util/memarena.h +++ b/utilities/transactions/lock/range/range_tree/lib/util/memarena.h @@ -32,6 +32,20 @@ Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved. You should have received a copy of the GNU Affero General Public License along with PerconaFT. If not, see . + +---------------------------------------- + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. ======= */ #ident \ diff --git a/utilities/transactions/lock/range/range_tree/lib/util/omt.h b/utilities/transactions/lock/range/range_tree/lib/util/omt.h index 63a3dd605..f208002d3 100644 --- a/utilities/transactions/lock/range/range_tree/lib/util/omt.h +++ b/utilities/transactions/lock/range/range_tree/lib/util/omt.h @@ -45,6 +45,7 @@ Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved. 
distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and + limitations under the License. ======= */ #ident \ diff --git a/utilities/transactions/lock/range/range_tree/lib/util/omt_impl.h b/utilities/transactions/lock/range/range_tree/lib/util/omt_impl.h index 86b681d6b..e77986716 100644 --- a/utilities/transactions/lock/range/range_tree/lib/util/omt_impl.h +++ b/utilities/transactions/lock/range/range_tree/lib/util/omt_impl.h @@ -45,6 +45,7 @@ Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved. distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and + limitations under the License. ======= */ #ident \ diff --git a/utilities/transactions/lock/range/range_tree/lib/util/partitioned_counter.h b/utilities/transactions/lock/range/range_tree/lib/util/partitioned_counter.h index 08fdda80d..f20eeedf2 100644 --- a/utilities/transactions/lock/range/range_tree/lib/util/partitioned_counter.h +++ b/utilities/transactions/lock/range/range_tree/lib/util/partitioned_counter.h @@ -32,6 +32,20 @@ Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved. You should have received a copy of the GNU Affero General Public License along with PerconaFT. If not, see . + +---------------------------------------- + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ See the License for the specific language governing permissions and + limitations under the License. ======= */ #ident \ diff --git a/utilities/transactions/lock/range/range_tree/lib/util/status.h b/utilities/transactions/lock/range/range_tree/lib/util/status.h index 996cefb50..3fd0095d0 100644 --- a/utilities/transactions/lock/range/range_tree/lib/util/status.h +++ b/utilities/transactions/lock/range/range_tree/lib/util/status.h @@ -32,6 +32,20 @@ Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved. You should have received a copy of the GNU Affero General Public License along with PerconaFT. If not, see . + +---------------------------------------- + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. ======= */ #ident \ From 2eaad9e1b1d3956e721bb76a7407b7853ef16d03 Mon Sep 17 00:00:00 2001 From: cheng-chang <57096775+cheng-chang@users.noreply.github.com> Date: Wed, 23 Dec 2020 11:32:10 -0800 Subject: [PATCH 02/25] Skip WALs according to MinLogNumberToKeep when creating checkpoint (#7789) Summary: In a stress test failure, we observe that a WAL is skipped when creating checkpoint, although its log number >= MinLogNumberToKeep(). This might happen in the following case: 1. when creating the checkpoint, there are 2 column families: CF0 and CF1, and there are 2 WALs: 1, 2; 2. CF0's log number is 1, CF0's active memtable is empty, CF1's log number is 2, CF1's active memtable is not empty, WAL 2 is not empty, the sequence number points to WAL 2; 2. 
the checkpoint process flushes CF0, since CF0's active memtable is empty, there is no need to SwitchMemtable, thus no new WAL will be created, so CF0's log number is now 2, concurrently, some data is written to CF0 and WAL 2; 3. the checkpoint process flushes CF1, WAL 3 is created and CF1's log number is now 3, CF0's log number is still 2 because CF0 is not empty and WAL 2 contains its unflushed data concurrently written in step 2; 4. the checkpoint process determines that WAL 1 and 2 are no longer needed according to [live_wal_files[i]->StartSequence() >= *sequence_number](https://github.com/facebook/rocksdb/blob/master/utilities/checkpoint/checkpoint_impl.cc#L388), so it skips linking them to the checkpoint directory; 5. but according to `MinLogNumberToKeep()`, WAL 2 still needs to be kept because CF0's log number is 2. If the checkpoint is reopened in read-only mode, and only read from the snapshot with the initial sequence number, then there will be no data loss or data inconsistency. But if the checkpoint is reopened and read from the most recent sequence number, suppose in step 3, there is also data concurrently written to CF1 and WAL 3, then the most recent sequence number refers to the latest entry in WAL 3, so the data written in step 2 should also be visible, but since WAL 2 is discarded, that data is lost. When tracking WAL in MANIFEST is enabled, when reopening the checkpoint, since WAL 2 is still tracked in MANIFEST as alive, but it's missing from the checkpoint directory, a corruption will be reported. This PR makes the checkpoint process skip a WAL only if its log number < `MinLogNumberToKeep`. Pull Request resolved: https://github.com/facebook/rocksdb/pull/7789 Test Plan: watch existing tests pass.
Reviewed By: ajkr Differential Revision: D25662346 Pulled By: cheng-chang fbshipit-source-id: 136471095baa01886cf44809455cf855f24857a0 --- utilities/backupable/backupable_db_test.cc | 10 ++++++++++ utilities/checkpoint/checkpoint_impl.cc | 12 ++++-------- 2 files changed, 14 insertions(+), 8 deletions(-) diff --git a/utilities/backupable/backupable_db_test.cc b/utilities/backupable/backupable_db_test.cc index 8d8ca95f8..1503a8c84 100644 --- a/utilities/backupable/backupable_db_test.cc +++ b/utilities/backupable/backupable_db_test.cc @@ -70,6 +70,16 @@ class DummyDB : public StackableDB { DBOptions GetDBOptions() const override { return DBOptions(options_); } + using StackableDB::GetIntProperty; + bool GetIntProperty(ColumnFamilyHandle*, const Slice& property, + uint64_t* value) override { + if (property == DB::Properties::kMinLogNumberToKeep) { + *value = 1; + return true; + } + return false; + } + Status EnableFileDeletions(bool /*force*/) override { EXPECT_TRUE(!deletions_enabled_); deletions_enabled_ = true; diff --git a/utilities/checkpoint/checkpoint_impl.cc b/utilities/checkpoint/checkpoint_impl.cc index 2caea3037..d496da57c 100644 --- a/utilities/checkpoint/checkpoint_impl.cc +++ b/utilities/checkpoint/checkpoint_impl.cc @@ -232,14 +232,11 @@ Status CheckpointImpl::CreateCustomCheckpoint( // this will return live_files prefixed with "/" s = db_->GetLiveFiles(live_files, &manifest_file_size, flush_memtable); + if (!db_->GetIntProperty(DB::Properties::kMinLogNumberToKeep, &min_log_num)) { + return Status::InvalidArgument("cannot get the min log number to keep."); + } + if (s.ok() && db_options.allow_2pc) { - // If 2PC is enabled, we need to get minimum log number after the flush. - // Need to refetch the live files to recapture the snapshot. 
- if (!db_->GetIntProperty(DB::Properties::kMinLogNumberToKeep, - &min_log_num)) { - return Status::InvalidArgument( - "2PC enabled but cannot fine the min log number to keep."); - } // We need to refetch live files with flush to handle this case: // A previous 000001.log contains the prepare record of transaction tnx1. // The current log file is 000002.log, and sequence_number points to this @@ -385,7 +382,6 @@ Status CheckpointImpl::CreateCustomCheckpoint( for (size_t i = 0; s.ok() && i < wal_size; ++i) { if ((live_wal_files[i]->Type() == kAliveLogFile) && (!flush_memtable || - live_wal_files[i]->StartSequence() >= *sequence_number || live_wal_files[i]->LogNumber() >= min_log_num)) { if (i + 1 == wal_size) { s = copy_file_cb(db_options.wal_dir, live_wal_files[i]->PathName(), From 79a21d67cbcfff5f5cded5840751db2a775ac3a4 Mon Sep 17 00:00:00 2001 From: Adam Retter Date: Wed, 30 Dec 2020 13:38:43 -0800 Subject: [PATCH 03/25] Attempt to fix build errors around missing compression library includes (#7803) Summary: This fixes an issue introduced in https://github.com/facebook/rocksdb/pull/7769 that caused many errors about missing compression libraries to be displayed during compilation, although compilation actually succeeded. This PR fixes the compilation so the compression libraries are only introduced where strictly needed. It likely needs to be merged into the same branches as https://github.com/facebook/rocksdb/pull/7769 which I think are: 1. master 2. 6.15.fb 3. 
6.16.fb Pull Request resolved: https://github.com/facebook/rocksdb/pull/7803 Reviewed By: ramvadiv Differential Revision: D25733743 Pulled By: pdillinger fbshipit-source-id: 6c04f6864b2ff4a345841d791a89b19e0e3f5bf7 --- Makefile | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/Makefile b/Makefile index 0398ccf67..7c865532c 100644 --- a/Makefile +++ b/Makefile @@ -2206,9 +2206,10 @@ endif JAVA_STATIC_FLAGS = -DZLIB -DBZIP2 -DSNAPPY -DLZ4 -DZSTD JAVA_STATIC_INCLUDES = -I./zlib-$(ZLIB_VER) -I./bzip2-$(BZIP2_VER) -I./snappy-$(SNAPPY_VER) -I./snappy-$(SNAPPY_VER)/build -I./lz4-$(LZ4_VER)/lib -I./zstd-$(ZSTD_VER)/lib -I./zstd-$(ZSTD_VER)/lib/dictBuilder -ifneq ($(findstring rocksdbjavastatic, $(MAKECMDGOALS)),) + +ifneq ($(findstring rocksdbjavastatic, $(filter-out rocksdbjavastatic_deps, $(MAKECMDGOALS))),) CXXFLAGS += $(JAVA_STATIC_FLAGS) $(JAVA_STATIC_INCLUDES) -CFLAGS += $(JAVA_STATIC_FLAGS) $(JAVA_STATIC_INCLUDES) +CFLAGS += $(JAVA_STATIC_FLAGS) $(JAVA_STATIC_INCLUDES) endif rocksdbjavastatic: ifeq ($(JAVA_HOME),) @@ -2216,8 +2217,11 @@ ifeq ($(JAVA_HOME),) endif $(MAKE) rocksdbjavastatic_deps $(MAKE) rocksdbjavastatic_libobjects - cd java;$(MAKE) javalib; - rm -f ./java/target/$(ROCKSDBJNILIB) + $(MAKE) rocksdbjavastatic_javalib + +rocksdbjavastatic_javalib: + cd java;$(MAKE) javalib + rm -f java/target/$(ROCKSDBJNILIB) $(CXX) $(CXXFLAGS) -I./java/. 
$(JAVA_INCLUDE) -shared -fPIC \ -o ./java/target/$(ROCKSDBJNILIB) $(JNI_NATIVE_SOURCES) \ $(LIB_OBJECTS) $(COVERAGEFLAGS) \ @@ -2440,6 +2444,8 @@ ifneq ($(MAKECMDGOALS),clean) ifneq ($(MAKECMDGOALS),format) ifneq ($(MAKECMDGOALS),jclean) ifneq ($(MAKECMDGOALS),jtest) +ifneq ($(MAKECMDGOALS),rocksdbjavastatic) +ifneq ($(MAKECMDGOALS),rocksdbjavastatic_deps) ifneq ($(MAKECMDGOALS),package) ifneq ($(MAKECMDGOALS),analyze) -include $(DEPFILES) @@ -2449,3 +2455,5 @@ endif endif endif endif +endif +endif From 2b64cddf993a2279079d43f1bbf77305f4ac064c Mon Sep 17 00:00:00 2001 From: Zhichao Cao Date: Thu, 7 Jan 2021 16:30:06 -0800 Subject: [PATCH 04/25] Treat File Scope Write IO Error the same as Retryable IO Error (#7840) Summary: In RocksDB, when IO error happens, the flags of IOStatus can be set. If the IOStatus is set as "File Scope IO Error", it indicate that the error is constrained in the file level. Since RocksDB does not continues write data to a file when any IO Error happens, File Scope IO Error can be treated the same as Retryable IO Error. Adding the logic to ErrorHandler::SetBGError to include the file scope IO Error in its error handling logic, which is the same as retryable IO Error. Pull Request resolved: https://github.com/facebook/rocksdb/pull/7840 Test Plan: added new unit tests in error_handler_fs_test. 
make check Reviewed By: anand1976 Differential Revision: D25820481 Pulled By: zhichao-cao fbshipit-source-id: 69cabd3d010073e064d6142ce1cabf341b8a6806 --- HISTORY.md | 2 + db/error_handler.cc | 36 +++++--- db/error_handler_fs_test.cc | 178 ++++++++++++++++++++++++++++++++++++ 3 files changed, 202 insertions(+), 14 deletions(-) diff --git a/HISTORY.md b/HISTORY.md index 543ad6048..d6e8f922c 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -1,7 +1,9 @@ # Rocksdb Change Log + ## 6.16.0 (12/18/2020) ### Behavior Changes * Attempting to write a merge operand without explicitly configuring `merge_operator` now fails immediately, causing the DB to enter read-only mode. Previously, failure was deferred until the `merge_operator` was needed by a user read or a background operation. +* Since RocksDB does not continue write the same file if a file write fails for any reason, the file scope write IO error is treated the same as retryable IO error. More information about error handling of file scope IO error is included in `ErrorHandler::SetBGError`. ### Bug Fixes * Truncated WALs ending in incomplete records can no longer produce gaps in the recovered data when `WALRecoveryMode::kPointInTimeRecovery` is used. Gaps are still possible when WALs are truncated exactly on record boundaries; for complete protection, users should enable `track_and_verify_wals_in_manifest`. diff --git a/db/error_handler.cc b/db/error_handler.cc index f121519f4..cc313f0a9 100644 --- a/db/error_handler.cc +++ b/db/error_handler.cc @@ -350,12 +350,17 @@ const Status& ErrorHandler::SetBGError(const Status& bg_err, // This is the main function for looking at IO related error during the // background operations. The main logic is: +// 1) File scope IO error is treated as retryable IO error in the write +// path. In RocksDB, If a file has write IO error and it is at file scope, +// RocksDB never write to the same file again. RocksDB will create a new +// file and rewrite the whole content. Thus, it is retryable. 
// 1) if the error is caused by data loss, the error is mapped to // unrecoverable error. Application/user must take action to handle -// this situation. -// 2) if the error is a Retryable IO error, auto resume will be called and the -// auto resume can be controlled by resume count and resume interval -// options. There are three sub-cases: +// this situation (File scope case is excluded). +// 2) if the error is a Retryable IO error (i.e., it is a file scope IO error, +// or its retryable flag is set and not a data loss error), auto resume +// will be called and the auto resume can be controlled by resume count +// and resume interval options. There are three sub-cases: // a) if the error happens during compaction, it is mapped to a soft error. // the compaction thread will reschedule a new compaction. // b) if the error happens during flush and also WAL is empty, it is mapped @@ -384,9 +389,10 @@ const Status& ErrorHandler::SetBGError(const IOStatus& bg_io_err, Status new_bg_io_err = bg_io_err; DBRecoverContext context; - if (bg_io_err.GetDataLoss()) { - // First, data loss is treated as unrecoverable error. So it can directly - // overwrite any existing bg_error_. + if (bg_io_err.GetScope() != IOStatus::IOErrorScope::kIOErrorScopeFile && + bg_io_err.GetDataLoss()) { + // First, data loss (non file scope) is treated as unrecoverable error. So + // it can directly overwrite any existing bg_error_. bool auto_recovery = false; Status bg_err(new_bg_io_err, Status::Severity::kUnrecoverableError); bg_error_ = bg_err; @@ -397,13 +403,15 @@ const Status& ErrorHandler::SetBGError(const IOStatus& bg_io_err, &bg_err, db_mutex_, &auto_recovery); recover_context_ = context; return bg_error_; - } else if (bg_io_err.GetRetryable()) { - // Second, check if the error is a retryable IO error or not. if it is - // retryable error and its severity is higher than bg_error_, overwrite - // the bg_error_ with new error. 
- // In current stage, for retryable IO error of compaction, treat it as - // soft error. In other cases, treat the retryable IO error as hard - // error. + } else if (bg_io_err.GetScope() == + IOStatus::IOErrorScope::kIOErrorScopeFile || + bg_io_err.GetRetryable()) { + // Second, check if the error is a retryable IO error (file scope IO error + // is also treated as retryable IO error in RocksDB write path). if it is + // retryable error and its severity is higher than bg_error_, overwrite the + // bg_error_ with new error. In current stage, for retryable IO error of + // compaction, treat it as soft error. In other cases, treat the retryable + // IO error as hard error. bool auto_recovery = false; EventHelpers::NotifyOnBackgroundError(db_options_.listeners, reason, &new_bg_io_err, db_mutex_, diff --git a/db/error_handler_fs_test.cc b/db/error_handler_fs_test.cc index c17cac290..29345505d 100644 --- a/db/error_handler_fs_test.cc +++ b/db/error_handler_fs_test.cc @@ -241,6 +241,90 @@ TEST_F(DBErrorHandlingFSTest, FLushWritRetryableError) { Destroy(options); } +TEST_F(DBErrorHandlingFSTest, FLushWritFileScopeError) { + std::shared_ptr listener( + new ErrorHandlerFSListener()); + Options options = GetDefaultOptions(); + options.env = fault_env_.get(); + options.create_if_missing = true; + options.listeners.emplace_back(listener); + options.max_bgerror_resume_count = 0; + Status s; + + listener->EnableAutoRecovery(false); + DestroyAndReopen(options); + + IOStatus error_msg = IOStatus::IOError("File Scope Data Loss Error"); + error_msg.SetDataLoss(true); + error_msg.SetScope( + ROCKSDB_NAMESPACE::IOStatus::IOErrorScope::kIOErrorScopeFile); + error_msg.SetRetryable(false); + + ASSERT_OK(Put(Key(1), "val1")); + SyncPoint::GetInstance()->SetCallBack( + "BuildTable:BeforeFinishBuildTable", + [&](void*) { fault_fs_->SetFilesystemActive(false, error_msg); }); + SyncPoint::GetInstance()->EnableProcessing(); + s = Flush(); + ASSERT_EQ(s.severity(), 
ROCKSDB_NAMESPACE::Status::Severity::kHardError); + SyncPoint::GetInstance()->DisableProcessing(); + fault_fs_->SetFilesystemActive(true); + s = dbfull()->Resume(); + ASSERT_OK(s); + Reopen(options); + ASSERT_EQ("val1", Get(Key(1))); + + ASSERT_OK(Put(Key(2), "val2")); + SyncPoint::GetInstance()->SetCallBack( + "BuildTable:BeforeSyncTable", + [&](void*) { fault_fs_->SetFilesystemActive(false, error_msg); }); + SyncPoint::GetInstance()->EnableProcessing(); + s = Flush(); + ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kHardError); + SyncPoint::GetInstance()->DisableProcessing(); + fault_fs_->SetFilesystemActive(true); + s = dbfull()->Resume(); + ASSERT_OK(s); + Reopen(options); + ASSERT_EQ("val2", Get(Key(2))); + + ASSERT_OK(Put(Key(3), "val3")); + SyncPoint::GetInstance()->SetCallBack( + "BuildTable:BeforeCloseTableFile", + [&](void*) { fault_fs_->SetFilesystemActive(false, error_msg); }); + SyncPoint::GetInstance()->EnableProcessing(); + s = Flush(); + ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kHardError); + SyncPoint::GetInstance()->DisableProcessing(); + fault_fs_->SetFilesystemActive(true); + s = dbfull()->Resume(); + ASSERT_OK(s); + Reopen(options); + ASSERT_EQ("val3", Get(Key(3))); + + // not file scope, but retyrable set + error_msg.SetDataLoss(false); + error_msg.SetScope( + ROCKSDB_NAMESPACE::IOStatus::IOErrorScope::kIOErrorScopeFileSystem); + error_msg.SetRetryable(true); + + ASSERT_OK(Put(Key(3), "val3")); + SyncPoint::GetInstance()->SetCallBack( + "BuildTable:BeforeCloseTableFile", + [&](void*) { fault_fs_->SetFilesystemActive(false, error_msg); }); + SyncPoint::GetInstance()->EnableProcessing(); + s = Flush(); + ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kHardError); + SyncPoint::GetInstance()->DisableProcessing(); + fault_fs_->SetFilesystemActive(true); + s = dbfull()->Resume(); + ASSERT_OK(s); + Reopen(options); + ASSERT_EQ("val3", Get(Key(3))); + + Destroy(options); +} + 
TEST_F(DBErrorHandlingFSTest, FLushWritNoWALRetryableError1) { std::shared_ptr listener( new ErrorHandlerFSListener()); @@ -453,6 +537,52 @@ TEST_F(DBErrorHandlingFSTest, ManifestWriteRetryableError) { Close(); } +TEST_F(DBErrorHandlingFSTest, ManifestWriteFileScopeError) { + std::shared_ptr listener( + new ErrorHandlerFSListener()); + Options options = GetDefaultOptions(); + options.env = fault_env_.get(); + options.create_if_missing = true; + options.listeners.emplace_back(listener); + options.max_bgerror_resume_count = 0; + Status s; + std::string old_manifest; + std::string new_manifest; + + listener->EnableAutoRecovery(false); + DestroyAndReopen(options); + old_manifest = GetManifestNameFromLiveFiles(); + + IOStatus error_msg = IOStatus::IOError("File Scope Data Loss Error"); + error_msg.SetDataLoss(true); + error_msg.SetScope( + ROCKSDB_NAMESPACE::IOStatus::IOErrorScope::kIOErrorScopeFile); + error_msg.SetRetryable(false); + + ASSERT_OK(Put(Key(0), "val")); + ASSERT_OK(Flush()); + ASSERT_OK(Put(Key(1), "val")); + SyncPoint::GetInstance()->SetCallBack( + "VersionSet::LogAndApply:WriteManifest", + [&](void*) { fault_fs_->SetFilesystemActive(false, error_msg); }); + SyncPoint::GetInstance()->EnableProcessing(); + s = Flush(); + ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kHardError); + SyncPoint::GetInstance()->ClearAllCallBacks(); + SyncPoint::GetInstance()->DisableProcessing(); + fault_fs_->SetFilesystemActive(true); + s = dbfull()->Resume(); + ASSERT_OK(s); + + new_manifest = GetManifestNameFromLiveFiles(); + ASSERT_NE(new_manifest, old_manifest); + + Reopen(options); + ASSERT_EQ("val", Get(Key(0))); + ASSERT_EQ("val", Get(Key(1))); + Close(); +} + TEST_F(DBErrorHandlingFSTest, ManifestWriteNoWALRetryableError) { std::shared_ptr listener( new ErrorHandlerFSListener()); @@ -779,6 +909,54 @@ TEST_F(DBErrorHandlingFSTest, CompactionWriteRetryableError) { Destroy(options); } +TEST_F(DBErrorHandlingFSTest, CompactionWriteFileScopeError) { + 
std::shared_ptr listener( + new ErrorHandlerFSListener()); + Options options = GetDefaultOptions(); + options.env = fault_env_.get(); + options.create_if_missing = true; + options.level0_file_num_compaction_trigger = 2; + options.listeners.emplace_back(listener); + options.max_bgerror_resume_count = 0; + Status s; + DestroyAndReopen(options); + + IOStatus error_msg = IOStatus::IOError("File Scope Data Loss Error"); + error_msg.SetDataLoss(true); + error_msg.SetScope( + ROCKSDB_NAMESPACE::IOStatus::IOErrorScope::kIOErrorScopeFile); + error_msg.SetRetryable(false); + + ASSERT_OK(Put(Key(0), "va;")); + ASSERT_OK(Put(Key(2), "va;")); + s = Flush(); + ASSERT_OK(s); + + listener->OverrideBGError(Status(error_msg, Status::Severity::kHardError)); + listener->EnableAutoRecovery(false); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency( + {{"DBImpl::FlushMemTable:FlushMemTableFinished", + "BackgroundCallCompaction:0"}}); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "CompactionJob::OpenCompactionOutputFile", + [&](void*) { fault_fs_->SetFilesystemActive(false, error_msg); }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + + ASSERT_OK(Put(Key(1), "val")); + s = Flush(); + ASSERT_OK(s); + + s = dbfull()->TEST_WaitForCompact(); + ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kSoftError); + + fault_fs_->SetFilesystemActive(true); + SyncPoint::GetInstance()->ClearAllCallBacks(); + SyncPoint::GetInstance()->DisableProcessing(); + s = dbfull()->Resume(); + ASSERT_OK(s); + Destroy(options); +} + TEST_F(DBErrorHandlingFSTest, CorruptionError) { Options options = GetDefaultOptions(); options.env = fault_env_.get(); From eeea27a0481658419e99433abf40fbe598d44d7a Mon Sep 17 00:00:00 2001 From: Cheng Chang Date: Thu, 7 Jan 2021 23:00:12 -0800 Subject: [PATCH 05/25] Get manifest size again after getting min_log_num during checkpoint (#7836) Summary: Currently, manifest size is determined before getting min_log_num. 
But between getting manifest size and getting min_log_num, concurrently, a flush might succeed, which will write new records to manifest to make some WALs become outdated, then min_log_num will be correspondingly increased, but the new records in manifest will not be copied into the checkpoint because the manifest's size is determined before them, then the newly outdated WALs will still exist in the checkpoint's manifest, but they are not linked/copied to the checkpoint because their log number is < min_log_num, so a corruption of missing WAL will be reported when restoring from the checkpoint. Pull Request resolved: https://github.com/facebook/rocksdb/pull/7836 Test Plan: make crash_test Reviewed By: ajkr Differential Revision: D25788204 Pulled By: cheng-chang fbshipit-source-id: a4e5acf30f08270b3c0a95304ff559a9e655252f --- utilities/checkpoint/checkpoint_impl.cc | 32 ++++++++++--------------- 1 file changed, 12 insertions(+), 20 deletions(-) diff --git a/utilities/checkpoint/checkpoint_impl.cc b/utilities/checkpoint/checkpoint_impl.cc index d496da57c..b533c1d3d 100644 --- a/utilities/checkpoint/checkpoint_impl.cc +++ b/utilities/checkpoint/checkpoint_impl.cc @@ -235,26 +235,18 @@ Status CheckpointImpl::CreateCustomCheckpoint( if (!db_->GetIntProperty(DB::Properties::kMinLogNumberToKeep, &min_log_num)) { return Status::InvalidArgument("cannot get the min log number to keep."); } - - if (s.ok() && db_options.allow_2pc) { - // We need to refetch live files with flush to handle this case: - // A previous 000001.log contains the prepare record of transaction tnx1. - // The current log file is 000002.log, and sequence_number points to this - // file. - // After calling GetLiveFiles(), 000003.log is created. - // Then tnx1 is committed. The commit record is written to 000003.log. - // Now we fetch min_log_num, which will be 3. - // Then only 000002.log and 000003.log will be copied, and 000001.log will - // be skipped. 
000003.log contains commit message of tnx1, but we don't - // have respective prepare record for it. - // In order to avoid this situation, we need to force flush to make sure - // all transactions committed before getting min_log_num will be flushed - // to SST files. - // We cannot get min_log_num before calling the GetLiveFiles() for the - // first time, because if we do that, all the logs files will be included, - // far more than needed. - s = db_->GetLiveFiles(live_files, &manifest_file_size, flush_memtable); - } + // Between GetLiveFiles and getting min_log_num, flush might happen + // concurrently, so new WAL deletions might be tracked in MANIFEST. If we do + // not get the new MANIFEST size, the deleted WALs might not be reflected in + // the checkpoint's MANIFEST. + // + // If we get min_log_num before the above GetLiveFiles, then there might + // be too many unnecessary WALs to be included in the checkpoint. + // + // Ideally, min_log_num should be got together with manifest_file_size in + // GetLiveFiles atomically. But that needs changes to GetLiveFiles' signature + // which is a public API. + s = db_->GetLiveFiles(live_files, &manifest_file_size, flush_memtable); TEST_SYNC_POINT("CheckpointImpl::CreateCheckpoint:SavedLiveFiles1"); TEST_SYNC_POINT("CheckpointImpl::CreateCheckpoint:SavedLiveFiles2"); From dff9219df6765e0357e5f1aaab61fcee088a47c6 Mon Sep 17 00:00:00 2001 From: Jay Zhuang Date: Sat, 9 Jan 2021 13:22:01 -0800 Subject: [PATCH 06/25] Fix checkpoint_test hang (#7849) Summary: `CheckpointTest.CurrentFileModifiedWhileCheckpointing` could hang because now create checkpoint triggers flush twice. The test should wait both flush done. 
Pull Request resolved: https://github.com/facebook/rocksdb/pull/7849 Test Plan: `gtest-parallel ./checkpoint_test --gtest_filter=CheckpointTest.CurrentFileModifiedWhileCheckpointing -r 100` Reviewed By: ajkr Differential Revision: D25860713 Pulled By: jay-zhuang fbshipit-source-id: e1c2f23037dedc33e205519f4289a25e77816b41 --- utilities/checkpoint/checkpoint_impl.cc | 1 + utilities/checkpoint/checkpoint_test.cc | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/utilities/checkpoint/checkpoint_impl.cc b/utilities/checkpoint/checkpoint_impl.cc index b533c1d3d..bea5d69e2 100644 --- a/utilities/checkpoint/checkpoint_impl.cc +++ b/utilities/checkpoint/checkpoint_impl.cc @@ -247,6 +247,7 @@ Status CheckpointImpl::CreateCustomCheckpoint( // GetLiveFiles atomically. But that needs changes to GetLiveFiles' signature // which is a public API. s = db_->GetLiveFiles(live_files, &manifest_file_size, flush_memtable); + TEST_SYNC_POINT("CheckpointImpl::CreateCheckpoint:FlushDone"); TEST_SYNC_POINT("CheckpointImpl::CreateCheckpoint:SavedLiveFiles1"); TEST_SYNC_POINT("CheckpointImpl::CreateCheckpoint:SavedLiveFiles2"); diff --git a/utilities/checkpoint/checkpoint_test.cc b/utilities/checkpoint/checkpoint_test.cc index 784fb5d46..d63f34483 100644 --- a/utilities/checkpoint/checkpoint_test.cc +++ b/utilities/checkpoint/checkpoint_test.cc @@ -549,7 +549,7 @@ TEST_F(CheckpointTest, CurrentFileModifiedWhileCheckpointing) { {// Get past the flush in the checkpoint thread before adding any keys to // the db so the checkpoint thread won't hit the WriteManifest // syncpoints. - {"DBImpl::GetLiveFiles:1", + {"CheckpointImpl::CreateCheckpoint:FlushDone", "CheckpointTest::CurrentFileModifiedWhileCheckpointing:PrePut"}, // Roll the manifest during checkpointing right after live files are // snapshotted. 
From 0a91c691a902c585eb5eacc02ba5b61c138f278b Mon Sep 17 00:00:00 2001 From: Cheng Chang Date: Mon, 11 Jan 2021 13:33:08 -0800 Subject: [PATCH 07/25] Add note for PR 7789 in history (#7855) Summary: Pull Request resolved: https://github.com/facebook/rocksdb/pull/7855 Reviewed By: ajkr Differential Revision: D25872797 Pulled By: cheng-chang fbshipit-source-id: 82159a13f897aaaad5f3c70c7dfa822e073bc623 --- HISTORY.md | 1 + 1 file changed, 1 insertion(+) diff --git a/HISTORY.md b/HISTORY.md index d6e8f922c..b27643636 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -12,6 +12,7 @@ * Fixed the logic of populating native data structure for `read_amp_bytes_per_bit` during OPTIONS file parsing on big-endian architecture. Without this fix, original code introduced in PR7659, when running on big-endian machine, can mistakenly store read_amp_bytes_per_bit (an uint32) in little endian format. Future access to `read_amp_bytes_per_bit` will give wrong values. Little endian architecture is not affected. * Fixed prefix extractor with timestamp issues. * Fixed a bug in atomic flush: in two-phase commit mode, the minimum WAL log number to keep is incorrect. +* Fixed a bug related to checkpoint in PR7789: if there are multiple column families, and the checkpoint is not opened as read only, then in rare cases, data loss may happen in the checkpoint. Since backup engine relies on checkpoint, it may also be affected. ### New Features * User defined timestamp feature supports `CompactRange` and `GetApproximateSizes`. 
From 67f8189e68435b8c36ca9f08ccfd83deeef199a4 Mon Sep 17 00:00:00 2001 From: Cheng Chang Date: Tue, 19 Jan 2021 16:08:09 -0800 Subject: [PATCH 08/25] Update HISTORY.md --- HISTORY.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/HISTORY.md b/HISTORY.md index b27643636..89ec93a44 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -21,6 +21,7 @@ ### Public API Change * Deprecated public but rarely-used FilterBitsBuilder::CalculateNumEntry, which is replaced with ApproximateNumEntries taking a size_t parameter and returning size_t. +* Added a new option `track_and_verify_wals_in_manifest`. If `true`, the log numbers and sizes of the synced WALs are tracked in MANIFEST, then during DB recovery, if a synced WAL is missing from disk, or the WAL's size does not match the recorded size in MANIFEST, an error will be reported and the recovery will be aborted. Note that this option does not work with secondary instance. ## 6.15.0 (11/13/2020) ### Bug Fixes @@ -42,7 +43,6 @@ ### Public API Change * Deprecate `BlockBasedTableOptions::pin_l0_filter_and_index_blocks_in_cache` and `BlockBasedTableOptions::pin_top_level_index_and_filter`. These options still take effect until users migrate to the replacement APIs in `BlockBasedTableOptions::metadata_cache_options`. Migration guidance can be found in the API comments on the deprecated options. * Add new API `DB::VerifyFileChecksums` to verify SST file checksum with corresponding entries in the MANIFEST if present. Current implementation requires scanning and recomputing file checksums. -* Added a new option `track_and_verify_wals_in_manifest`. If `true`, the log numbers and sizes of the synced WALs are tracked in MANIFEST, then during DB recovery, if a synced WAL is missing from disk, or the WAL's size does not match the recorded size in MANIFEST, an error will be reported and the recovery will be aborted. Note that this option does not work with secondary instance. 
### Behavior Changes * The dictionary compression settings specified in `ColumnFamilyOptions::compression_opts` now additionally affect files generated by flush and compaction to non-bottommost level. Previously those settings at most affected files generated by compaction to bottommost level, depending on whether `ColumnFamilyOptions::bottommost_compression_opts` overrode them. Users who relied on dictionary compression settings in `ColumnFamilyOptions::compression_opts` affecting only the bottommost level can keep the behavior by moving their dictionary settings to `ColumnFamilyOptions::bottommost_compression_opts` and setting its `enabled` flag. From e931bbfec048f0de9c83719f84bedc51b4df8700 Mon Sep 17 00:00:00 2001 From: Cheng Chang Date: Tue, 19 Jan 2021 19:26:05 -0800 Subject: [PATCH 09/25] Make it able to ignore WAL related VersionEdits in older versions (#7873) Summary: Although the tags for `WalAddition`, `WalDeletion` are after `kTagSafeIgnoreMask`, to actually be able to skip these entries in older versions of RocksDB, we require that they are encoded with their encoded size as the prefix. This requirement is not met in the current codebase, so a downgraded DB may fail to open if these entries exist in the MANIFEST. If a DB wants to downgrade, and its MANIFEST contains `WalAddition` or `WalDeletion`, it can set `track_and_verify_wals_in_manifest` to `false`, then restart twice, then downgrade. On the first restart, a new MANIFEST will be created with a `WalDeletion` indicating that all previously tracked WALs are removed from MANIFEST. On the second restart, since there is no tracked WALs in MANIFEST now, a new MANIFEST will be created with neither `WalAddition` nor `WalDeletion`. Then the DB can downgrade. Tags for `BlobFileAddition`, `BlobFileGarbage` also have the same problem, but this PR focuses on solving the problem for WAL edits. 
Pull Request resolved: https://github.com/facebook/rocksdb/pull/7873 Test Plan: Added a `VersionEditTest::IgnorableTags` unit test to verify all entries with tags larger than `kTagSafeIgnoreMask` can actually be skipped and won't affect parsing of other entries. Reviewed By: ajkr Differential Revision: D25935930 Pulled By: cheng-chang fbshipit-source-id: 7a02fdba4311d6084328c14aed110a26d08c3efb --- db/version_edit.cc | 56 ++++++++++++++++++++-- db/version_edit.h | 2 + db/version_edit_test.cc | 102 ++++++++++++++++++++++++++++++++++------ 3 files changed, 142 insertions(+), 18 deletions(-) diff --git a/db/version_edit.cc b/db/version_edit.cc index ddaadc58d..284b65f71 100644 --- a/db/version_edit.cc +++ b/db/version_edit.cc @@ -226,13 +226,17 @@ bool VersionEdit::EncodeTo(std::string* dst) const { } for (const auto& wal_addition : wal_additions_) { - PutVarint32(dst, kWalAddition); - wal_addition.EncodeTo(dst); + PutVarint32(dst, kWalAddition2); + std::string encoded; + wal_addition.EncodeTo(&encoded); + PutLengthPrefixedSlice(dst, encoded); } if (!wal_deletion_.IsEmpty()) { - PutVarint32(dst, kWalDeletion); - wal_deletion_.EncodeTo(dst); + PutVarint32(dst, kWalDeletion2); + std::string encoded; + wal_deletion_.EncodeTo(&encoded); + PutLengthPrefixedSlice(dst, encoded); } // 0 is default and does not need to be explicitly written @@ -375,6 +379,11 @@ const char* VersionEdit::DecodeNewFile4From(Slice* input) { Status VersionEdit::DecodeFrom(const Slice& src) { Clear(); +#ifndef NDEBUG + bool ignore_ignorable_tags = false; + TEST_SYNC_POINT_CALLBACK("VersionEdit::EncodeTo:IgnoreIgnorableTags", + &ignore_ignorable_tags); +#endif Slice input = src; const char* msg = nullptr; uint32_t tag = 0; @@ -385,6 +394,11 @@ Status VersionEdit::DecodeFrom(const Slice& src) { Slice str; InternalKey key; while (msg == nullptr && GetVarint32(&input, &tag)) { +#ifndef NDEBUG + if (ignore_ignorable_tags && tag > kTagSafeIgnoreMask) { + tag = kTagSafeIgnoreMask; + } +#endif switch (tag) 
{ case kDbId: if (GetLengthPrefixedSlice(&input, &str)) { @@ -575,6 +589,23 @@ Status VersionEdit::DecodeFrom(const Slice& src) { break; } + case kWalAddition2: { + Slice encoded; + if (!GetLengthPrefixedSlice(&input, &encoded)) { + msg = "WalAddition not prefixed by length"; + break; + } + + WalAddition wal_addition; + const Status s = wal_addition.DecodeFrom(&encoded); + if (!s.ok()) { + return s; + } + + wal_additions_.emplace_back(std::move(wal_addition)); + break; + } + case kWalDeletion: { WalDeletion wal_deletion; const Status s = wal_deletion.DecodeFrom(&input); @@ -586,6 +617,23 @@ Status VersionEdit::DecodeFrom(const Slice& src) { break; } + case kWalDeletion2: { + Slice encoded; + if (!GetLengthPrefixedSlice(&input, &encoded)) { + msg = "WalDeletion not prefixed by length"; + break; + } + + WalDeletion wal_deletion; + const Status s = wal_deletion.DecodeFrom(&encoded); + if (!s.ok()) { + return s; + } + + wal_deletion_ = std::move(wal_deletion); + break; + } + case kColumnFamily: if (!GetVarint32(&input, &column_family_)) { if (!msg) { diff --git a/db/version_edit.h b/db/version_edit.h index 6b045878b..a80543a0d 100644 --- a/db/version_edit.h +++ b/db/version_edit.h @@ -62,6 +62,8 @@ enum Tag : uint32_t { kWalAddition, kWalDeletion, kFullHistoryTsLow, + kWalAddition2, + kWalDeletion2, }; enum NewFileCustomTag : uint32_t { diff --git a/db/version_edit_test.cc b/db/version_edit_test.cc index a0869b3c7..43ae6840f 100644 --- a/db/version_edit_test.cc +++ b/db/version_edit_test.cc @@ -324,14 +324,22 @@ TEST_F(VersionEditTest, AddWalEncodeDecode) { TestEncodeDecode(edit); } +static std::string PrefixEncodedWalAdditionWithLength( + const std::string& encoded) { + std::string ret; + PutVarint32(&ret, Tag::kWalAddition2); + PutLengthPrefixedSlice(&ret, encoded); + return ret; +} + TEST_F(VersionEditTest, AddWalDecodeBadLogNumber) { std::string encoded; - PutVarint32(&encoded, Tag::kWalAddition); { // No log number. 
+ std::string encoded_edit = PrefixEncodedWalAdditionWithLength(encoded); VersionEdit edit; - Status s = edit.DecodeFrom(encoded); + Status s = edit.DecodeFrom(encoded_edit); ASSERT_TRUE(s.IsCorruption()); ASSERT_TRUE(s.ToString().find("Error decoding WAL log number") != std::string::npos) @@ -345,8 +353,10 @@ TEST_F(VersionEditTest, AddWalDecodeBadLogNumber) { unsigned char* ptr = reinterpret_cast(&c); *ptr = 128; encoded.append(1, c); + + std::string encoded_edit = PrefixEncodedWalAdditionWithLength(encoded); VersionEdit edit; - Status s = edit.DecodeFrom(encoded); + Status s = edit.DecodeFrom(encoded_edit); ASSERT_TRUE(s.IsCorruption()); ASSERT_TRUE(s.ToString().find("Error decoding WAL log number") != std::string::npos) @@ -358,14 +368,14 @@ TEST_F(VersionEditTest, AddWalDecodeBadTag) { constexpr WalNumber kLogNumber = 100; constexpr uint64_t kSizeInBytes = 100; - std::string encoded_without_tag; - PutVarint32(&encoded_without_tag, Tag::kWalAddition); - PutVarint64(&encoded_without_tag, kLogNumber); + std::string encoded; + PutVarint64(&encoded, kLogNumber); { // No tag. + std::string encoded_edit = PrefixEncodedWalAdditionWithLength(encoded); VersionEdit edit; - Status s = edit.DecodeFrom(encoded_without_tag); + Status s = edit.DecodeFrom(encoded_edit); ASSERT_TRUE(s.IsCorruption()); ASSERT_TRUE(s.ToString().find("Error decoding tag") != std::string::npos) << s.ToString(); @@ -373,12 +383,15 @@ TEST_F(VersionEditTest, AddWalDecodeBadTag) { { // Only has size tag, no terminate tag. 
- std::string encoded_with_size = encoded_without_tag; + std::string encoded_with_size = encoded; PutVarint32(&encoded_with_size, static_cast(WalAdditionTag::kSyncedSize)); PutVarint64(&encoded_with_size, kSizeInBytes); + + std::string encoded_edit = + PrefixEncodedWalAdditionWithLength(encoded_with_size); VersionEdit edit; - Status s = edit.DecodeFrom(encoded_with_size); + Status s = edit.DecodeFrom(encoded_edit); ASSERT_TRUE(s.IsCorruption()); ASSERT_TRUE(s.ToString().find("Error decoding tag") != std::string::npos) << s.ToString(); @@ -386,11 +399,14 @@ TEST_F(VersionEditTest, AddWalDecodeBadTag) { { // Only has terminate tag. - std::string encoded_with_terminate = encoded_without_tag; + std::string encoded_with_terminate = encoded; PutVarint32(&encoded_with_terminate, static_cast(WalAdditionTag::kTerminate)); + + std::string encoded_edit = + PrefixEncodedWalAdditionWithLength(encoded_with_terminate); VersionEdit edit; - ASSERT_OK(edit.DecodeFrom(encoded_with_terminate)); + ASSERT_OK(edit.DecodeFrom(encoded_edit)); auto& wal_addition = edit.GetWalAdditions()[0]; ASSERT_EQ(wal_addition.GetLogNumber(), kLogNumber); ASSERT_FALSE(wal_addition.GetMetadata().HasSyncedSize()); @@ -401,15 +417,15 @@ TEST_F(VersionEditTest, AddWalDecodeNoSize) { constexpr WalNumber kLogNumber = 100; std::string encoded; - PutVarint32(&encoded, Tag::kWalAddition); PutVarint64(&encoded, kLogNumber); PutVarint32(&encoded, static_cast(WalAdditionTag::kSyncedSize)); // No real size after the size tag. { // Without terminate tag. + std::string encoded_edit = PrefixEncodedWalAdditionWithLength(encoded); VersionEdit edit; - Status s = edit.DecodeFrom(encoded); + Status s = edit.DecodeFrom(encoded_edit); ASSERT_TRUE(s.IsCorruption()); ASSERT_TRUE(s.ToString().find("Error decoding WAL file size") != std::string::npos) @@ -419,8 +435,10 @@ TEST_F(VersionEditTest, AddWalDecodeNoSize) { { // With terminate tag. 
PutVarint32(&encoded, static_cast(WalAdditionTag::kTerminate)); + + std::string encoded_edit = PrefixEncodedWalAdditionWithLength(encoded); VersionEdit edit; - Status s = edit.DecodeFrom(encoded); + Status s = edit.DecodeFrom(encoded_edit); ASSERT_TRUE(s.IsCorruption()); // The terminate tag is misunderstood as the size. ASSERT_TRUE(s.ToString().find("Error decoding tag") != std::string::npos) @@ -515,6 +533,62 @@ TEST_F(VersionEditTest, FullHistoryTsLow) { TestEncodeDecode(edit); } +// Tests that if RocksDB is downgraded, the new types of VersionEdits +// that have a tag larger than kTagSafeIgnoreMask can be safely ignored. +TEST_F(VersionEditTest, IgnorableTags) { + SyncPoint::GetInstance()->SetCallBack( + "VersionEdit::EncodeTo:IgnoreIgnorableTags", [&](void* arg) { + bool* ignore = static_cast(arg); + *ignore = true; + }); + SyncPoint::GetInstance()->EnableProcessing(); + + constexpr uint64_t kPrevLogNumber = 100; + constexpr uint64_t kLogNumber = 200; + constexpr uint64_t kNextFileNumber = 300; + constexpr uint64_t kColumnFamilyId = 400; + + VersionEdit edit; + // Add some ignorable entries. + for (int i = 0; i < 2; i++) { + edit.AddWal(i + 1, WalMetadata(i + 2)); + } + edit.SetDBId("db_id"); + // Add unignorable entries. + edit.SetPrevLogNumber(kPrevLogNumber); + edit.SetLogNumber(kLogNumber); + // Add more ignorable entries. + edit.DeleteWalsBefore(100); + // Add unignorable entry. + edit.SetNextFile(kNextFileNumber); + // Add more ignorable entries. + edit.SetFullHistoryTsLow("ts"); + // Add unignorable entry. + edit.SetColumnFamily(kColumnFamilyId); + + std::string encoded; + ASSERT_TRUE(edit.EncodeTo(&encoded)); + + VersionEdit decoded; + ASSERT_OK(decoded.DecodeFrom(encoded)); + + // Check that all ignorable entries are ignored. 
+ ASSERT_FALSE(decoded.HasDbId()); + ASSERT_FALSE(decoded.HasFullHistoryTsLow()); + ASSERT_FALSE(decoded.IsWalAddition()); + ASSERT_FALSE(decoded.IsWalDeletion()); + ASSERT_TRUE(decoded.GetWalAdditions().empty()); + ASSERT_TRUE(decoded.GetWalDeletion().IsEmpty()); + + // Check that unignorable entries are still present in the decoded edit. + ASSERT_EQ(decoded.GetPrevLogNumber(), kPrevLogNumber); + ASSERT_EQ(decoded.GetLogNumber(), kLogNumber); + ASSERT_EQ(decoded.GetNextFile(), kNextFileNumber); + ASSERT_EQ(decoded.GetColumnFamily(), kColumnFamilyId); + + SyncPoint::GetInstance()->DisableProcessing(); +} + } // namespace ROCKSDB_NAMESPACE int main(int argc, char** argv) { From 89dd231f6af8d5c4237c3153e036643ea33128fd Mon Sep 17 00:00:00 2001 From: Cheng Chang Date: Wed, 20 Jan 2021 08:38:18 -0800 Subject: [PATCH 10/25] Update HISTORY.md --- HISTORY.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/HISTORY.md b/HISTORY.md index 89ec93a44..acb00a34c 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -1,5 +1,9 @@ # Rocksdb Change Log +## 6.16.1 (1/20/2021) +### Bug Fixes +* Version older than 6.15 cannot decode VersionEdits `WalAddition` and `WalDeletion`, fixed this by changing the encoded format of them to be ignorable by older versions. + ## 6.16.0 (12/18/2020) ### Behavior Changes * Attempting to write a merge operand without explicitly configuring `merge_operator` now fails immediately, causing the DB to enter read-only mode. Previously, failure was deferred until the `merge_operator` was needed by a user read or a background operation. 
From be40c99dda6147ad895166246a3d0290e8df8ca2 Mon Sep 17 00:00:00 2001 From: Cheng Chang Date: Wed, 20 Jan 2021 08:40:10 -0800 Subject: [PATCH 11/25] bump to 6.16.1 --- include/rocksdb/version.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/rocksdb/version.h b/include/rocksdb/version.h index bf4fe15e7..aeb604f03 100644 --- a/include/rocksdb/version.h +++ b/include/rocksdb/version.h @@ -6,7 +6,7 @@ #define ROCKSDB_MAJOR 6 #define ROCKSDB_MINOR 16 -#define ROCKSDB_PATCH 0 +#define ROCKSDB_PATCH 1 // Do not use these. We made the mistake of declaring macros starting with // double underscore. Now we have to live with our choice. We'll deprecate these From ba7c46a3d873a2595cfb68ed97bc3e2710b457bf Mon Sep 17 00:00:00 2001 From: Andrew Kryczka Date: Thu, 21 Jan 2021 08:47:06 -0800 Subject: [PATCH 12/25] workaround race conditions during `PeriodicWorkScheduler` registration (#7888) Summary: This provides a workaround for two race conditions that will be fixed in a more sophisticated way later. This PR: (1) Makes the client serialize calls to `Timer::Start()` and `Timer::Shutdown()` (see https://github.com/facebook/rocksdb/issues/7711). The long-term fix will be to make those functions thread-safe. (2) Makes `PeriodicWorkScheduler` atomically add/cancel work together with starting/shutting down its `Timer`. The long-term fix will be for `Timer` API to offer more specialized APIs so the client will not need to synchronize. 
Pull Request resolved: https://github.com/facebook/rocksdb/pull/7888 Test Plan: ran the repro provided in https://github.com/facebook/rocksdb/issues/7881 Reviewed By: jay-zhuang Differential Revision: D25990891 Pulled By: ajkr fbshipit-source-id: a97fdaebbda6d7db7ddb1b146738b68c16c5be38 --- HISTORY.md | 3 +++ db/periodic_work_scheduler.cc | 9 +++++++-- db/periodic_work_scheduler.h | 6 ++++++ util/timer.h | 3 +++ 4 files changed, 19 insertions(+), 2 deletions(-) diff --git a/HISTORY.md b/HISTORY.md index acb00a34c..2adef8d7f 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -1,4 +1,7 @@ # Rocksdb Change Log +## Unreleased +### Bug Fixes +* Fix a race condition between DB startups and shutdowns in managing the periodic background worker threads. One effect of this race condition could be the process being terminated. ## 6.16.1 (1/20/2021) ### Bug Fixes diff --git a/db/periodic_work_scheduler.cc b/db/periodic_work_scheduler.cc index cc6f714d9..da0bc1e4b 100644 --- a/db/periodic_work_scheduler.cc +++ b/db/periodic_work_scheduler.cc @@ -10,13 +10,14 @@ #ifndef ROCKSDB_LITE namespace ROCKSDB_NAMESPACE { -PeriodicWorkScheduler::PeriodicWorkScheduler(Env* env) { +PeriodicWorkScheduler::PeriodicWorkScheduler(Env* env) : timer_mu_(env) { timer = std::unique_ptr(new Timer(env)); } void PeriodicWorkScheduler::Register(DBImpl* dbi, unsigned int stats_dump_period_sec, unsigned int stats_persist_period_sec) { + MutexLock l(&timer_mu_); static std::atomic initial_delay(0); timer->Start(); if (stats_dump_period_sec > 0) { @@ -41,6 +42,7 @@ void PeriodicWorkScheduler::Register(DBImpl* dbi, } void PeriodicWorkScheduler::Unregister(DBImpl* dbi) { + MutexLock l(&timer_mu_); timer->Cancel(GetTaskName(dbi, "dump_st")); timer->Cancel(GetTaskName(dbi, "pst_st")); timer->Cancel(GetTaskName(dbi, "flush_info_log")); @@ -78,7 +80,10 @@ PeriodicWorkTestScheduler* PeriodicWorkTestScheduler::Default(Env* env) { MutexLock l(&mutex); if (scheduler.timer.get() != nullptr && 
scheduler.timer->TEST_GetPendingTaskNum() == 0) { - scheduler.timer->Shutdown(); + { + MutexLock timer_mu_guard(&scheduler.timer_mu_); + scheduler.timer->Shutdown(); + } scheduler.timer.reset(new Timer(env)); } } diff --git a/db/periodic_work_scheduler.h b/db/periodic_work_scheduler.h index 6c1ce314c..9382adc44 100644 --- a/db/periodic_work_scheduler.h +++ b/db/periodic_work_scheduler.h @@ -42,6 +42,12 @@ class PeriodicWorkScheduler { protected: std::unique_ptr timer; + // `timer_mu_` serves two purposes currently: + // (1) to ensure calls to `Start()` and `Shutdown()` are serialized, as + // they are currently not implemented in a thread-safe way; and + // (2) to ensure the `Timer::Add()`s and `Timer::Start()` run atomically, and + // the `Timer::Cancel()`s and `Timer::Shutdown()` run atomically. + port::Mutex timer_mu_; explicit PeriodicWorkScheduler(Env* env); diff --git a/util/timer.h b/util/timer.h index 8e12d7d7b..b6ee42ed0 100644 --- a/util/timer.h +++ b/util/timer.h @@ -22,6 +22,9 @@ namespace ROCKSDB_NAMESPACE { // A Timer class to handle repeated work. // +// `Start()` and `Shutdown()` are currently not thread-safe. The client must +// serialize calls to these two member functions. +// // A single timer instance can handle multiple functions via a single thread. // It is better to leave long running work to a dedicated thread pool. // From 5105db8d7bea2729df1dc793647bd1e471ff7ce7 Mon Sep 17 00:00:00 2001 From: Andrew Kryczka Date: Thu, 21 Jan 2021 12:36:23 -0800 Subject: [PATCH 13/25] update HISTORY.md and bump version for 6.16.2 --- HISTORY.md | 2 +- include/rocksdb/version.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/HISTORY.md b/HISTORY.md index 2adef8d7f..f529bbe1b 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -1,5 +1,5 @@ # Rocksdb Change Log -## Unreleased +## 6.16.2 (1/21/2021) ### Bug Fixes * Fix a race condition between DB startups and shutdowns in managing the periodic background worker threads. 
One effect of this race condition could be the process being terminated. diff --git a/include/rocksdb/version.h b/include/rocksdb/version.h index aeb604f03..7ea0cd50a 100644 --- a/include/rocksdb/version.h +++ b/include/rocksdb/version.h @@ -6,7 +6,7 @@ #define ROCKSDB_MAJOR 6 #define ROCKSDB_MINOR 16 -#define ROCKSDB_PATCH 1 +#define ROCKSDB_PATCH 2 // Do not use these. We made the mistake of declaring macros starting with // double underscore. Now we have to live with our choice. We'll deprecate these From 031368f67a1c55e3cb28ec24914d64b6001cd115 Mon Sep 17 00:00:00 2001 From: Andrew Kryczka Date: Fri, 5 Feb 2021 15:55:34 -0800 Subject: [PATCH 14/25] Allow range deletions in `*TransactionDB` only when safe (#7929) Summary: Explicitly reject all range deletions on `TransactionDB` or `OptimisticTransactionDB`, except when the user provides sufficient promises that allow us to proceed safely. The necessary promises are described in the API doc for `TransactionDB::DeleteRange()`. There is currently no way to provide enough promises to make it safe in `OptimisticTransactionDB`. Fixes https://github.com/facebook/rocksdb/issues/7913. 
Pull Request resolved: https://github.com/facebook/rocksdb/pull/7929 Test Plan: unit tests covering the cases it's permitted/rejected Reviewed By: ltamasi Differential Revision: D26240254 Pulled By: ajkr fbshipit-source-id: 2834a0ce64cc3e4c3799e35b885a5e79c2f4f6d9 --- HISTORY.md | 5 ++ .../utilities/optimistic_transaction_db.h | 2 + include/rocksdb/utilities/transaction_db.h | 11 ++++ .../optimistic_transaction_db_impl.h | 16 ++++++ .../optimistic_transaction_test.cc | 11 ++++ utilities/transactions/transaction_test.cc | 50 +++++++++++++++++++ .../transactions/write_prepared_txn_db.cc | 4 +- 7 files changed, 98 insertions(+), 1 deletion(-) diff --git a/HISTORY.md b/HISTORY.md index f529bbe1b..c787e8b7c 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -1,4 +1,9 @@ # Rocksdb Change Log +## Unreleased +### Bug Fixes +* Since 6.15.0, `TransactionDB` returns error `Status`es from calls to `DeleteRange()` and calls to `Write()` where the `WriteBatch` contains a range deletion. Previously such operations may have succeeded while not providing the expected transactional guarantees. There are certain cases where range deletion can still be used on such DBs; see the API doc on `TransactionDB::DeleteRange()` for details. +* `OptimisticTransactionDB` now returns error `Status`es from calls to `DeleteRange()` and calls to `Write()` where the `WriteBatch` contains a range deletion. Previously such operations may have succeeded while not providing the expected transactional guarantees. + ## 6.16.2 (1/21/2021) ### Bug Fixes * Fix a race condition between DB startups and shutdowns in managing the periodic background worker threads. One effect of this race condition could be the process being terminated. 
diff --git a/include/rocksdb/utilities/optimistic_transaction_db.h b/include/rocksdb/utilities/optimistic_transaction_db.h index 5356df71f..c070e49a3 100644 --- a/include/rocksdb/utilities/optimistic_transaction_db.h +++ b/include/rocksdb/utilities/optimistic_transaction_db.h @@ -51,6 +51,8 @@ struct OptimisticTransactionDBOptions { uint32_t occ_lock_buckets = (1 << 20); }; +// Range deletions (including those in `WriteBatch`es passed to `Write()`) are +// incompatible with `OptimisticTransactionDB` and will return a non-OK `Status` class OptimisticTransactionDB : public StackableDB { public: // Open an OptimisticTransactionDB similar to DB::Open(). diff --git a/include/rocksdb/utilities/transaction_db.h b/include/rocksdb/utilities/transaction_db.h index a48847cd2..c1140ecac 100644 --- a/include/rocksdb/utilities/transaction_db.h +++ b/include/rocksdb/utilities/transaction_db.h @@ -340,6 +340,17 @@ class TransactionDB : public StackableDB { // falls back to the un-optimized version of ::Write return Write(opts, updates); } + // Transactional `DeleteRange()` is not yet supported. + // However, users who know their deleted range does not conflict with + // anything can still use it via the `Write()` API. In all cases, the + // `Write()` overload specifying `TransactionDBWriteOptimizations` must be + // used and `skip_concurrency_control` must be set. When using either + // WRITE_PREPARED or WRITE_UNPREPARED , `skip_duplicate_key_check` must + // additionally be set. + virtual Status DeleteRange(const WriteOptions&, ColumnFamilyHandle*, + const Slice&, const Slice&) override { + return Status::NotSupported(); + } // Open a TransactionDB similar to DB::Open(). // Internally call PrepareWrap() and WrapDB() // If the return status is not ok, then dbptr is set to nullptr. 
diff --git a/utilities/transactions/optimistic_transaction_db_impl.h b/utilities/transactions/optimistic_transaction_db_impl.h index d895d49b8..a23d9a06d 100644 --- a/utilities/transactions/optimistic_transaction_db_impl.h +++ b/utilities/transactions/optimistic_transaction_db_impl.h @@ -46,6 +46,22 @@ class OptimisticTransactionDBImpl : public OptimisticTransactionDB { const OptimisticTransactionOptions& txn_options, Transaction* old_txn) override; + // Transactional `DeleteRange()` is not yet supported. + virtual Status DeleteRange(const WriteOptions&, ColumnFamilyHandle*, + const Slice&, const Slice&) override { + return Status::NotSupported(); + } + + // Range deletions also must not be snuck into `WriteBatch`es as they are + // incompatible with `OptimisticTransactionDB`. + virtual Status Write(const WriteOptions& write_opts, + WriteBatch* batch) override { + if (batch->HasDeleteRange()) { + return Status::NotSupported(); + } + return OptimisticTransactionDB::Write(write_opts, batch); + } + size_t GetLockBucketsSize() const { return bucketed_locks_.size(); } OccValidationPolicy GetValidatePolicy() const { return validate_policy_; } diff --git a/utilities/transactions/optimistic_transaction_test.cc b/utilities/transactions/optimistic_transaction_test.cc index ad27bd964..138823b65 100644 --- a/utilities/transactions/optimistic_transaction_test.cc +++ b/utilities/transactions/optimistic_transaction_test.cc @@ -1033,6 +1033,17 @@ TEST_P(OptimisticTransactionTest, IteratorTest) { delete txn; } +TEST_P(OptimisticTransactionTest, DeleteRangeSupportTest) { + // `OptimisticTransactionDB` does not allow range deletion in any API. 
+ ASSERT_TRUE( + txn_db + ->DeleteRange(WriteOptions(), txn_db->DefaultColumnFamily(), "a", "b") + .IsNotSupported()); + WriteBatch wb; + ASSERT_OK(wb.DeleteRange("a", "b")); + ASSERT_NOK(txn_db->Write(WriteOptions(), &wb)); +} + TEST_P(OptimisticTransactionTest, SavepointTest) { WriteOptions write_options; ReadOptions read_options, snapshot_read_options; diff --git a/utilities/transactions/transaction_test.cc b/utilities/transactions/transaction_test.cc index 9c4ce5604..27f9504e0 100644 --- a/utilities/transactions/transaction_test.cc +++ b/utilities/transactions/transaction_test.cc @@ -4835,6 +4835,56 @@ TEST_P(TransactionTest, MergeTest) { ASSERT_EQ("a,3", value); } +TEST_P(TransactionTest, DeleteRangeSupportTest) { + // The `DeleteRange()` API is banned everywhere. + ASSERT_TRUE( + db->DeleteRange(WriteOptions(), db->DefaultColumnFamily(), "a", "b") + .IsNotSupported()); + + // But range deletions can be added via the `Write()` API by specifying the + // proper flags to promise there are no conflicts according to the DB type + // (see `TransactionDB::DeleteRange()` API doc for details). 
+ for (bool skip_concurrency_control : {false, true}) { + for (bool skip_duplicate_key_check : {false, true}) { + ASSERT_OK(db->Put(WriteOptions(), "a", "val")); + WriteBatch wb; + ASSERT_OK(wb.DeleteRange("a", "b")); + TransactionDBWriteOptimizations flags; + flags.skip_concurrency_control = skip_concurrency_control; + flags.skip_duplicate_key_check = skip_duplicate_key_check; + Status s = db->Write(WriteOptions(), flags, &wb); + std::string value; + switch (txn_db_options.write_policy) { + case WRITE_COMMITTED: + if (skip_concurrency_control) { + ASSERT_OK(s); + ASSERT_TRUE(db->Get(ReadOptions(), "a", &value).IsNotFound()); + } else { + ASSERT_NOK(s); + ASSERT_OK(db->Get(ReadOptions(), "a", &value)); + } + break; + case WRITE_PREPARED: + // Intentional fall-through + case WRITE_UNPREPARED: + if (skip_concurrency_control && skip_duplicate_key_check) { + ASSERT_OK(s); + ASSERT_TRUE(db->Get(ReadOptions(), "a", &value).IsNotFound()); + } else { + ASSERT_NOK(s); + ASSERT_OK(db->Get(ReadOptions(), "a", &value)); + } + break; + } + // Without any promises from the user, range deletion via other `Write()` + // APIs are still banned. 
+ ASSERT_OK(db->Put(WriteOptions(), "a", "val")); + ASSERT_NOK(db->Write(WriteOptions(), &wb)); + ASSERT_OK(db->Get(ReadOptions(), "a", &value)); + } + } +} + TEST_P(TransactionTest, DeferSnapshotTest) { WriteOptions write_options; ReadOptions read_options; diff --git a/utilities/transactions/write_prepared_txn_db.cc b/utilities/transactions/write_prepared_txn_db.cc index af0df6604..167d2e80c 100644 --- a/utilities/transactions/write_prepared_txn_db.cc +++ b/utilities/transactions/write_prepared_txn_db.cc @@ -157,7 +157,9 @@ Status WritePreparedTxnDB::WriteInternal(const WriteOptions& write_options_orig, // TODO(myabandeh): add an option to allow user skipping this cost SubBatchCounter counter(*GetCFComparatorMap()); auto s = batch->Iterate(&counter); - assert(s.ok()); + if (!s.ok()) { + return s; + } batch_cnt = counter.BatchCount(); WPRecordTick(TXN_DUPLICATE_KEY_OVERHEAD); ROCKS_LOG_DETAILS(info_log_, "Duplicate key overhead: %" PRIu64 " batches", From e32a64aa547ad7f17af74648147e521acbc21d3f Mon Sep 17 00:00:00 2001 From: Andrew Kryczka Date: Fri, 5 Feb 2021 17:09:36 -0800 Subject: [PATCH 15/25] bump version and update HISTORY.md for 6.16.3 --- HISTORY.md | 2 +- include/rocksdb/version.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/HISTORY.md b/HISTORY.md index c787e8b7c..e56abbeb7 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -1,5 +1,5 @@ # Rocksdb Change Log -## Unreleased +## 6.16.3 (02/05/2021) ### Bug Fixes * Since 6.15.0, `TransactionDB` returns error `Status`es from calls to `DeleteRange()` and calls to `Write()` where the `WriteBatch` contains a range deletion. Previously such operations may have succeeded while not providing the expected transactional guarantees. There are certain cases where range deletion can still be used on such DBs; see the API doc on `TransactionDB::DeleteRange()` for details. 
* `OptimisticTransactionDB` now returns error `Status`es from calls to `DeleteRange()` and calls to `Write()` where the `WriteBatch` contains a range deletion. Previously such operations may have succeeded while not providing the expected transactional guarantees. diff --git a/include/rocksdb/version.h b/include/rocksdb/version.h index 7ea0cd50a..4d78333f2 100644 --- a/include/rocksdb/version.h +++ b/include/rocksdb/version.h @@ -6,7 +6,7 @@ #define ROCKSDB_MAJOR 6 #define ROCKSDB_MINOR 16 -#define ROCKSDB_PATCH 2 +#define ROCKSDB_PATCH 3 // Do not use these. We made the mistake of declaring macros starting with // double underscore. Now we have to live with our choice. We'll deprecate these From 5884cf5d5b22fc711aa16006542a3d3c67163fe4 Mon Sep 17 00:00:00 2001 From: Adam Retter Date: Mon, 29 Mar 2021 16:31:26 -0700 Subject: [PATCH 16/25] range_tree requires GNU libc on ppc64 (#8070) Summary: If the platform is ppc64 and the libc is not GNU libc, then we exclude the range_tree from compilation. 
See https://jira.percona.com/browse/PS-7559 Pull Request resolved: https://github.com/facebook/rocksdb/pull/8070 Reviewed By: jay-zhuang Differential Revision: D27246004 Pulled By: mrambacher fbshipit-source-id: 59d8433242ce7ce608988341becb4f83312445f5 --- Makefile | 6 ++++++ buckifier/buckify_rocksdb.py | 4 ++++ build_tools/build_detect_platform | 20 ++++++++++++++++++++ src.mk | 26 ++++++++++++++------------ 4 files changed, 44 insertions(+), 12 deletions(-) diff --git a/Makefile b/Makefile index 7c865532c..000f3ac2c 100644 --- a/Makefile +++ b/Makefile @@ -510,6 +510,12 @@ ifeq ($(USE_FOLLY_DISTRIBUTED_MUTEX),1) LIB_OBJECTS += $(patsubst %.cpp, $(OBJ_DIR)/%.o, $(FOLLY_SOURCES)) endif +# range_tree is not compatible with non GNU libc on ppc64 +# see https://jira.percona.com/browse/PS-7559 +ifneq ($(PPC_LIBC_IS_GNU),0) + LIB_OBJECTS += $(patsubst %.cc, $(OBJ_DIR)/%.o, $(RANGE_TREE_SOURCES)) +endif + GTEST = $(OBJ_DIR)/$(GTEST_DIR)/gtest/gtest-all.o TESTUTIL = $(OBJ_DIR)/test_util/testutil.o TESTHARNESS = $(OBJ_DIR)/test_util/testharness.o $(TESTUTIL) $(GTEST) diff --git a/buckifier/buckify_rocksdb.py b/buckifier/buckify_rocksdb.py index f0909bc61..aac33a244 100644 --- a/buckifier/buckify_rocksdb.py +++ b/buckifier/buckify_rocksdb.py @@ -135,11 +135,15 @@ def generate_targets(repo_path, deps_map): TARGETS.add_library( "rocksdb_lib", src_mk["LIB_SOURCES"] + + # always add range_tree, it's only excluded on ppc64, which we don't use internally + src_mk["RANGE_TREE_SOURCES"] + src_mk["TOOL_LIB_SOURCES"]) # rocksdb_whole_archive_lib TARGETS.add_library( "rocksdb_whole_archive_lib", src_mk["LIB_SOURCES"] + + # always add range_tree, it's only excluded on ppc64, which we don't use internally + src_mk["RANGE_TREE_SOURCES"] + src_mk["TOOL_LIB_SOURCES"], deps=None, headers=None, diff --git a/build_tools/build_detect_platform b/build_tools/build_detect_platform index a686ab06d..3c219ec12 100755 --- a/build_tools/build_detect_platform +++ b/build_tools/build_detect_platform @@ 
-663,6 +663,23 @@ else fi fi +if test -n "`echo $TARGET_ARCHITECTURE | grep ^ppc64`"; then + # check for GNU libc on ppc64 + $CXX -x c++ - -o /dev/null 2>/dev/null < + #include + #include + + int main(int argc, char *argv[]) { + printf("GNU libc version: %s\n", gnu_get_libc_version()); + return 0; + } +EOF + if [ "$?" != 0 ]; then + PPC_LIBC_IS_GNU=0 + fi +fi + if test "$TRY_SSE_ETC"; then # The USE_SSE flag now means "attempt to compile with widely-available # Intel architecture extensions utilized by specific optimizations in the @@ -856,3 +873,6 @@ echo "LUA_PATH=$LUA_PATH" >> "$OUTPUT" if test -n "$USE_FOLLY_DISTRIBUTED_MUTEX"; then echo "USE_FOLLY_DISTRIBUTED_MUTEX=$USE_FOLLY_DISTRIBUTED_MUTEX" >> "$OUTPUT" fi +if test -n "$PPC_LIBC_IS_GNU"; then + echo "PPC_LIBC_IS_GNU=$PPC_LIBC_IS_GNU" >> "$OUTPUT" +fi diff --git a/src.mk b/src.mk index 2f8077d5b..f03bada34 100644 --- a/src.mk +++ b/src.mk @@ -255,18 +255,6 @@ LIB_SOURCES = \ utilities/transactions/lock/lock_manager.cc \ utilities/transactions/lock/point/point_lock_tracker.cc \ utilities/transactions/lock/point/point_lock_manager.cc \ - utilities/transactions/lock/range/range_tree/lib/locktree/concurrent_tree.cc \ - utilities/transactions/lock/range/range_tree/lib/locktree/keyrange.cc \ - utilities/transactions/lock/range/range_tree/lib/locktree/lock_request.cc \ - utilities/transactions/lock/range/range_tree/lib/locktree/locktree.cc \ - utilities/transactions/lock/range/range_tree/lib/locktree/manager.cc \ - utilities/transactions/lock/range/range_tree/lib/locktree/range_buffer.cc \ - utilities/transactions/lock/range/range_tree/lib/locktree/treenode.cc \ - utilities/transactions/lock/range/range_tree/lib/locktree/txnid_set.cc \ - utilities/transactions/lock/range/range_tree/lib/locktree/wfg.cc \ - utilities/transactions/lock/range/range_tree/lib/standalone_port.cc \ - utilities/transactions/lock/range/range_tree/lib/util/dbt.cc \ - utilities/transactions/lock/range/range_tree/lib/util/memarena.cc \ 
utilities/transactions/optimistic_transaction.cc \ utilities/transactions/optimistic_transaction_db_impl.cc \ utilities/transactions/pessimistic_transaction.cc \ @@ -298,6 +286,20 @@ LIB_SOURCES_ASM = LIB_SOURCES_C = endif +RANGE_TREE_SOURCES =\ + utilities/transactions/lock/range/range_tree/lib/locktree/concurrent_tree.cc \ + utilities/transactions/lock/range/range_tree/lib/locktree/keyrange.cc \ + utilities/transactions/lock/range/range_tree/lib/locktree/lock_request.cc \ + utilities/transactions/lock/range/range_tree/lib/locktree/locktree.cc \ + utilities/transactions/lock/range/range_tree/lib/locktree/manager.cc \ + utilities/transactions/lock/range/range_tree/lib/locktree/range_buffer.cc \ + utilities/transactions/lock/range/range_tree/lib/locktree/treenode.cc \ + utilities/transactions/lock/range/range_tree/lib/locktree/txnid_set.cc \ + utilities/transactions/lock/range/range_tree/lib/locktree/wfg.cc \ + utilities/transactions/lock/range/range_tree/lib/standalone_port.cc \ + utilities/transactions/lock/range/range_tree/lib/util/dbt.cc \ + utilities/transactions/lock/range/range_tree/lib/util/memarena.cc + TOOL_LIB_SOURCES = \ tools/io_tracer_parser_tool.cc \ tools/ldb_cmd.cc \ From 9857d45f37d9c47e40f933f7d4502d2746642f8b Mon Sep 17 00:00:00 2001 From: Adam Retter Date: Thu, 21 Jan 2021 08:33:55 -0800 Subject: [PATCH 17/25] Fix compilation against musl lib C (#7875) Summary: See https://github.com/percona/PerconaFT/pull/450 Pull Request resolved: https://github.com/facebook/rocksdb/pull/7875 Reviewed By: ajkr Differential Revision: D25938020 Pulled By: jay-zhuang fbshipit-source-id: 9014dbc7b23bf92c5e63bfbdda4565bb0d2f2b58 --- .../lock/range/range_tree/lib/portability/toku_pthread.h | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/utilities/transactions/lock/range/range_tree/lib/portability/toku_pthread.h b/utilities/transactions/lock/range/range_tree/lib/portability/toku_pthread.h index bd1cc8e6c..571b950e1 100644 --- 
a/utilities/transactions/lock/range/range_tree/lib/portability/toku_pthread.h +++ b/utilities/transactions/lock/range/range_tree/lib/portability/toku_pthread.h @@ -153,7 +153,12 @@ typedef struct toku_mutex_aligned { { .pmutex = PTHREAD_MUTEX_INITIALIZER, .psi_mutex = nullptr } #endif // defined(TOKU_PTHREAD_DEBUG) #else // __FreeBSD__, __linux__, at least +#if defined(__GLIBC__) #define TOKU_MUTEX_ADAPTIVE PTHREAD_MUTEX_ADAPTIVE_NP +#else +// not all libc (e.g. musl) implement NP (Non-POSIX) attributes +#define TOKU_MUTEX_ADAPTIVE PTHREAD_MUTEX_DEFAULT +#endif #if defined(TOKU_PTHREAD_DEBUG) #define TOKU_ADAPTIVE_MUTEX_INITIALIZER \ { \ From 9d95050ca7f7e336678b3c94d8bb06a6c356f687 Mon Sep 17 00:00:00 2001 From: Jay Zhuang Date: Tue, 30 Mar 2021 15:54:11 -0700 Subject: [PATCH 18/25] Bump version and update HISTORY.md for 6.16.4 --- HISTORY.md | 4 ++++ include/rocksdb/version.h | 2 +- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/HISTORY.md b/HISTORY.md index e56abbeb7..74bb3d4f8 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -1,4 +1,8 @@ # Rocksdb Change Log +## 6.16.4 (03/30/2021) +### Bug Fixes +* Fix build on ppc64 and musl build. + ## 6.16.3 (02/05/2021) ### Bug Fixes * Since 6.15.0, `TransactionDB` returns error `Status`es from calls to `DeleteRange()` and calls to `Write()` where the `WriteBatch` contains a range deletion. Previously such operations may have succeeded while not providing the expected transactional guarantees. There are certain cases where range deletion can still be used on such DBs; see the API doc on `TransactionDB::DeleteRange()` for details. diff --git a/include/rocksdb/version.h b/include/rocksdb/version.h index 4d78333f2..6a07989fe 100644 --- a/include/rocksdb/version.h +++ b/include/rocksdb/version.h @@ -6,7 +6,7 @@ #define ROCKSDB_MAJOR 6 #define ROCKSDB_MINOR 16 -#define ROCKSDB_PATCH 3 +#define ROCKSDB_PATCH 4 // Do not use these. We made the mistake of declaring macros starting with // double underscore. 
Now we have to live with our choice. We'll deprecate these From ef2c5c03a61ba984d01cc6d51d7340453395c96b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Simon=20Gra=CC=88tzer?= Date: Thu, 4 Feb 2021 15:38:56 +0100 Subject: [PATCH 19/25] Stardog Patches: WriteBatch + WriteBufferManager --- include/rocksdb/write_batch.h | 4 ++++ include/rocksdb/write_buffer_manager.h | 6 +++++- 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/include/rocksdb/write_batch.h b/include/rocksdb/write_batch.h index 51fd4d8ac..dfadeca8a 100644 --- a/include/rocksdb/write_batch.h +++ b/include/rocksdb/write_batch.h @@ -173,6 +173,10 @@ class WriteBatch : public WriteBatchBase { // Otherwise returns Status::OK(). Status PopSavePoint() override; + void setContentFlag(uint32_t theContentFlag) { + content_flags_.store(theContentFlag,std::memory_order_seq_cst); + } + // Support for iterating over the contents of a batch. class Handler { public: diff --git a/include/rocksdb/write_buffer_manager.h b/include/rocksdb/write_buffer_manager.h index ae1c98caf..22b69abb9 100644 --- a/include/rocksdb/write_buffer_manager.h +++ b/include/rocksdb/write_buffer_manager.h @@ -87,8 +87,12 @@ class WriteBufferManager { } } + void SetBufferSize(size_t new_size) { + buffer_size_ = new_size; + } + private: - const size_t buffer_size_; + std::atomic<size_t> buffer_size_; const size_t mutable_limit_; std::atomic<size_t> memory_used_; // Memory that hasn't been scheduled to free.
From 969d8a5363478fc61e88454eac5bda93ffa5c8d0 Mon Sep 17 00:00:00 2001 From: "U-EC2AMAZ-AK1IV17\\Administrator" Date: Thu, 18 Feb 2021 20:26:09 +0000 Subject: [PATCH 20/25] Remove special mingw handling, does not apply for msys2 --- CMakeLists.txt | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index ed717c2b4..f60495da3 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -209,7 +209,7 @@ else() set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -W -Wextra -Wall") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wsign-compare -Wshadow -Wno-unused-parameter -Wno-unused-variable -Woverloaded-virtual -Wnon-virtual-dtor -Wno-missing-field-initializers -Wno-strict-aliasing") if(MINGW) - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-format -fno-asynchronous-unwind-tables") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-format") add_definitions(-D_POSIX_C_SOURCE=1) endif() if(NOT CMAKE_BUILD_TYPE STREQUAL "Debug") @@ -869,12 +869,12 @@ if(WIN32) port/win/env_default.cc port/win/port_win.cc port/win/win_logger.cc) - if(NOT MINGW) +# if(NOT MINGW) # Mingw only supports std::thread when using # posix threads. 
list(APPEND SOURCES port/win/win_thread.cc) - endif() +# endif() if(WITH_XPRESS) list(APPEND SOURCES port/win/xpress_win.cc) From f7ef2361029a66ed4a198105e6371688698cbe52 Mon Sep 17 00:00:00 2001 From: Matthew Von-Maszewski Date: Fri, 26 Feb 2021 15:31:48 -0500 Subject: [PATCH 21/25] Update README.md --- README.md | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/README.md b/README.md index ece43cd1b..958ea1ec9 100644 --- a/README.md +++ b/README.md @@ -1,3 +1,8 @@ +## Stardog Notes + +* release branch based upon RocksDB 6.x is "stardog/develop" +* release branch based upon RocksDB 5.x is "stardog-5.x-releases" + ## RocksDB: A Persistent Key-Value Store for Flash and RAM Storage [![CircleCI Status](https://circleci.com/gh/facebook/rocksdb.svg?style=svg)](https://circleci.com/gh/facebook/rocksdb) From 54a9cb464dca91deede6a2bfaadd990341356bf1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Simon=20Gra=CC=88tzer?= Date: Thu, 8 Apr 2021 22:50:33 +0200 Subject: [PATCH 22/25] Create DB properties for internal table cache --- db/db_properties_test.cc | 49 +++++++++++++++++++++++++++++++++++++--- db/internal_stats.cc | 24 ++++++++++++++++++++ db/internal_stats.h | 23 +++++++++++-------- include/rocksdb/db.h | 9 ++++++++ 4 files changed, 92 insertions(+), 13 deletions(-) diff --git a/db/db_properties_test.cc b/db/db_properties_test.cc index d3333fa93..8f748c5b9 100644 --- a/db/db_properties_test.cc +++ b/db/db_properties_test.cc @@ -245,7 +245,8 @@ void GetExpectedTableProperties( const int kDeletionCount = kTableCount * kDeletionsPerTable; const int kMergeCount = kTableCount * kMergeOperandsPerTable; const int kRangeDeletionCount = kTableCount * kRangeDeletionsPerTable; - const int kKeyCount = kPutCount + kDeletionCount + kMergeCount + kRangeDeletionCount; + const int kKeyCount = + kPutCount + kDeletionCount + kMergeCount + kRangeDeletionCount; const int kAvgSuccessorSize = kKeySize / 5; const int kEncodingSavePerKey = kKeySize / 4; expected_tp->raw_key_size = kKeyCount * 
(kKeySize + 8); @@ -256,7 +257,8 @@ void GetExpectedTableProperties( expected_tp->num_merge_operands = kMergeCount; expected_tp->num_range_deletions = kRangeDeletionCount; expected_tp->num_data_blocks = - kTableCount * (kKeysPerTable * (kKeySize - kEncodingSavePerKey + kValueSize)) / + kTableCount * + (kKeysPerTable * (kKeySize - kEncodingSavePerKey + kValueSize)) / kBlockSize; expected_tp->data_size = kTableCount * (kKeysPerTable * (kKeySize + 8 + kValueSize)); @@ -1090,7 +1092,8 @@ class CountingUserTblPropCollector : public TablePropertiesCollector { std::string encoded; PutVarint32(&encoded, count_); *properties = UserCollectedProperties{ - {"CountingUserTblPropCollector", message_}, {"Count", encoded}, + {"CountingUserTblPropCollector", message_}, + {"Count", encoded}, }; return Status::OK(); } @@ -1713,6 +1716,46 @@ TEST_F(DBPropertiesTest, BlockCacheProperties) { ASSERT_EQ(0, value); } +TEST_F(DBPropertiesTest, TableCacheProperties) { + Options options; + uint64_t value, new_value; + + options.env = CurrentOptions().env; + + Reopen(options); + + // + // test table_cache access is "live" + // TableCacheCapacity originally comes from DBOptions::max_open_files + // and can vary by system. Get its current value. + ASSERT_TRUE(db_->GetIntProperty(DB::Properties::kTableCacheCapacity, &value)); + + // now, change max_open_files to prove we are really accessing the value of + // interest + new_value = value / 2; + std::unordered_map<std::string, std::string> new_options; + new_options.insert(std::pair<std::string, std::string>( + "max_open_files", std::to_string(new_value))); + ASSERT_OK(db_->SetDBOptions(new_options)); + + // did the value we are reading update? NOTE: rocksdb internally reduces + // the value we pass by 10. + ASSERT_TRUE(db_->GetIntProperty(DB::Properties::kTableCacheCapacity, &value)); + ASSERT_EQ(new_value - 10, value); + + // + // TableCacheUsage is a count of open .sst files. Force the creation of + // a new table file. First add a record via Put().
Then force that + // record from write buffer to new .sst via Flush(). New .sst + // automatically opens and gets position in table cache ... raising usage + // count + ASSERT_TRUE(db_->GetIntProperty(DB::Properties::kTableCacheUsage, &value)); + ASSERT_OK(Put("foo", "v1")); + ASSERT_OK(db_->Flush(FlushOptions())); + ASSERT_TRUE( + db_->GetIntProperty(DB::Properties::kTableCacheUsage, &new_value)); + ASSERT_EQ(new_value, value + 1); +} #endif // ROCKSDB_LITE } // namespace ROCKSDB_NAMESPACE diff --git a/db/internal_stats.cc b/db/internal_stats.cc index 512bc1b01..b1169a7ff 100644 --- a/db/internal_stats.cc +++ b/db/internal_stats.cc @@ -260,6 +260,8 @@ static const std::string estimate_oldest_key_time = "estimate-oldest-key-time"; static const std::string block_cache_capacity = "block-cache-capacity"; static const std::string block_cache_usage = "block-cache-usage"; static const std::string block_cache_pinned_usage = "block-cache-pinned-usage"; +static const std::string tablecache_capacity = "table-cache-capacity"; +static const std::string tablecache_usage = "table-cache-usage"; static const std::string options_statistics = "options-statistics"; const std::string DB::Properties::kNumFilesAtLevelPrefix = @@ -348,6 +350,10 @@ const std::string DB::Properties::kBlockCacheUsage = rocksdb_prefix + block_cache_usage; const std::string DB::Properties::kBlockCachePinnedUsage = rocksdb_prefix + block_cache_pinned_usage; +const std::string DB::Properties::kTableCacheCapacity = + rocksdb_prefix + tablecache_capacity; +const std::string DB::Properties::kTableCacheUsage = + rocksdb_prefix + tablecache_usage; const std::string DB::Properties::kOptionsStatistics = rocksdb_prefix + options_statistics; @@ -487,6 +493,12 @@ const std::unordered_map {DB::Properties::kBlockCachePinnedUsage, {false, nullptr, &InternalStats::HandleBlockCachePinnedUsage, nullptr, nullptr}}, + {DB::Properties::kTableCacheCapacity, + {false, nullptr, &InternalStats::HandleTableCacheCapacity, nullptr, + 
nullptr}}, + {DB::Properties::kTableCacheUsage, + {false, nullptr, &InternalStats::HandleTableCacheUsage, nullptr, + nullptr}}, {DB::Properties::kOptionsStatistics, {false, nullptr, nullptr, nullptr, &DBImpl::GetPropertyHandleOptionsStatistics}}, @@ -987,6 +999,18 @@ bool InternalStats::HandleBlockCachePinnedUsage(uint64_t* value, DBImpl* /*db*/, return true; } +bool InternalStats::HandleTableCacheCapacity(uint64_t* value, DBImpl* db, + Version* /*version*/) { + *value = static_cast<uint64_t>(db->table_cache_->GetCapacity()); + return true; +} + +bool InternalStats::HandleTableCacheUsage(uint64_t* value, DBImpl* db, + Version* /*version*/) { + *value = static_cast<uint64_t>(db->table_cache_->GetUsage()); + return true; +} + void InternalStats::DumpDBStats(std::string* value) { char buf[1000]; // DB-level stats, only available from default column family diff --git a/db/internal_stats.h b/db/internal_stats.h index 056719c5c..d63ef0a82 100644 --- a/db/internal_stats.h +++ b/db/internal_stats.h @@ -296,7 +296,7 @@ class InternalStats { this->num_dropped_records += c.num_dropped_records; this->count += c.count; int num_of_reasons = static_cast<int>(CompactionReason::kNumOfReasons); - for (int i = 0; i< num_of_reasons; i++) { + for (int i = 0; i < num_of_reasons; i++) { counts[i] += c.counts[i]; } } @@ -437,8 +437,8 @@ class InternalStats { struct CFStatsSnapshot { // ColumnFamily-level stats CompactionStats comp_stats; - uint64_t ingest_bytes_flush; // Bytes written to L0 (Flush) - uint64_t stall_count; // Stall count + uint64_t ingest_bytes_flush; // Bytes written to L0 (Flush) + uint64_t stall_count; // Stall count // Stats from compaction jobs - bytes written, bytes read, duration.
uint64_t compact_bytes_write; uint64_t compact_bytes_read; @@ -480,10 +480,10 @@ class InternalStats { struct DBStatsSnapshot { // DB-level stats - uint64_t ingest_bytes; // Bytes written by user - uint64_t wal_bytes; // Bytes written to WAL - uint64_t wal_synced; // Number of times WAL is synced - uint64_t write_with_wal; // Number of writes that request WAL + uint64_t ingest_bytes; // Bytes written by user + uint64_t wal_bytes; // Bytes written to WAL + uint64_t wal_synced; // Number of times WAL is synced + uint64_t write_with_wal; // Number of writes that request WAL // These count the number of writes processed by the calling thread or // another thread. uint64_t write_other; @@ -594,6 +594,8 @@ class InternalStats { bool HandleBlockCacheUsage(uint64_t* value, DBImpl* db, Version* version); bool HandleBlockCachePinnedUsage(uint64_t* value, DBImpl* db, Version* version); + bool HandleTableCacheCapacity(uint64_t* value, DBImpl* db, Version* version); + bool HandleTableCacheUsage(uint64_t* value, DBImpl* db, Version* version); // Total number of background errors encountered. Every time a flush task // or compaction task fails, this counter is incremented. The failure can // be caused by any possible reason, including file system errors, out of @@ -697,13 +699,14 @@ class InternalStats { return false; } - bool GetIntProperty(const DBPropertyInfo& /*property_info*/, uint64_t* /*value*/, - DBImpl* /*db*/) const { + bool GetIntProperty(const DBPropertyInfo& /*property_info*/, + uint64_t* /*value*/, DBImpl* /*db*/) const { return false; } bool GetIntPropertyOutOfMutex(const DBPropertyInfo& /*property_info*/, - Version* /*version*/, uint64_t* /*value*/) const { + Version* /*version*/, + uint64_t* /*value*/) const { return false; } }; diff --git a/include/rocksdb/db.h b/include/rocksdb/db.h index 995d9f0f1..9a4cd0a0d 100644 --- a/include/rocksdb/db.h +++ b/include/rocksdb/db.h @@ -923,6 +923,13 @@ class DB { // entries being pinned. 
static const std::string kBlockCachePinnedUsage; + // "rocksdb.table-cache-capacity" - returns table cache capacity. + static const std::string kTableCacheCapacity; + + // "rocksdb.table-cache-usage" - returns the number of entries (open table + // files) residing in table cache. + static const std::string kTableCacheUsage; + // "rocksdb.options-statistics" - returns multi-line string // of options.statistics static const std::string kOptionsStatistics; @@ -986,6 +993,8 @@ class DB { // "rocksdb.block-cache-capacity" // "rocksdb.block-cache-usage" // "rocksdb.block-cache-pinned-usage" + // "rocksdb.table-cache-capacity" + // "rocksdb.table-cache-usage" virtual bool GetIntProperty(ColumnFamilyHandle* column_family, const Slice& property, uint64_t* value) = 0; virtual bool GetIntProperty(const Slice& property, uint64_t* value) { From d924a50ca25bf69da59945a4a0bc6783dc5397c5 Mon Sep 17 00:00:00 2001 From: Matthew Von-Maszewski Date: Thu, 10 Jun 2021 13:28:34 -0400 Subject: [PATCH 23/25] Port Facebook PR8370 to Stardog: GetFreeSpace wrong if not root (#28) --- env/fs_posix.cc | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/env/fs_posix.cc b/env/fs_posix.cc index c38c62811..bd290a89b 100644 --- a/env/fs_posix.cc +++ b/env/fs_posix.cc @@ -870,7 +870,17 @@ class PosixFileSystem : public FileSystem { return IOError("While doing statvfs", fname, errno); } - *free_space = ((uint64_t)sbuf.f_bsize * sbuf.f_bfree); + // sbuf.f_bfree is total free blocks available to root + // sbuf.f_bavail is total free blocks available to unprivileged user + // sbuf.f_bavail <= sbuf.f_bfree ... pick based upon effective user id + if (geteuid()) { + // non-zero user is unprivileged, or -1 if error.
take more conservative + // size + *free_space = ((uint64_t)sbuf.f_bsize * sbuf.f_bavail); + } else { + // root user can access all disk space + *free_space = ((uint64_t)sbuf.f_bsize * sbuf.f_bfree); + } return IOStatus::OK(); } From d20dd134509c65c09c2b89c9ca520d718819968e Mon Sep 17 00:00:00 2001 From: Matthew Von-Maszewski Date: Tue, 15 Jun 2021 13:59:08 -0400 Subject: [PATCH 24/25] Quick hack to remove table cache pinning while awaiting guidance from Facebook (#29) --- db/version_builder.cc | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/db/version_builder.cc b/db/version_builder.cc index 44229eefc..bece59686 100644 --- a/db/version_builder.cc +++ b/db/version_builder.cc @@ -969,11 +969,24 @@ class VersionBuilder::Rep { true /* record_read_stats */, internal_stats->GetFileReadHist(level), false, level, prefetch_index_and_filter_in_cache, max_file_size_for_l0_meta_pin); + + // the code here is attempting two things: + // 1. preload / warm the table cache with new file objects + // 2. create higher performance via a cache lookup avoidance + // The issue is that number 2 creates permanent objects in the + // table cache which over time are no longer useful. The code + // adjustment below keeps #1 and disables #2. 
+#if 0 if (file_meta->table_reader_handle != nullptr) { // Load table_reader file_meta->fd.table_reader = table_cache_->GetTableReaderFromHandle( file_meta->table_reader_handle); } +#else + table_cache_->ReleaseHandle(file_meta->table_reader_handle); + file_meta->table_reader_handle = nullptr; +#endif + } }); From c5f66de1109b552ce57bf406e34d3fd714c44c23 Mon Sep 17 00:00:00 2001 From: Matthew Von-Maszewski Date: Tue, 6 Jul 2021 15:19:34 -0400 Subject: [PATCH 25/25] BugFix: fix merge helper clock usage and rehack LoadTableHandlers (#30) --- db/merge_helper.cc | 4 +++- db/version_builder.cc | 19 ++++++++++++------- 2 files changed, 15 insertions(+), 8 deletions(-) diff --git a/db/merge_helper.cc b/db/merge_helper.cc index ed2646ea1..6c8fbdd70 100644 --- a/db/merge_helper.cc +++ b/db/merge_helper.cc @@ -414,7 +414,9 @@ CompactionFilter::Decision MergeHelper::FilterMerge(const Slice& user_key, kValueTypeForSeek); } } - total_filter_time_ += filter_timer_.ElapsedNanosSafe(); + if (stats_ != nullptr && ShouldReportDetailedTime(env_, stats_)) { + total_filter_time_ += filter_timer_.ElapsedNanosSafe(); + } return ret; } diff --git a/db/version_builder.cc b/db/version_builder.cc index bece59686..23478828b 100644 --- a/db/version_builder.cc +++ b/db/version_builder.cc @@ -904,6 +904,11 @@ class VersionBuilder::Rep { size_t table_cache_capacity = table_cache_->get_cache()->GetCapacity(); bool always_load = (table_cache_capacity == TableCache::kInfiniteCapacity); size_t max_load = port::kMaxSizet; +#ifndef NDEBUG + bool debug_override = true; // to enable CompactedDB related tests and some property tests +#else + bool debug_override = false; +#endif if (!always_load) { // If it is initial loading and not set to always loading all the @@ -976,16 +981,16 @@ class VersionBuilder::Rep { // The issue is that number 2 creates permanent objects in the // table cache which over time are no longer useful. The code // adjustment below keeps #1 and disables #2. 
-#if 0 if (file_meta->table_reader_handle != nullptr) { // Load table_reader - file_meta->fd.table_reader = table_cache_->GetTableReaderFromHandle( - file_meta->table_reader_handle); + if (always_load || debug_override) { + file_meta->fd.table_reader = table_cache_->GetTableReaderFromHandle( + file_meta->table_reader_handle); + } else { + table_cache_->ReleaseHandle(file_meta->table_reader_handle); + file_meta->table_reader_handle = nullptr; + } // else } -#else - table_cache_->ReleaseHandle(file_meta->table_reader_handle); - file_meta->table_reader_handle = nullptr; -#endif } });