From 5fc2e2be7fe2fb8ba7765e24ab623d3e6993b72f Mon Sep 17 00:00:00 2001
From: Viktor Kurilko
Date: Fri, 13 Oct 2023 12:13:37 +0300
Subject: [PATCH 01/22] Fix deadlock in case of the bottleneck HashJoin with
 shared scan below (#620)

Disabling the prefetch_inner mechanism for HashJoin when the join is on a
bottleneck could lead to a deadlock if there is a SharedScan in both parts
of the join. The deadlock occurred because the SharedScan producer was in
the inner part of the join and the consumer in the outer, so the consumer
couldn't perform the shared scan without prefetching the inner part.

The SharedScan producer for a HashJoin is always created under the inner
part, because the shareinput_walker function always processes a HashJoin
node starting from the Hash part.

For CTEs prefetching was enabled in all cases except the bottleneck join,
because a CTE path is marked with motionHazard. Therefore, if
prefetch_inner is disabled, the executor can start performing the join
from the outer part while the SharedScan producer is inside the inner
part, which will likely lead to a deadlock if the SharedScan is
cross-slice.

This patch fixes the problem by enabling prefetch_inner for hash joins
located on a bottleneck when traversing the plan in
shareinput_mutator_dag_to_tree, provided there are shared scans in the
plan.
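The ordering dependency behind the deadlock can be modeled in a few lines of
Python. The following toy sketch is illustrative only (every name is invented;
this is not GPDB code): the consumer side hangs unless the inner, producer
side runs first, which is exactly what prefetch_inner guarantees.

    # Toy model of a cross-slice Shared Scan: the consumer can make
    # progress only after the producer has materialized the shared result.
    shared = []                      # stands in for the shared tuple store

    def run_inner_hash_side():       # the Shared Scan *producer* lives here
        shared.append('materialized CTE rows')

    def run_outer_side():            # the Shared Scan *consumer* lives here
        if not shared:
            raise RuntimeError('would block forever: producer never ran')
        return shared[0]

    # prefetch_inner = true: the executor drains the inner (Hash) side
    # first, so the producer runs before the consumer asks for the data.
    run_inner_hash_side()
    print(run_outer_side())

    # With prefetch_inner disabled the executor may pull from the outer
    # side first; in the real cross-slice case the consumer blocks instead
    # of raising, which is the deadlock described above.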
---
 src/backend/cdb/cdbmutate.c        | 17 +++++++++++++++++
 src/test/regress/expected/with.out | 19 +++++++++++++++++++
 src/test/regress/sql/with.sql      | 18 ++++++++++++++++++
 3 files changed, 54 insertions(+)

diff --git a/src/backend/cdb/cdbmutate.c b/src/backend/cdb/cdbmutate.c
index fedee52c7f00..21da452b71a2 100644
--- a/src/backend/cdb/cdbmutate.c
+++ b/src/backend/cdb/cdbmutate.c
@@ -2086,7 +2086,24 @@ shareinput_mutator_dag_to_tree(Node *node, PlannerInfo *root, bool fPop)
 	Plan	   *plan = (Plan *) node;
 
 	if (fPop)
+	{
+		/*
+		 * If there is a shared scan under a HashJoin, with the producer in
+		 * the inner subplan and the consumer in the outer, we need to
+		 * enable prefetch_inner to avoid a deadlock: when prefetch_inner is
+		 * disabled, the executor can start performing the join from the
+		 * outer part. In the bottleneck case prefetch_inner may have been
+		 * turned off for optimization reasons in the create_join_plan
+		 * function, so we need to turn it back on. To simplify the logic,
+		 * we enable prefetch_inner for all hash joins located on a
+		 * bottleneck if there is a shared scan in the plan.
+		 */
+		if (IsA(plan, HashJoin) && ctxt->producer_count > 0 &&
+			CdbPathLocus_IsBottleneck(*(plan->flow)))
+			((Join *) plan)->prefetch_inner = true;
+
 		return true;
+	}
 
 	if (IsA(plan, ShareInputScan))
 	{
diff --git a/src/test/regress/expected/with.out b/src/test/regress/expected/with.out
index d02090be3412..4f449766740e 100644
--- a/src/test/regress/expected/with.out
+++ b/src/test/regress/expected/with.out
@@ -2261,3 +2261,22 @@ SELECT * FROM cte1 a JOIN cte1 b USING (c1);
 (1 row)
 
 DROP TABLE t1;
+-- Ensure that prefetch is not disabled for HashJoin in case of a join at a single segment.
+-- Test that the Shared Scan producer is executed under the inner part of the join
+-- first and the deadlock between Shared Scans does not occur.
+SET optimizer = off;
+--start_ignore
+DROP TABLE IF EXISTS d;
+--end_ignore
+CREATE TABLE d (c1 int, c2 int) DISTRIBUTED BY (c1);
+INSERT INTO d VALUES ( 2, 0 ),( 2, 0 );
+WITH cte AS (
+    SELECT count(*) c1 FROM d
+) SELECT * FROM cte a JOIN (SELECT * FROM d JOIN cte USING (c1) LIMIT 1) b USING (c1);
+ c1 | c2 
+----+----
+  2 |  0
+(1 row)
+
+RESET optimizer;
+DROP TABLE d;
diff --git a/src/test/regress/sql/with.sql b/src/test/regress/sql/with.sql
index b8e60a346035..72b89a0f0c97 100644
--- a/src/test/regress/sql/with.sql
+++ b/src/test/regress/sql/with.sql
@@ -1065,3 +1065,21 @@ WITH cte1 AS (
 SELECT * FROM cte1 a JOIN cte1 b USING (c1);
 
 DROP TABLE t1;
+
+-- Ensure that prefetch is not disabled for HashJoin in case of a join at a single segment.
+-- Test that the Shared Scan producer is executed under the inner part of the join
+-- first and the deadlock between Shared Scans does not occur.
+SET optimizer = off;
+--start_ignore
+DROP TABLE IF EXISTS d;
+--end_ignore
+CREATE TABLE d (c1 int, c2 int) DISTRIBUTED BY (c1);
+
+INSERT INTO d VALUES ( 2, 0 ),( 2, 0 );
+
+WITH cte AS (
+    SELECT count(*) c1 FROM d
+) SELECT * FROM cte a JOIN (SELECT * FROM d JOIN cte USING (c1) LIMIT 1) b USING (c1);
+
+RESET optimizer;
+DROP TABLE d;

From e971286894929db3e071aa87198ae97905e2f14d Mon Sep 17 00:00:00 2001
From: Alexandr Barulev
Date: Thu, 19 Oct 2023 06:30:06 +0300
Subject: [PATCH 02/22] ADBDEV-4349: gplogfilter: fix timerange validation
 (#627)

Previously, the time range filtering of log files was incorrect: the
check for whether a log file belongs to the specified time range worked
like a logical `AND` (skip the file if time < begin AND time > end),
whereas a logical `OR` is needed here. As a result, file filtering never
worked. In addition, this validation was executed after the file had
been opened, so when the validation failed there were redundant
`open`/`close` operations.

This patch fixes this behavior by:
- moving the logic that checks whether a log file name belongs to the
  user-specified range into a separate function (this also helps to
  cover corner cases)
- skipping a file that doesn't belong to the range at the beginning of
  the loop, to prevent the redundant `open`/`close` operations
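To make the AND-versus-OR point concrete, here is a minimal sketch of the
corrected membership test. The function name and signature are illustrative,
not the patch's actual API, and the real helper additionally handles corner
cases such as a file that starts before `begin` yet still contains in-range
entries.

    from datetime import datetime

    def in_time_range(filedate, begin=None, end=None):
        # Skip a file when its timestamp falls outside [begin, end] on
        # either side: a logical OR of the two bound checks.
        if begin is not None and filedate < begin:
            return False
        if end is not None and filedate > end:
            return False
        return True

    # The old code effectively required both conditions at once:
    #     if begin and filedate < begin:
    #         if end and filedate > end:   # AND: unsatisfiable, never skips
    # so no file was ever filtered out.
    assert in_time_range(datetime(2023, 1, 2),
                         begin=datetime(2023, 1, 1), end=datetime(2023, 1, 3))
    assert not in_time_range(datetime(2022, 12, 31),
                             begin=datetime(2023, 1, 1))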
---
 gpMgmt/bin/gplogfilter                        |  63 ++++-----
 gpMgmt/bin/gppylib/logfilter.py               | 121 ++++++++++++++++
 .../gppylib/test/unit/test_unit_logfilter.py  | 132 ++++++++++++++++++
 gpMgmt/test/behave/mgmt_utils/environment.py  |   9 +-
 .../behave/mgmt_utils/gplogfilter.feature     |  67 +++++++++
 .../behave/mgmt_utils/steps/gplogfilter.py    |  43 ++++++
 6 files changed, 395 insertions(+), 40 deletions(-)
 create mode 100644 gpMgmt/bin/gppylib/test/unit/test_unit_logfilter.py
 create mode 100644 gpMgmt/test/behave/mgmt_utils/gplogfilter.feature
 create mode 100644 gpMgmt/test/behave/mgmt_utils/steps/gplogfilter.py

diff --git a/gpMgmt/bin/gplogfilter b/gpMgmt/bin/gplogfilter
index 09f36cf2f162..f57807f8a1e8 100755
--- a/gpMgmt/bin/gplogfilter
+++ b/gpMgmt/bin/gplogfilter
@@ -361,6 +361,12 @@ if (options.zip is None and
     options.zip = '9'
 
 try:
+    if (begin and end) and begin >= end:
+        raise IOError(
+            'Invalid arguments: "begin" date (%s) is >= "end" date (%s)'
+            % (begin, end)
+        )
+
     # If no inputfile arg, try MASTER_DATA_DIRECTORY environment variable
     if len(args) == 0:
         s = os.getenv('MASTER_DATA_DIRECTORY')
@@ -400,44 +406,27 @@ try:
                % (begin or 'beginning of data', end or 'end of data'))
         print >> sys.stderr, msg
 
-    # Loop over input files
-    for ifn in args:
-        """
-        Open each file in the logs directory. Check to see if the file name
-        looks anything like a log file name with a time stamp that we
-        recognize. If true, and the user specified a time range, skip the
-        file if it is outside the range. That is, close the file and any
-        associated temporary files.
-
-        All other files with names that do not look like time stamps are
-        processed. That is, their log information is extracted, and if
-        the user specified a time range, only those entries that are
-        within that range are kept.
-        """
+    # Transform the given array of log names (the args array) into a
+    # corresponding array of LogNameInfo structures. The final array is a
+    # concatenation of LogNameInfo arrays with and without timestamps in
+    # the log names. Both arrays are ordered:
+    # - the first part is ordered by the time stamp of the log name, and
+    #   then by name (in case of time stamp equality)
+    # - the second part is ordered by name and is laid out at the end
+    # All LogNameInfo structures are marked for belonging to the
+    # user-specified time range. Log names without a time stamp should
+    # always be processed, because they may contain log entries inside
+    # the specified range, so LogNameInfo for such names is always
+    # marked as belonging to the range.
+    logsInfoArr = getLogInfoArrayByNamesOrderedAndMarkedInTSRange(args, begin, end)
+
+    for logInfo in logsInfoArr:
+        if not logInfo.belongsToTimeRangeFilter:
+            print >> sys.stderr, "SKIP file: %s" % logInfo.name
+            continue
+
         # Open next input file
-        fileIn, inputFilesToClose, ifn, zname = openInputFile(ifn, options)
-
-        # if we can skip the whole file, let's do so
-        if zname.startswith('gpdb') and zname.endswith('.csv'):
-            goodFormat = True
-            try:
-                # try format YYYY-MM-DD_HHMMSS
-                filedate = datetime.strptime(zname[5:-4], '%Y-%m-%d_%H%M%S')
-            except:
-                try:
-                    # try format YYYY-MM-DD
-                    filedate = datetime.strptime(zname[5:-4], '%Y-%m-%d')
-                except:
-                    # the format isn't anything I understand
-                    goodFormat = False
-
-            if goodFormat and begin and filedate < begin:
-                if end and filedate > end:
-                    print >> sys.stderr, "SKIP file: %s" % zname
-                    for f in inputFilesToClose:
-                        f.close()
-                    inputFilesToClose = []
-                    continue
+        fileIn, inputFilesToClose, ifn, zname = openInputFile(logInfo.name, options)
 
         # Announce each input file *before* its output file if --out is dir
         if options.verbose and outputFilePerInputFile:
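The new helper is not shown in full in this excerpt. As a rough sketch of the
ordering-and-marking scheme the comment above describes: the function name
`mark_logs`, the parsing regex, and the skip rule below are assumptions for
illustration, and the real function also has to handle corner cases such as a
file that starts before `begin` but still overlaps the range.

    import re
    from collections import namedtuple
    from datetime import datetime

    LogNameInfo = namedtuple('LogNameInfo',
                             ['name', 'timestamp', 'belongsToTimeRangeFilter'])

    _TS = re.compile(r'gpdb-(\d{4}-\d{2}-\d{2}(?:_\d{6})?)\.csv$')

    def _parse_ts(name):
        m = _TS.search(name)
        if not m:
            return None
        fmt = '%Y-%m-%d_%H%M%S' if '_' in m.group(1) else '%Y-%m-%d'
        return datetime.strptime(m.group(1), fmt)

    def mark_logs(names, begin=None, end=None):
        infos = []
        # Names with a timestamp come first, ordered by (timestamp, name),
        # and are marked according to the user-specified range.
        for n in sorted((n for n in names if _parse_ts(n) is not None),
                        key=lambda n: (_parse_ts(n), n)):
            ts = _parse_ts(n)
            keep = not ((begin and ts < begin) or (end and ts > end))
            infos.append(LogNameInfo(n, ts, keep))
        # Names without a timestamp go last, ordered by name, and are always
        # kept, since they may still contain entries inside the range.
        for n in sorted(n for n in names if _parse_ts(n) is None):
            infos.append(LogNameInfo(n, None, True))
        return infos

    print(mark_logs(['gpdb-2023-10-19_063006.csv', 'startup.log'],
                    begin=datetime(2023, 10, 19)))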
diff --git a/gpMgmt/bin/gppylib/logfilter.py b/gpMgmt/bin/gppylib/logfilter.py
index 6bcbcfd28267..d3eb86263641 100644
--- a/gpMgmt/bin/gppylib/logfilter.py
+++ b/gpMgmt/bin/gppylib/logfilter.py
@@ -57,6 +57,14 @@
 #   A timezone specifier may follow the timestamp, but we ignore that.
 
+
+# This pattern matches the date and time stamp in the file name of a GPDB
+# log file. The timestamp format is YYYY-MM-DD_HHMMSS or YYYY-MM-DD (the
+# latter to preserve an old behaviour).
+logNameTSPattern = re.compile(
+    '^.*gpdb-(?P\d+-\d+-\d+(?P