From 98fd96559f209be4c79e86a908d967f7a29a3a10 Mon Sep 17 00:00:00 2001 From: gitlost Date: Fri, 13 Oct 2017 00:22:39 +0100 Subject: [PATCH] Lessen context duplication in db search. --- features/db-search.feature | 40 +++++++++++++++++++++++++++++++-- src/DB_Command.php | 46 ++++++++++++++++++++++++++++---------- 2 files changed, 72 insertions(+), 14 deletions(-) diff --git a/features/db-search.feature b/features/db-search.feature index 26c333f1e..fa1fba24b 100644 --- a/features/db-search.feature +++ b/features/db-search.feature @@ -956,7 +956,7 @@ Feature: Search through the database """ And STDOUT should contain: """ - :1234_XYXYX_2345678_X [...] X_2345678_XYXYX_234567890 [...] 345678901_XYXYX_2345 + :1234_XYXYX_2345678_XYXYX_234567890 [...] 345678901_XYXYX_2345 """ And STDERR should be empty @@ -967,6 +967,42 @@ Feature: Search through the database """ And STDOUT should contain: """ - :1234_XYXYX_2345678_X [...] X_2345678_XYXYX_234567890 [...] 345678901_XYXYX_2345 + :1234_XYXYX_2345678_XYXYX_234567890 [...] 345678901_XYXYX_2345 + """ + And STDERR should be empty + + Scenario: Search with large data + Given a WP install + # Note "_utf8 X'CC88'" is a combining umlaut. Doing it this way as non-ASCII stuff gets stripped due to (eventually) being put through `escapeshellarg()` with a default C locale. + # Also restricted by MySQL's default `max_allowed_packet` size to 16 MB - 1 (0xFFFFFF). 
+ And I run `wp db query "INSERT INTO wp_options (option_name, option_value) VALUES ('opt_large', CONCAT(REPEAT('a', 1024 * 1024 * 16 - 9), 'o', _utf8 X'CC88', 'XYXYX'));"` + + When I run `wp db search XYXYX --before_context=1 --stats` + Then STDOUT should contain: + """ + Success: Found 1 match + """ + And STDOUT should contain: + """ + :öXYXYX + """ + And STDOUT should not contain: + """ + :aöXYXYX + """ + And STDERR should be empty + + When I run `wp db search XYXYX --regex --before_context=1 --stats` + Then STDOUT should contain: + """ + Success: Found 1 match + """ + And STDOUT should contain: + """ + :öXYXYX + """ + And STDOUT should not contain: + """ + :aöXYXYX """ And STDERR should be empty diff --git a/src/DB_Command.php b/src/DB_Command.php index c63c87f29..0332d8f49 100644 --- a/src/DB_Command.php +++ b/src/DB_Command.php @@ -905,21 +905,43 @@ public function search( $args, $assoc_args ) { $bits = array(); $col_encoding = $encoding; - if ( null === $col_encoding ) { - $col_encoding = false; - if ( ( $before_context || $after_context ) && function_exists( 'mb_detect_encoding' ) ) { - $col_encoding = mb_detect_encoding( $col_val, null, true /*strict*/ ); - } + if ( ! $col_encoding && ( $before_context || $after_context ) && function_exists( 'mb_detect_encoding' ) ) { + $col_encoding = mb_detect_encoding( $col_val, null, true /*strict*/ ); } - foreach ( $matches[0] as $match_arr ) { - $match = $match_arr[0]; - $offset = $match_arr[1]; + $append_next = false; + $last_offset = 0; + $match_cnt = count( $matches[0] ); + for ( $i = 0; $i < $match_cnt; $i++ ) { + $match = $matches[0][ $i ][0]; + $offset = $matches[0][ $i ][1]; + $log = $colors['match'][0] . $match . $colors['match'][1]; + $before = $after = ''; + $after_shortened = false; + // Offsets are in bytes, so need to use `strlen()` and `substr()` before using `safe_substr()`. - $before = $before_context && $offset ? 
\cli\safe_substr( substr( $col_val, 0, $offset ), -$before_context, null /*length*/, false /*is_width*/, $col_encoding ) : ''; - $after = $after_context ? \cli\safe_substr( substr( $col_val, $offset + strlen( $match ) ), 0, $after_context, false /*is_width*/, $col_encoding ) : ''; - $bits[] = $before . $colors['match'][0] . $match . $colors['match'][1] . $after; + if ( $before_context && $offset && ! $append_next ) { + $before = \cli\safe_substr( substr( $col_val, $last_offset, $offset - $last_offset ), -$before_context, null /*length*/, false /*is_width*/, $col_encoding ); + } + if ( $after_context ) { + $end_offset = $offset + strlen( $match ); + $after = \cli\safe_substr( substr( $col_val, $end_offset ), 0, $after_context, false /*is_width*/, $col_encoding ); + // To lessen context duplication in output, shorten the after context if it overlaps with the next match. + if ( $i + 1 < $match_cnt && $end_offset + strlen( $after ) > $matches[0][ $i + 1 ][1] ) { + $after = substr( $after, 0, $matches[0][ $i + 1 ][1] - $end_offset ); + $after_shortened = true; + // On the next iteration, will append with no before context. + } + } + if ( $append_next ) { + $cnt = count( $bits ); + $bits[ $cnt - 1 ] .= $log . $after; + } else { + $bits[] = $before . $log . $after; + } + $append_next = $after_shortened; + $last_offset = $offset; } - $match_count += count( $bits ); + $match_count += $match_cnt; $col_val = implode( ' [...] ', $bits ); WP_CLI::log( $matches_only ? $col_val : ( $one_line ? "{$table_column_val}:{$pk_val}{$col_val}" : "{$pk_val}{$col_val}" ) );