From 98fd96559f209be4c79e86a908d967f7a29a3a10 Mon Sep 17 00:00:00 2001
From: gitlost <burmartke@gmail.com>
Date: Fri, 13 Oct 2017 00:22:39 +0100
Subject: [PATCH] Lessen context duplication in db search.

---
 features/db-search.feature | 40 +++++++++++++++++++++++++++++++--
 src/DB_Command.php         | 46 ++++++++++++++++++++++++++++----------
 2 files changed, 72 insertions(+), 14 deletions(-)

diff --git a/features/db-search.feature b/features/db-search.feature
index 26c333f1e..fa1fba24b 100644
--- a/features/db-search.feature
+++ b/features/db-search.feature
@@ -956,7 +956,7 @@ Feature: Search through the database
       """
     And STDOUT should contain:
       """
-      :1234_XYXYX_2345678_X [...] X_2345678_XYXYX_234567890 [...] 345678901_XYXYX_2345
+      :1234_XYXYX_2345678_XYXYX_234567890 [...] 345678901_XYXYX_2345
       """
     And STDERR should be empty
 
@@ -967,6 +967,42 @@ Feature: Search through the database
       """
     And STDOUT should contain:
       """
-      :1234_XYXYX_2345678_X [...] X_2345678_XYXYX_234567890 [...] 345678901_XYXYX_2345
+      :1234_XYXYX_2345678_XYXYX_234567890 [...] 345678901_XYXYX_2345
+      """
+    And STDERR should be empty
+
+  Scenario: Search with large data
+    Given a WP install
+    # Note "_utf8 X'CC88'" is a combining umlaut. It is written this way because non-ASCII characters get stripped when the command is (eventually) passed through `escapeshellarg()` with a default C locale.
+    # The inserted value's size is also restricted by MySQL's default `max_allowed_packet` size to 16 MB - 1 (0xFFFFFF).
+    And I run `wp db query "INSERT INTO wp_options (option_name, option_value) VALUES ('opt_large', CONCAT(REPEAT('a', 1024 * 1024 * 16 - 9), 'o', _utf8 X'CC88', 'XYXYX'));"`
+
+    When I run `wp db search XYXYX --before_context=1 --stats`
+    Then STDOUT should contain:
+      """
+      Success: Found 1 match
+      """
+    And STDOUT should contain:
+      """
+      :öXYXYX
+      """
+    And STDOUT should not contain:
+      """
+      :aöXYXYX
+      """
+    And STDERR should be empty
+
+    When I run `wp db search XYXYX --regex --before_context=1 --stats`
+    Then STDOUT should contain:
+      """
+      Success: Found 1 match
+      """
+    And STDOUT should contain:
+      """
+      :öXYXYX
+      """
+    And STDOUT should not contain:
+      """
+      :aöXYXYX
       """
     And STDERR should be empty
diff --git a/src/DB_Command.php b/src/DB_Command.php
index c63c87f29..0332d8f49 100644
--- a/src/DB_Command.php
+++ b/src/DB_Command.php
@@ -905,21 +905,43 @@ public function search( $args, $assoc_args ) {
 
 							$bits = array();
 							$col_encoding = $encoding;
-							if ( null === $col_encoding ) {
-								$col_encoding = false;
-								if ( ( $before_context || $after_context ) && function_exists( 'mb_detect_encoding' ) ) {
-									$col_encoding = mb_detect_encoding( $col_val, null, true /*strict*/ );
-								}
+							if ( ! $col_encoding && ( $before_context || $after_context ) && function_exists( 'mb_detect_encoding' ) ) {
+								$col_encoding = mb_detect_encoding( $col_val, null, true /*strict*/ );
 							}
-							foreach ( $matches[0] as $match_arr ) {
-								$match = $match_arr[0];
-								$offset = $match_arr[1];
+							$append_next = false;
+							$last_offset = 0;
+							$match_cnt = count( $matches[0] );
+							for ( $i = 0; $i < $match_cnt; $i++ ) {
+								$match = $matches[0][ $i ][0];
+								$offset = $matches[0][ $i ][1];
+								$log = $colors['match'][0] . $match . $colors['match'][1];
+								$before = $after = '';
+								$after_shortened = false;
+
 								// Offsets are in bytes, so need to use `strlen()` and `substr()` before using `safe_substr()`.
-								$before = $before_context && $offset ? \cli\safe_substr( substr( $col_val, 0, $offset ), -$before_context, null /*length*/, false /*is_width*/, $col_encoding ) : '';
-								$after = $after_context ? \cli\safe_substr( substr( $col_val, $offset + strlen( $match ) ), 0, $after_context, false /*is_width*/, $col_encoding ) : '';
-								$bits[] = $before . $colors['match'][0] . $match . $colors['match'][1] . $after;
+								if ( $before_context && $offset && ! $append_next ) {
+									$before = \cli\safe_substr( substr( $col_val, $last_offset, $offset - $last_offset ), -$before_context, null /*length*/, false /*is_width*/, $col_encoding );
+								}
+								if ( $after_context ) {
+									$end_offset = $offset + strlen( $match );
+									$after = \cli\safe_substr( substr( $col_val, $end_offset ), 0, $after_context, false /*is_width*/, $col_encoding );
+									// To lessen context duplication in output, shorten the after context if it overlaps with the next match.
+									if ( $i + 1 < $match_cnt && $end_offset + strlen( $after ) > $matches[0][ $i + 1 ][1] ) {
+										$after = substr( $after, 0, $matches[0][ $i + 1 ][1] - $end_offset );
+										$after_shortened = true;
+										// On the next iteration, will append with no before context.
+									}
+								}
+								if ( $append_next ) {
+									$cnt = count( $bits );
+									$bits[ $cnt - 1 ] .= $log . $after;
+								} else {
+									$bits[] = $before . $log . $after;
+								}
+								$append_next = $after_shortened;
+								$last_offset = $offset;
 							}
-							$match_count += count( $bits );
+							$match_count += $match_cnt;
 							$col_val = implode( ' [...] ', $bits );
 
 							WP_CLI::log( $matches_only ? $col_val : ( $one_line ? "{$table_column_val}:{$pk_val}{$col_val}" : "{$pk_val}{$col_val}" ) );