Commit
Merge pull request #74 from bobmatyas/update/020125
Update/020125
bobmatyas authored Feb 1, 2025
2 parents 1ff0ee1 + 76658e9 commit de37693
Showing 3 changed files with 31 additions and 11 deletions.
11 changes: 8 additions & 3 deletions block-ai-crawlers.php
@@ -5,7 +5,7 @@
* Author: Bob Matyas
* Author URI: https://www.bobmatyas.com
* Text Domain: block-ai-crawlers
* Version: 1.4.2
* Version: 1.4.3
* License: GPL-2.0-or-later
* License URI: https://www.gnu.org/licenses/gpl-2.0.html
*
@@ -39,6 +39,8 @@ function block_ai_robots_txt( $robots ) {
$robots .= "User-agent: ClaudeBot\n";
$robots .= "User-agent: Claude-Web\n";
$robots .= "User-agent: cohere-ai\n";
$robots .= "User-agent: cohere-training-data-crawler\n";
$robots .= "User-agent: Crawlspace\n";
$robots .= "User-agent: Diffbot\n";
$robots .= "User-agent: FacebookBot\n";
$robots .= "User-agent: FriendlyCrawler\n";
@@ -55,6 +57,9 @@ function block_ai_robots_txt( $robots ) {
$robots .= "User-agent: PetalBot\n";
$robots .= "User-agent: PerplexityBot\n";
$robots .= "User-agent: Scrapy\n";
$robots .= "User-agent: SemrushBot\n";
$robots .= "User-agent: SemrushBot-OCOB\n";
$robots .= "User-agent: SemrushBot-FT\n";
$robots .= "User-agent: SentiBot\n";
$robots .= "User-agent: sentibot\n";
$robots .= "User-agent: Timpibot\n";
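Each appended line adds another crawler to the same user-agent group in the generated `robots.txt`. As a sketch of the served output with this commit's new entries (assuming, as the plugin's purpose implies, that the group is closed by a `Disallow: /` rule outside this hunk):

```text
User-agent: ClaudeBot
User-agent: cohere-training-data-crawler
User-agent: Crawlspace
User-agent: SemrushBot
User-agent: SemrushBot-OCOB
User-agent: SemrushBot-FT
Disallow: /
```

Because every `User-agent` line precedes the single `Disallow` rule, all listed crawlers share one group, so adding a bot is a one-line change.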
@@ -132,8 +137,8 @@ function block_ai_append_plugin_rating( $links_array, $plugin_file_name ) {
. '.rate-stars svg {fill:' . $stars_color . ';}'
. '</style>';
}

return $links_array;
}

add_filter( 'plugin_row_meta', 'block_ai_append_plugin_rating', 10, 4 );
24 changes: 17 additions & 7 deletions inc/settings-html.php
@@ -47,11 +47,6 @@
<td><p>Used by TikTok for AI training.</p></td>
<td><a href="https://darkvisitors.com/agents/bytespider" target=_blank>More Info <span class="dashicons dashicons-external link"></span></a></td>
</tr>
<tr>
<th>Cohere</th>
<td><p>Used by Cohere to scrape data for AI training.</p></td>
<td><a href="https://darkvisitors.com/agents/cohere-ai" target=_blank>More Info <span class="dashicons dashicons-external link"></span></a></td>
</tr>
<tr>
<th>ChatGPT</th>
<td><p>Used by OpenAI to power ChatGPT.</p></td>
@@ -62,11 +57,21 @@
<td><p>Used by Anthropic's Claude.</p></td>
<td><a href="https://support.anthropic.com/en/articles/8896518-does-anthropic-crawl-data-from-the-web-and-how-can-site-owners-block-the-crawler" target=_blank>More Info <span class="dashicons dashicons-external link"></span></a></td>
</tr>
<tr>
<th>Cohere and cohere-training-data-crawler</th>
<td><p>Used by Cohere to scrape data for AI training.</p></td>
<td><a href="https://cohere.com/about" target=_blank>More Info <span class="dashicons dashicons-external link"></span></a></td>
</tr>
<tr>
<th>CommonCrawl</th>
<td><p>Compiles datasets used to train AI models.</p></td>
<td><a href="https://commoncrawl.org/big-picture/frequently-asked-questions/" target=_blank>More Info <span class="dashicons dashicons-external link"></span></a></td>
</tr>
<tr>
<th>Crawlspace</th>
<td><p>A web scraper that can be used to extract data for AI training.</p></td>
<td><a href="https://crawlspace.dev/" target=_blank>More Info <span class="dashicons dashicons-external link"></span></a></td>
</tr>
<tr>
<th>Diffbot</th>
<td><p>Used by Diffbot to scrape data for AI training.</p></td>
@@ -128,10 +133,15 @@
<td><a href="https://scrapy.org/" target=_blank>More Info <span class="dashicons dashicons-external link"></span></a></td>
</tr>
<tr>
<th>SentiBot</th>
<td><p>Blocks SentiOne's AI-powered social media listening and analysis tools.</p></td>
<th>Scrapy</th>
<td><p>Blocks the Scrapy bot (used for scraping websites).</p></td>
<td><a href="https://scrapy.org/" target=_blank>More Info <span class="dashicons dashicons-external link"></span></a></td>
</tr>
<tr>
<th>SemrushBot</th>
<td><p>Blocks the Semrush bot used to pull data into the Semrush platform. Data is used for their ContentShake AI tool.</p></td>
<td><a href="https://www.semrush.com/bot/" target=_blank>More Info <span class="dashicons dashicons-external link"></span></a></td>
</tr>
<tr>
<th>Timpibot</th>
<td><p>Used by Timpi; likely for their Wilson AI Product.</p></td>
7 changes: 6 additions & 1 deletion readme.txt
@@ -4,7 +4,7 @@ Tags: ai, robots.txt, chatgpt, crawlers
Requires at least: 5.6
Tested up to: 6.7
Requires PHP: 7.4
Stable tag: 1.4.2
Stable tag: 1.4.3
License: GPLv2 or later
License URI: https://www.gnu.org/licenses/gpl-2.0.html

@@ -95,6 +95,11 @@ No. Search engines follow different `robots.txt` rules.

== Changelog ==

= 1.4.3 =
- New: Block SemrushBot (including SemrushBot-OCOB and SemrushBot-FT)
- New: Block cohere-training-data-crawler
- New: Block Crawlspace

= 1.4.2 =
- New: Block PanguBot

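The readme's FAQ notes that search engines are unaffected: they match none of the blocked user-agents, so the `Disallow` group never applies to them. One way to sanity-check the generated rules is Python's standard-library `robotparser` (a sketch using a hypothetical two-agent excerpt of the group the plugin emits):

```python
from urllib.robotparser import RobotFileParser

# Excerpt of the group the plugin appends to robots.txt
# (two of the blocked agents; Disallow: / closes the group).
rules = """\
User-agent: ClaudeBot
User-agent: SemrushBot
Disallow: /
"""

rp = RobotFileParser()
rp.parse(rules.splitlines())

# Blocked AI crawlers are denied everywhere...
print(rp.can_fetch("ClaudeBot", "https://example.com/post/"))   # False
print(rp.can_fetch("SemrushBot", "https://example.com/"))       # False
# ...while search-engine bots, which match no rule, stay allowed.
print(rp.can_fetch("Googlebot", "https://example.com/post/"))   # True
```

Consecutive `User-agent` lines before a rule form one group, which is exactly the layout the plugin's string-append approach produces.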
