From 91d2566b1b6311c5764d0f391358079ab39bad5b Mon Sep 17 00:00:00 2001
From: user12986714 <65436504+user12986714@users.noreply.github.com>
Date: Tue, 1 Sep 2020 23:59:06 -0400
Subject: [PATCH 01/10] Add "Newly posted youtube video" reason

---
 findspam.py | 21 +++++++++++++++++++++
 1 file changed, 21 insertions(+)

diff --git a/findspam.py b/findspam.py
index 3bb2a7dafd..c91b3b0be7 100644
--- a/findspam.py
+++ b/findspam.py
@@ -11,6 +11,7 @@
 import time
 import os
 import os.path as path
+import requests
 
 import regex
 # noinspection PyPackageRequirements
@@ -44,6 +45,7 @@
 PUNCTUATION_RATIO = 0.42
 REPEATED_CHARACTER_RATIO = 0.20
 IMG_TXT_R_THRES = 0.7
+OLD_VIDEO_THRES = 5
 EXCEPTION_RE = r"^Domain (.*) didn't .*!$"
 RE_COMPILE = regex.compile(EXCEPTION_RE)
 COMMON_MALFORMED_PROTOCOLS = [
@@ -622,6 +624,25 @@ def mostly_img(s, site):
     return False, ""
 
 
+@create_rule("Newly posted youtube video")
+def new_video(s, site):
+    youtube_links = regex.findall(r"https:\/\/youtu\.be\/[a-zA-Z0-9]*+", s)
+    for link in youtube_links:
+        try:
+            resp = Requests.get(link).text
+            date = regex.findall(r'"dateText":{"simpleText":"(Jan|Feb|Mar|Apr|May|Jun|' +
+                                 'Jul|Aug|Sep|Oct|Nov|Dec)[a-z]? (\d++), (\d++)"}', resp)
+            if len(date) == 1:
+                # This condition should always be true, but it is here just in case
+                date = date[0]
+                now = datetime.now()
+                if now.year == int(date[2]) and now.strftime("%b") == date[0]:
+                    if now.day <= int(date[1]) + OLD_VIDEO_THRES:
+                        return True, "Video is posted on {} {}, {}".format(date[0], date[1], date[2])
+        except Exception:
+            return False, ""
+    return False, ""
+
 # noinspection PyUnusedLocal,PyMissingTypeHints
 @create_rule("repeating characters in {}", stripcodeblocks=True, max_rep=10000, max_score=10000)
 def has_repeating_characters(s, site):

From 27dc15a3320c64aef26fc7ec8d90a3b577a137af Mon Sep 17 00:00:00 2001
From: user12986714 <65436504+user12986714@users.noreply.github.com>
Date: Wed, 2 Sep 2020 00:00:48 -0400
Subject: [PATCH 02/10] Fix CI

---
 findspam.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/findspam.py b/findspam.py
index c91b3b0be7..bb9cb89b5c 100644
--- a/findspam.py
+++ b/findspam.py
@@ -643,6 +643,7 @@ def new_video(s, site):
             return False, ""
     return False, ""
 
+
 # noinspection PyUnusedLocal,PyMissingTypeHints
 @create_rule("repeating characters in {}", stripcodeblocks=True, max_rep=10000, max_score=10000)
 def has_repeating_characters(s, site):

From 958881afe3c3b02be069921f9239aa9e732875c6 Mon Sep 17 00:00:00 2001
From: user12986714 <65436504+user12986714@users.noreply.github.com>
Date: Wed, 2 Sep 2020 00:09:39 -0400
Subject: [PATCH 03/10] Fix CI... again

---
 findspam.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/findspam.py b/findspam.py
index bb9cb89b5c..020abeb7ec 100644
--- a/findspam.py
+++ b/findspam.py
@@ -629,9 +629,9 @@ def new_video(s, site):
     youtube_links = regex.findall(r"https:\/\/youtu\.be\/[a-zA-Z0-9]*+", s)
     for link in youtube_links:
         try:
-            resp = Requests.get(link).text
+            resp = requests.get(link).text
             date = regex.findall(r'"dateText":{"simpleText":"(Jan|Feb|Mar|Apr|May|Jun|' +
-                                 'Jul|Aug|Sep|Oct|Nov|Dec)[a-z]? (\d++), (\d++)"}', resp)
+                                 r'Jul|Aug|Sep|Oct|Nov|Dec)[a-z]? (\d++), (\d++)"}', resp)
             if len(date) == 1:
                 # This condition should always be true, but it is here just in case
                 date = date[0]

From 308118c2492f25d01d040d7857a9b940dedb550f Mon Sep 17 00:00:00 2001
From: user12986714 <65436504+user12986714@users.noreply.github.com>
Date: Wed, 2 Sep 2020 00:13:54 -0400
Subject: [PATCH 04/10] Avoid importing the requests package twice

---
 findspam.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/findspam.py b/findspam.py
index 020abeb7ec..f47d52a538 100644
--- a/findspam.py
+++ b/findspam.py
@@ -11,7 +11,6 @@
 import time
 import os
 import os.path as path
-import requests
 
 import regex
 # noinspection PyPackageRequirements

From 1fd725d05398bf4815ae37aa4c9546cbf0f76582 Mon Sep 17 00:00:00 2001
From: user12986714 <65436504+user12986714@users.noreply.github.com>
Date: Wed, 2 Sep 2020 10:31:46 -0400
Subject: [PATCH 05/10] Better regex

---
 findspam.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/findspam.py b/findspam.py
index f47d52a538..1240784888 100644
--- a/findspam.py
+++ b/findspam.py
@@ -625,7 +625,10 @@ def mostly_img(s, site):
 
 @create_rule("Newly posted youtube video")
 def new_video(s, site):
-    youtube_links = regex.findall(r"https:\/\/youtu\.be\/[a-zA-Z0-9]*+", s)
+    # Youtube ID regex is by brunodles, https://stackoverflow.com/a/31711517
+    youtube_ids = regex.findall(r"(?:https?:\/\/)?(?:www\.)?youtu\.?be(?:\.com)?\/" +
+                                r"?.*(?:watch|embed)?(?:.*v=|v\/|\/)([\w\-_]+)\&?", s)
+    youtube_links = ["https://" + x for x in youtube_ids]
     for link in youtube_links:
         try:
             resp = requests.get(link).text

From b3d60152fa6752904bc9256f9f21bd34a801e1d5 Mon Sep 17 00:00:00 2001
From: user12986714 <65436504+user12986714@users.noreply.github.com>
Date: Wed, 2 Sep 2020 17:43:45 -0400
Subject: [PATCH 06/10] Break out scrap_and_check()

---
 findspam.py | 37 ++++++++++++++++++++++---------------
 1 file changed, 22 insertions(+), 15 deletions(-)

diff --git a/findspam.py b/findspam.py
index 1240784888..107310ad26 100644
--- a/findspam.py
+++ b/findspam.py
@@ -623,27 +623,34 @@ def mostly_img(s, site):
     return False, ""
 
 
+def is_recent(date, now, thres):
+    return now.year == int(date[2]) and now.strftime("%b") == date[0] and now.day <= int(date[1]) + thres
+
+
+def scrap_and_check(url_list, date_regex, thres, thing):
+    now = datetime.now()
+    for link in url_list:
+        try:
+            resp = requests.get(link).text
+            date = regex.findall(date_regex, resp)
+            if len(date) == 1 and is_recent(date[0], now, thres):
+                return True, "{} is posted on {} {}, {}".format(thing, date[0], date[1], date[2])
+        except Exception:
+            pass
+    return False, ""
+
+
 @create_rule("Newly posted youtube video")
 def new_video(s, site):
     # Youtube ID regex is by brunodles, https://stackoverflow.com/a/31711517
     youtube_ids = regex.findall(r"(?:https?:\/\/)?(?:www\.)?youtu\.?be(?:\.com)?\/" +
                                 r"?.*(?:watch|embed)?(?:.*v=|v\/|\/)([\w\-_]+)\&?", s)
     youtube_links = ["https://" + x for x in youtube_ids]
-    for link in youtube_links:
-        try:
-            resp = requests.get(link).text
-            date = regex.findall(r'"dateText":{"simpleText":"(Jan|Feb|Mar|Apr|May|Jun|' +
-                                 r'Jul|Aug|Sep|Oct|Nov|Dec)[a-z]? (\d++), (\d++)"}', resp)
-            if len(date) == 1:
-                # This condition should always be true, but it is here just in case
-                date = date[0]
-                now = datetime.now()
-                if now.year == int(date[2]) and now.strftime("%b") == date[0]:
-                    if now.day <= int(date[1]) + OLD_VIDEO_THRES:
-                        return True, "Video is posted on {} {}, {}".format(date[0], date[1], date[2])
-        except Exception:
-            return False, ""
-    return False, ""
+    return scrap_and_check(youtube_links,
+                           r'"dateText":{"simpleText":"(Jan|Feb|Mar|Apr|May|Jun|' +
+                           r'Jul|Aug|Sep|Oct|Nov|Dec)[a-z]? (\d++), (\d++)"}',
+                           OLD_VIDEO_THRES,
+                           "Video")
 
 
 # noinspection PyUnusedLocal,PyMissingTypeHints

From a7762714e80882a7d463c41b0e50d992883b8033 Mon Sep 17 00:00:00 2001
From: user12986714 <65436504+user12986714@users.noreply.github.com>
Date: Wed, 2 Sep 2020 17:50:59 -0400
Subject: [PATCH 07/10] Detect Medium posts too

---
 findspam.py | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/findspam.py b/findspam.py
index 107310ad26..814794554a 100644
--- a/findspam.py
+++ b/findspam.py
@@ -45,6 +45,7 @@
 REPEATED_CHARACTER_RATIO = 0.20
 IMG_TXT_R_THRES = 0.7
 OLD_VIDEO_THRES = 5
+OLD_MEDIUM_POST_THRES = 7
 EXCEPTION_RE = r"^Domain (.*) didn't .*!$"
 RE_COMPILE = regex.compile(EXCEPTION_RE)
 COMMON_MALFORMED_PROTOCOLS = [
@@ -653,6 +654,18 @@ def new_video(s, site):
                            "Video")
 
 
+@create_rule("Newly posted medium post")
+def new_medium_post(s, site):
+    medium_links_core = regex.findall(r"medium\.com\/@[\w-]*+\/[\w-]*+", s)
+    medium_links = ["https://" + x for x in medium_links_core]
+    return scrap_and_check(medium_links,
+                           r'<a class="bh bi at au av aw ax ay az ba fu bd bl bm" rel="noopener" ' +
+                           r'href="[^"]*+">(Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]? ' +
+                           r'(\d++), (\d++)<\/a>',
+                           OLD_MEDIUM_POST_THRES,
+                           "Medium post")
+
+
 # noinspection PyUnusedLocal,PyMissingTypeHints
 @create_rule("repeating characters in {}", stripcodeblocks=True, max_rep=10000, max_score=10000)
 def has_repeating_characters(s, site):

From 51b48d92bc19244c289dc1427863a50ad940881e Mon Sep 17 00:00:00 2001
From: user12986714 <65436504+user12986714@users.noreply.github.com>
Date: Thu, 3 Sep 2020 16:22:40 -0400
Subject: [PATCH 08/10] Minor bug fix

---
 findspam.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/findspam.py b/findspam.py
index 814794554a..da8081b1dd 100644
--- a/findspam.py
+++ b/findspam.py
@@ -646,7 +646,7 @@ def new_video(s, site):
     # Youtube ID regex is by brunodles, https://stackoverflow.com/a/31711517
     youtube_ids = regex.findall(r"(?:https?:\/\/)?(?:www\.)?youtu\.?be(?:\.com)?\/" +
                                 r"?.*(?:watch|embed)?(?:.*v=|v\/|\/)([\w\-_]+)\&?", s)
-    youtube_links = ["https://" + x for x in youtube_ids]
+    youtube_links = ["https://youtu.be/" + x for x in youtube_ids]
     return scrap_and_check(youtube_links,
                            r'"dateText":{"simpleText":"(Jan|Feb|Mar|Apr|May|Jun|' +
                            r'Jul|Aug|Sep|Oct|Nov|Dec)[a-z]? (\d++), (\d++)"}',

From 08cbab42a7fdf7ead8a0e9ae1cc8c5f55563aee3 Mon Sep 17 00:00:00 2001
From: user12986714 <65436504+user12986714@users.noreply.github.com>
Date: Sun, 6 Sep 2020 10:23:02 -0400
Subject: [PATCH 09/10] Fix list index out of range

---
 findspam.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/findspam.py b/findspam.py
index da8081b1dd..5005fdab0e 100644
--- a/findspam.py
+++ b/findspam.py
@@ -635,7 +635,8 @@ def scrap_and_check(url_list, date_regex, thres, thing):
             resp = requests.get(link).text
             date = regex.findall(date_regex, resp)
             if len(date) == 1 and is_recent(date[0], now, thres):
-                return True, "{} is posted on {} {}, {}".format(thing, date[0], date[1], date[2])
+                return True, "{} is posted on {} {}, {}".format(thing, date[0][0],
+                                                                date[0][1], date[0][2])
         except Exception:
             pass
     return False, ""

From 0a94542615479ce1bfc833af0c00b4a05d0f02ab Mon Sep 17 00:00:00 2001
From: user12986714 <65436504+user12986714@users.noreply.github.com>
Date: Wed, 9 Sep 2020 11:33:42 -0400
Subject: [PATCH 10/10] Detect more medium links

---
 findspam.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/findspam.py b/findspam.py
index 5005fdab0e..c0fd0a9910 100644
--- a/findspam.py
+++ b/findspam.py
@@ -657,7 +657,7 @@ def new_video(s, site):
 
 @create_rule("Newly posted medium post")
 def new_medium_post(s, site):
-    medium_links_core = regex.findall(r"medium\.com\/@[\w-]*+\/[\w-]*+", s)
+    medium_links_core = regex.findall(r"medium\.com\/@?[\w-]*+\/[\w-]*+", s)
     medium_links = ["https://" + x for x in medium_links_core]
     return scrap_and_check(medium_links,
                            r'<a class="bh bi at au av aw ax ay az ba fu bd bl bm" rel="noopener" ' +