From 91d2566b1b6311c5764d0f391358079ab39bad5b Mon Sep 17 00:00:00 2001
From: user12986714 <65436504+user12986714@users.noreply.github.com>
Date: Tue, 1 Sep 2020 23:59:06 -0400
Subject: [PATCH 01/10] Add reason: newly posted YouTube video
---
findspam.py | 21 +++++++++++++++++++++
1 file changed, 21 insertions(+)
diff --git a/findspam.py b/findspam.py
index 3bb2a7dafd..c91b3b0be7 100644
--- a/findspam.py
+++ b/findspam.py
@@ -11,6 +11,7 @@
import time
import os
import os.path as path
+import requests
import regex
# noinspection PyPackageRequirements
@@ -44,6 +45,7 @@
PUNCTUATION_RATIO = 0.42
REPEATED_CHARACTER_RATIO = 0.20
IMG_TXT_R_THRES = 0.7
+OLD_VIDEO_THRES = 5
EXCEPTION_RE = r"^Domain (.*) didn't .*!$"
RE_COMPILE = regex.compile(EXCEPTION_RE)
COMMON_MALFORMED_PROTOCOLS = [
@@ -622,6 +624,25 @@ def mostly_img(s, site):
return False, ""
+@create_rule("Newly posted youtube video")
+def new_video(s, site):
+ youtube_links = regex.findall(r"https:\/\/youtu\.be\/[a-zA-Z0-9]*+", s)
+ for link in youtube_links:
+ try:
+ resp = Requests.get(link).text
+ date = regex.findall(r'"dateText":{"simpleText":"(Jan|Feb|Mar|Apr|May|Jun|' +
+ 'Jul|Aug|Sep|Oct|Nov|Dec)[a-z]? (\d++), (\d++)"}', resp)
+ if len(date) == 1:
+ # This condition should always be true, but it is here just in case
+ date = date[0]
+ now = datetime.now()
+ if now.year == int(date[2]) and now.strftime("%b") == date[0]:
+ if now.day <= int(date[1]) + OLD_VIDEO_THRES:
+ return True, "Video is posted on {} {}, {}".format(date[0], date[1], date[2])
+ except Exception:
+ return False, ""
+ return False, ""
+
# noinspection PyUnusedLocal,PyMissingTypeHints
@create_rule("repeating characters in {}", stripcodeblocks=True, max_rep=10000, max_score=10000)
def has_repeating_characters(s, site):
From 27dc15a3320c64aef26fc7ec8d90a3b577a137af Mon Sep 17 00:00:00 2001
From: user12986714 <65436504+user12986714@users.noreply.github.com>
Date: Wed, 2 Sep 2020 00:00:48 -0400
Subject: [PATCH 02/10] Fix CI
---
findspam.py | 1 +
1 file changed, 1 insertion(+)
diff --git a/findspam.py b/findspam.py
index c91b3b0be7..bb9cb89b5c 100644
--- a/findspam.py
+++ b/findspam.py
@@ -643,6 +643,7 @@ def new_video(s, site):
return False, ""
return False, ""
+
# noinspection PyUnusedLocal,PyMissingTypeHints
@create_rule("repeating characters in {}", stripcodeblocks=True, max_rep=10000, max_score=10000)
def has_repeating_characters(s, site):
From 958881afe3c3b02be069921f9239aa9e732875c6 Mon Sep 17 00:00:00 2001
From: user12986714 <65436504+user12986714@users.noreply.github.com>
Date: Wed, 2 Sep 2020 00:09:39 -0400
Subject: [PATCH 03/10] Fix CI... again
---
findspam.py | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/findspam.py b/findspam.py
index bb9cb89b5c..020abeb7ec 100644
--- a/findspam.py
+++ b/findspam.py
@@ -629,9 +629,9 @@ def new_video(s, site):
youtube_links = regex.findall(r"https:\/\/youtu\.be\/[a-zA-Z0-9]*+", s)
for link in youtube_links:
try:
- resp = Requests.get(link).text
+ resp = requests.get(link).text
date = regex.findall(r'"dateText":{"simpleText":"(Jan|Feb|Mar|Apr|May|Jun|' +
- 'Jul|Aug|Sep|Oct|Nov|Dec)[a-z]? (\d++), (\d++)"}', resp)
+ r'Jul|Aug|Sep|Oct|Nov|Dec)[a-z]? (\d++), (\d++)"}', resp)
if len(date) == 1:
# This condition should always be true, but it is here just in case
date = date[0]
From 308118c2492f25d01d040d7857a9b940dedb550f Mon Sep 17 00:00:00 2001
From: user12986714 <65436504+user12986714@users.noreply.github.com>
Date: Wed, 2 Sep 2020 00:13:54 -0400
Subject: [PATCH 04/10] Do not import the requests package twice
---
findspam.py | 1 -
1 file changed, 1 deletion(-)
diff --git a/findspam.py b/findspam.py
index 020abeb7ec..f47d52a538 100644
--- a/findspam.py
+++ b/findspam.py
@@ -11,7 +11,6 @@
import time
import os
import os.path as path
-import requests
import regex
# noinspection PyPackageRequirements
From 1fd725d05398bf4815ae37aa4c9546cbf0f76582 Mon Sep 17 00:00:00 2001
From: user12986714 <65436504+user12986714@users.noreply.github.com>
Date: Wed, 2 Sep 2020 10:31:46 -0400
Subject: [PATCH 05/10] Better regex
---
findspam.py | 5 ++++-
1 file changed, 4 insertions(+), 1 deletion(-)
diff --git a/findspam.py b/findspam.py
index f47d52a538..1240784888 100644
--- a/findspam.py
+++ b/findspam.py
@@ -625,7 +625,10 @@ def mostly_img(s, site):
@create_rule("Newly posted youtube video")
def new_video(s, site):
- youtube_links = regex.findall(r"https:\/\/youtu\.be\/[a-zA-Z0-9]*+", s)
+ # Youtube ID regex is by brunodles, https://stackoverflow.com/a/31711517
+ youtube_ids = regex.findall(r"(?:https?:\/\/)?(?:www\.)?youtu\.?be(?:\.com)?\/" +
+ r"?.*(?:watch|embed)?(?:.*v=|v\/|\/)([\w\-_]+)\&?", s)
+ youtube_links = ["https://" + x for x in youtube_ids]
for link in youtube_links:
try:
resp = requests.get(link).text
From b3d60152fa6752904bc9256f9f21bd34a801e1d5 Mon Sep 17 00:00:00 2001
From: user12986714 <65436504+user12986714@users.noreply.github.com>
Date: Wed, 2 Sep 2020 17:43:45 -0400
Subject: [PATCH 06/10] Break out scrap_and_check()
---
findspam.py | 37 ++++++++++++++++++++++---------------
1 file changed, 22 insertions(+), 15 deletions(-)
diff --git a/findspam.py b/findspam.py
index 1240784888..107310ad26 100644
--- a/findspam.py
+++ b/findspam.py
@@ -623,27 +623,34 @@ def mostly_img(s, site):
return False, ""
+def is_recent(date, now, thres):
+ return now.year == int(date[2]) and now.strftime("%b") == date[0] and now.day <= int(date[1]) + thres
+
+
+def scrap_and_check(url_list, date_regex, thres, thing):
+ now = datetime.now()
+ for link in url_list:
+ try:
+ resp = requests.get(link).text
+ date = regex.findall(date_regex, resp)
+ if len(date) == 1 and is_recent(date[0], now, thres):
+ return True, "{} is posted on {} {}, {}".format(thing, date[0], date[1], date[2])
+ except Exception:
+ pass
+ return False, ""
+
+
@create_rule("Newly posted youtube video")
def new_video(s, site):
# Youtube ID regex is by brunodles, https://stackoverflow.com/a/31711517
youtube_ids = regex.findall(r"(?:https?:\/\/)?(?:www\.)?youtu\.?be(?:\.com)?\/" +
r"?.*(?:watch|embed)?(?:.*v=|v\/|\/)([\w\-_]+)\&?", s)
youtube_links = ["https://" + x for x in youtube_ids]
- for link in youtube_links:
- try:
- resp = requests.get(link).text
- date = regex.findall(r'"dateText":{"simpleText":"(Jan|Feb|Mar|Apr|May|Jun|' +
- r'Jul|Aug|Sep|Oct|Nov|Dec)[a-z]? (\d++), (\d++)"}', resp)
- if len(date) == 1:
- # This condition should always be true, but it is here just in case
- date = date[0]
- now = datetime.now()
- if now.year == int(date[2]) and now.strftime("%b") == date[0]:
- if now.day <= int(date[1]) + OLD_VIDEO_THRES:
- return True, "Video is posted on {} {}, {}".format(date[0], date[1], date[2])
- except Exception:
- return False, ""
- return False, ""
+ return scrap_and_check(youtube_links,
+ r'"dateText":{"simpleText":"(Jan|Feb|Mar|Apr|May|Jun|' +
+ r'Jul|Aug|Sep|Oct|Nov|Dec)[a-z]? (\d++), (\d++)"}',
+ OLD_VIDEO_THRES,
+ "Video")
# noinspection PyUnusedLocal,PyMissingTypeHints
From a7762714e80882a7d463c41b0e50d992883b8033 Mon Sep 17 00:00:00 2001
From: user12986714 <65436504+user12986714@users.noreply.github.com>
Date: Wed, 2 Sep 2020 17:50:59 -0400
Subject: [PATCH 07/10] Detect medium posts too
---
findspam.py | 13 +++++++++++++
1 file changed, 13 insertions(+)
diff --git a/findspam.py b/findspam.py
index 107310ad26..814794554a 100644
--- a/findspam.py
+++ b/findspam.py
@@ -45,6 +45,7 @@
REPEATED_CHARACTER_RATIO = 0.20
IMG_TXT_R_THRES = 0.7
OLD_VIDEO_THRES = 5
+OLD_MEDIUM_POST_THRES = 7
EXCEPTION_RE = r"^Domain (.*) didn't .*!$"
RE_COMPILE = regex.compile(EXCEPTION_RE)
COMMON_MALFORMED_PROTOCOLS = [
@@ -653,6 +654,18 @@ def new_video(s, site):
"Video")
+@create_rule("Newly posted medium post")
+def new_medium_post(s, site):
+ medium_links_core = regex.findall(r"medium\.com\/@[\w-]*+\/[\w-]*+", s)
+ medium_links = ["https://" + x for x in medium_links_core]
+ return scrap_and_check(medium_links,
+ r'(Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]? ' +
+ r'(\d++), (\d++)<\/a>',
+ OLD_MEDIUM_POST_THRES,
+ "Medium post")
+
+
# noinspection PyUnusedLocal,PyMissingTypeHints
@create_rule("repeating characters in {}", stripcodeblocks=True, max_rep=10000, max_score=10000)
def has_repeating_characters(s, site):
From 51b48d92bc19244c289dc1427863a50ad940881e Mon Sep 17 00:00:00 2001
From: user12986714 <65436504+user12986714@users.noreply.github.com>
Date: Thu, 3 Sep 2020 16:22:40 -0400
Subject: [PATCH 08/10] Minor bug fix
---
findspam.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/findspam.py b/findspam.py
index 814794554a..da8081b1dd 100644
--- a/findspam.py
+++ b/findspam.py
@@ -646,7 +646,7 @@ def new_video(s, site):
# Youtube ID regex is by brunodles, https://stackoverflow.com/a/31711517
youtube_ids = regex.findall(r"(?:https?:\/\/)?(?:www\.)?youtu\.?be(?:\.com)?\/" +
r"?.*(?:watch|embed)?(?:.*v=|v\/|\/)([\w\-_]+)\&?", s)
- youtube_links = ["https://" + x for x in youtube_ids]
+ youtube_links = ["https://youtu.be/" + x for x in youtube_ids]
return scrap_and_check(youtube_links,
r'"dateText":{"simpleText":"(Jan|Feb|Mar|Apr|May|Jun|' +
r'Jul|Aug|Sep|Oct|Nov|Dec)[a-z]? (\d++), (\d++)"}',
From 08cbab42a7fdf7ead8a0e9ae1cc8c5f55563aee3 Mon Sep 17 00:00:00 2001
From: user12986714 <65436504+user12986714@users.noreply.github.com>
Date: Sun, 6 Sep 2020 10:23:02 -0400
Subject: [PATCH 09/10] Fix list index out of range
---
findspam.py | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/findspam.py b/findspam.py
index da8081b1dd..5005fdab0e 100644
--- a/findspam.py
+++ b/findspam.py
@@ -635,7 +635,8 @@ def scrap_and_check(url_list, date_regex, thres, thing):
resp = requests.get(link).text
date = regex.findall(date_regex, resp)
if len(date) == 1 and is_recent(date[0], now, thres):
- return True, "{} is posted on {} {}, {}".format(thing, date[0], date[1], date[2])
+ return True, "{} is posted on {} {}, {}".format(thing, date[0][0],
+ date[0][1], date[0][2])
except Exception:
pass
return False, ""
From 0a94542615479ce1bfc833af0c00b4a05d0f02ab Mon Sep 17 00:00:00 2001
From: user12986714 <65436504+user12986714@users.noreply.github.com>
Date: Wed, 9 Sep 2020 11:33:42 -0400
Subject: [PATCH 10/10] Detect more medium links
---
findspam.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/findspam.py b/findspam.py
index 5005fdab0e..c0fd0a9910 100644
--- a/findspam.py
+++ b/findspam.py
@@ -657,7 +657,7 @@ def new_video(s, site):
@create_rule("Newly posted medium post")
def new_medium_post(s, site):
- medium_links_core = regex.findall(r"medium\.com\/@[\w-]*+\/[\w-]*+", s)
+ medium_links_core = regex.findall(r"medium\.com\/@?[\w-]*+\/[\w-]*+", s)
medium_links = ["https://" + x for x in medium_links_core]
return scrap_and_check(medium_links,
r'