From 91d2566b1b6311c5764d0f391358079ab39bad5b Mon Sep 17 00:00:00 2001 From: user12986714 <65436504+user12986714@users.noreply.github.com> Date: Tue, 1 Sep 2020 23:59:06 -0400 Subject: [PATCH 01/10] Add reason newly posted youtube video --- findspam.py | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/findspam.py b/findspam.py index 3bb2a7dafd..c91b3b0be7 100644 --- a/findspam.py +++ b/findspam.py @@ -11,6 +11,7 @@ import time import os import os.path as path +import requests import regex # noinspection PyPackageRequirements @@ -44,6 +45,7 @@ PUNCTUATION_RATIO = 0.42 REPEATED_CHARACTER_RATIO = 0.20 IMG_TXT_R_THRES = 0.7 +OLD_VIDEO_THRES = 5 EXCEPTION_RE = r"^Domain (.*) didn't .*!$" RE_COMPILE = regex.compile(EXCEPTION_RE) COMMON_MALFORMED_PROTOCOLS = [ @@ -622,6 +624,25 @@ def mostly_img(s, site): return False, "" +@create_rule("Newly posted youtube video") +def new_video(s, site): + youtube_links = regex.findall(r"https:\/\/youtu\.be\/[a-zA-Z0-9]*+", s) + for link in youtube_links: + try: + resp = Requests.get(link).text + date = regex.findall(r'"dateText":{"simpleText":"(Jan|Feb|Mar|Apr|May|Jun|' + + 'Jul|Aug|Sep|Oct|Nov|Dec)[a-z]? (\d++), (\d++)"}', resp) + if len(date) == 1: + # This condition should always be true, but it is here just in case + date = date[0] + now = datetime.now() + if now.year == int(date[2]) and now.strftime("%b") == date[0]: + if now.day <= int(date[1]) + OLD_VIDEO_THRES: + return True, "Video is posted on {} {}, {}".format(date[0], date[1], date[2]) + except Exception: + return False, "" + return False, "" + # noinspection PyUnusedLocal,PyMissingTypeHints @create_rule("repeating characters in {}", stripcodeblocks=True, max_rep=10000, max_score=10000) def has_repeating_characters(s, site): From 27dc15a3320c64aef26fc7ec8d90a3b577a137af Mon Sep 17 00:00:00 2001 From: user12986714 <65436504+user12986714@users.noreply.github.com> Date: Wed, 2 Sep 2020 00:00:48 -0400 Subject: [PATCH 02/10] Fix CI --- findspam.py | 1 + 1 file changed, 1 insertion(+) diff --git a/findspam.py b/findspam.py index c91b3b0be7..bb9cb89b5c 100644 --- a/findspam.py +++ b/findspam.py @@ -643,6 +643,7 @@ def new_video(s, site): return False, "" return False, "" + # noinspection PyUnusedLocal,PyMissingTypeHints @create_rule("repeating characters in {}", stripcodeblocks=True, max_rep=10000, max_score=10000) def has_repeating_characters(s, site): From 958881afe3c3b02be069921f9239aa9e732875c6 Mon Sep 17 00:00:00 2001 From: user12986714 <65436504+user12986714@users.noreply.github.com> Date: Wed, 2 Sep 2020 00:09:39 -0400 Subject: [PATCH 03/10] Fix CI... again --- findspam.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/findspam.py b/findspam.py index bb9cb89b5c..020abeb7ec 100644 --- a/findspam.py +++ b/findspam.py @@ -629,9 +629,9 @@ def new_video(s, site): youtube_links = regex.findall(r"https:\/\/youtu\.be\/[a-zA-Z0-9]*+", s) for link in youtube_links: try: - resp = Requests.get(link).text + resp = requests.get(link).text date = regex.findall(r'"dateText":{"simpleText":"(Jan|Feb|Mar|Apr|May|Jun|' + - 'Jul|Aug|Sep|Oct|Nov|Dec)[a-z]? (\d++), (\d++)"}', resp) + r'Jul|Aug|Sep|Oct|Nov|Dec)[a-z]? (\d++), (\d++)"}', resp) if len(date) == 1: # This condition should always be true, but it is here just in case date = date[0] From 308118c2492f25d01d040d7857a9b940dedb550f Mon Sep 17 00:00:00 2001 From: user12986714 <65436504+user12986714@users.noreply.github.com> Date: Wed, 2 Sep 2020 00:13:54 -0400 Subject: [PATCH 04/10] Not to double import requests package --- findspam.py | 1 - 1 file changed, 1 deletion(-) diff --git a/findspam.py b/findspam.py index 020abeb7ec..f47d52a538 100644 --- a/findspam.py +++ b/findspam.py @@ -11,7 +11,6 @@ import time import os import os.path as path -import requests import regex # noinspection PyPackageRequirements From 1fd725d05398bf4815ae37aa4c9546cbf0f76582 Mon Sep 17 00:00:00 2001 From: user12986714 <65436504+user12986714@users.noreply.github.com> Date: Wed, 2 Sep 2020 10:31:46 -0400 Subject: [PATCH 05/10] Better regex --- findspam.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/findspam.py b/findspam.py index f47d52a538..1240784888 100644 --- a/findspam.py +++ b/findspam.py @@ -625,7 +625,10 @@ def mostly_img(s, site): @create_rule("Newly posted youtube video") def new_video(s, site): - youtube_links = regex.findall(r"https:\/\/youtu\.be\/[a-zA-Z0-9]*+", s) + # Youtube ID regex is by brunodles, https://stackoverflow.com/a/31711517 + youtube_ids = regex.findall(r"(?:https?:\/\/)?(?:www\.)?youtu\.?be(?:\.com)?\/" + + r"?.*(?:watch|embed)?(?:.*v=|v\/|\/)([\w\-_]+)\&?", s) + youtube_links = ["https://" + x for x in youtube_ids] for link in youtube_links: try: resp = requests.get(link).text From b3d60152fa6752904bc9256f9f21bd34a801e1d5 Mon Sep 17 00:00:00 2001 From: user12986714 <65436504+user12986714@users.noreply.github.com> Date: Wed, 2 Sep 2020 17:43:45 -0400 Subject: [PATCH 06/10] Break out scrap_and_check() --- findspam.py | 37 ++++++++++++++++++++++--------------- 1 file changed, 22 insertions(+), 15 deletions(-) diff --git a/findspam.py b/findspam.py index 1240784888..107310ad26 100644 --- a/findspam.py +++ b/findspam.py @@ -623,27 +623,34 @@ def mostly_img(s, site): return False, "" +def is_recent(date, now, thres): + return now.year == int(date[2]) and now.strftime("%b") == date[0] and now.day <= int(date[1]) + thres + + +def scrap_and_check(url_list, date_regex, thres, thing): + now = datetime.now() + for link in url_list: + try: + resp = requests.get(link).text + date = regex.findall(date_regex, resp) + if len(date) == 1 and is_recent(date[0], now, thres): + return True, "{} is posted on {} {}, {}".format(thing, date[0], date[1], date[2]) + except Exception: + pass + return False, "" + + @create_rule("Newly posted youtube video") def new_video(s, site): # Youtube ID regex is by brunodles, https://stackoverflow.com/a/31711517 youtube_ids = regex.findall(r"(?:https?:\/\/)?(?:www\.)?youtu\.?be(?:\.com)?\/" + r"?.*(?:watch|embed)?(?:.*v=|v\/|\/)([\w\-_]+)\&?", s) youtube_links = ["https://" + x for x in youtube_ids] - for link in youtube_links: - try: - resp = requests.get(link).text - date = regex.findall(r'"dateText":{"simpleText":"(Jan|Feb|Mar|Apr|May|Jun|' + - r'Jul|Aug|Sep|Oct|Nov|Dec)[a-z]? (\d++), (\d++)"}', resp) - if len(date) == 1: - # This condition should always be true, but it is here just in case - date = date[0] - now = datetime.now() - if now.year == int(date[2]) and now.strftime("%b") == date[0]: - if now.day <= int(date[1]) + OLD_VIDEO_THRES: - return True, "Video is posted on {} {}, {}".format(date[0], date[1], date[2]) - except Exception: - return False, "" - return False, "" + return scrap_and_check(youtube_links, + r'"dateText":{"simpleText":"(Jan|Feb|Mar|Apr|May|Jun|' + + r'Jul|Aug|Sep|Oct|Nov|Dec)[a-z]? (\d++), (\d++)"}', + OLD_VIDEO_THRES, + "Video") # noinspection PyUnusedLocal,PyMissingTypeHints From a7762714e80882a7d463c41b0e50d992883b8033 Mon Sep 17 00:00:00 2001 From: user12986714 <65436504+user12986714@users.noreply.github.com> Date: Wed, 2 Sep 2020 17:50:59 -0400 Subject: [PATCH 07/10] Detect medium posts too --- findspam.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/findspam.py b/findspam.py index 107310ad26..814794554a 100644 --- a/findspam.py +++ b/findspam.py @@ -45,6 +45,7 @@ REPEATED_CHARACTER_RATIO = 0.20 IMG_TXT_R_THRES = 0.7 OLD_VIDEO_THRES = 5 +OLD_MEDIUM_POST_THRES = 7 EXCEPTION_RE = r"^Domain (.*) didn't .*!$" RE_COMPILE = regex.compile(EXCEPTION_RE) COMMON_MALFORMED_PROTOCOLS = [ @@ -653,6 +654,18 @@ def new_video(s, site): "Video") +@create_rule("Newly posted medium post") +def new_medium_post(s, site): + medium_links_core = regex.findall(r"medium\.com\/@[\w-]*+\/[\w-]*+", s) + medium_links = ["https://" + x for x in medium_links_core] + return scrap_and_check(medium_links, + r'(Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]? ' + + r'(\d++), (\d++)<\/a>', + OLD_MEDIUM_POST_THRES, + "Medium post") + + # noinspection PyUnusedLocal,PyMissingTypeHints @create_rule("repeating characters in {}", stripcodeblocks=True, max_rep=10000, max_score=10000) def has_repeating_characters(s, site): From 51b48d92bc19244c289dc1427863a50ad940881e Mon Sep 17 00:00:00 2001 From: user12986714 <65436504+user12986714@users.noreply.github.com> Date: Thu, 3 Sep 2020 16:22:40 -0400 Subject: [PATCH 08/10] Minor bug fix --- findspam.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/findspam.py b/findspam.py index 814794554a..da8081b1dd 100644 --- a/findspam.py +++ b/findspam.py @@ -646,7 +646,7 @@ def new_video(s, site): # Youtube ID regex is by brunodles, https://stackoverflow.com/a/31711517 youtube_ids = regex.findall(r"(?:https?:\/\/)?(?:www\.)?youtu\.?be(?:\.com)?\/" + r"?.*(?:watch|embed)?(?:.*v=|v\/|\/)([\w\-_]+)\&?", s) - youtube_links = ["https://" + x for x in youtube_ids] + youtube_links = ["https://youtu.be/" + x for x in youtube_ids] return scrap_and_check(youtube_links, r'"dateText":{"simpleText":"(Jan|Feb|Mar|Apr|May|Jun|' + r'Jul|Aug|Sep|Oct|Nov|Dec)[a-z]? (\d++), (\d++)"}', From 08cbab42a7fdf7ead8a0e9ae1cc8c5f55563aee3 Mon Sep 17 00:00:00 2001 From: user12986714 <65436504+user12986714@users.noreply.github.com> Date: Sun, 6 Sep 2020 10:23:02 -0400 Subject: [PATCH 09/10] Fix list index out of range --- findspam.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/findspam.py b/findspam.py index da8081b1dd..5005fdab0e 100644 --- a/findspam.py +++ b/findspam.py @@ -635,7 +635,8 @@ def scrap_and_check(url_list, date_regex, thres, thing): resp = requests.get(link).text date = regex.findall(date_regex, resp) if len(date) == 1 and is_recent(date[0], now, thres): - return True, "{} is posted on {} {}, {}".format(thing, date[0], date[1], date[2]) + return True, "{} is posted on {} {}, {}".format(thing, date[0][0], + date[0][1], date[0][2]) except Exception: pass return False, "" From 0a94542615479ce1bfc833af0c00b4a05d0f02ab Mon Sep 17 00:00:00 2001 From: user12986714 <65436504+user12986714@users.noreply.github.com> Date: Wed, 9 Sep 2020 11:33:42 -0400 Subject: [PATCH 10/10] Detect more medium links --- findspam.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/findspam.py b/findspam.py index 5005fdab0e..c0fd0a9910 100644 --- a/findspam.py +++ b/findspam.py @@ -657,7 +657,7 @@ def new_video(s, site): @create_rule("Newly posted medium post") def new_medium_post(s, site): - medium_links_core = regex.findall(r"medium\.com\/@[\w-]*+\/[\w-]*+", s) + medium_links_core = regex.findall(r"medium\.com\/@?[\w-]*+\/[\w-]*+", s) medium_links = ["https://" + x for x in medium_links_core] return scrap_and_check(medium_links, r'