Commit

Merge pull request #39 from Slashdacoda/master
merge from master to update
Slashdacoda authored Jul 28, 2021
2 parents 9e95b9d + 6a21339 commit a32cd84
Showing 21 changed files with 553 additions and 226 deletions.
12 changes: 12 additions & 0 deletions .gitignore
@@ -16,7 +16,19 @@ AapFactCheckingSiteExtractor_extraction_failed.log
FatabyyanoFactCheckingSiteExtractor_extraction_failed.log
VishvasnewsFactCheckingSiteExtractor_extraction_failed.log
SnopesFactCheckingSiteExtractor_extraction_failed.log
EuvsdisinfoFactCheckingSiteExtractor_extraction_failed.log
PolitifactFactCheckingSiteExtractor_extraction_failed.log
TruthorfictionFactCheckingSiteExtractor_extraction_failed.log
CheckyourfactFactCheckingSiteExtractor_extraction_failed.log
AfricacheckFactCheckingSiteExtractor_extraction_failed.log
AfpfactcheckFactCheckingSiteExtractor_extraction_failed.log
output_dev_fatabyyano.csv
output_dev_vishvasnews.csv
output_dev_aap.csv
output_dev_fullfact.csv
output_dev_snopes.csv
output_dev_politifact.csv
output_dev_truthorfiction.csv
output_dev_checkyourfact.csv
output_dev_africacheck.csv
output_dev_afpfactcheck.csv
4 changes: 2 additions & 2 deletions Exporter.py
@@ -38,8 +38,8 @@ def main(argv):
if opt == '--maxclaims':
criteria.maxClaims = int(arg)
if criteria.website != "":
criteria.setOutputDev("output_dev_" + criteria.website + ".csv")
criteria.setOutputSample("output_sample_" + criteria.website + ".csv")
criteria.setOutputDev("samples/output_dev_" + criteria.website + ".csv")
criteria.setOutputSample("samples/output_sample_" + criteria.website + ".csv")
if opt == '--annotation-api':
criteria.annotator_uri = arg

2 changes: 1 addition & 1 deletion claim_extractor/extractors/__init__.py
@@ -61,7 +61,7 @@ def __init__(self, configuration: Configuration = Configuration(), ignore_urls:
self.configuration = configuration
self.ignore_urls = configuration.avoid_urls
self.language = language
self.failed_log = open(self.__class__.__name__ + "_extraction_failed.log", "w")
self.failed_log = open("failed/" + self.__class__.__name__ + "_extraction_failed.log", "w")
self.annotator = EntityFishingAnnotator(configuration.annotator_uri)

def get_all_claims(self):
11 changes: 8 additions & 3 deletions claim_extractor/extractors/afpfactcheck.py
@@ -49,6 +49,8 @@ def retrieve_urls(self, parsed_listing_page: BeautifulSoup, listing_page_url: st
str]:
urls = self.extract_urls(parsed_listing_page)
for page_number in trange(1, number_of_pages):
if ((page_number*15) + 14 >= self.configuration.maxClaims):
break
url = listing_page_url + "?page=" + str(int(page_number))
page = caching.get(url, headers=self.headers, timeout=20)
current_parsed_listing_page = BeautifulSoup(page, "lxml")
@@ -102,6 +104,8 @@ def extract_claim_and_review(self, parsed_claim_review_page: BeautifulSoup, url:
claim.set_date(data['@graph'][0]['itemReviewed']['datePublished'])
except Exception:
pass
except KeyError:
pass

try:
date = data['@graph'][0]['datePublished']
@@ -118,9 +122,10 @@ def extract_claim_and_review(self, parsed_claim_review_page: BeautifulSoup, url:
try:
if child.name == 'aside':
continue
elems = child.findAll('a')
for elem in elems:
links.append(elem['href'])
if (child != "\n" and not " " ):
elems = child.findAll('a')
for elem in elems:
links.append(elem['href'])
except Exception as e:
continue
claim.set_refered_links(links)
269 changes: 139 additions & 130 deletions claim_extractor/extractors/africacheck.py

Large diffs are not rendered by default.

5 changes: 4 additions & 1 deletion claim_extractor/extractors/checkyourfact.py
@@ -19,11 +19,14 @@ def retrieve_listing_page_urls(self) -> List[str]:
return ["https://checkyourfact.com/page/1/"]

def find_page_count(self, parsed_listing_page: BeautifulSoup) -> int:
count = 26
count = 1
url = "https://checkyourfact.com/page/" + str(count + 1)
result = caching.get(url, headers=self.headers, timeout=10)
if result:
while result:
# each page 20 articles:
if (((count+1)*20)-20 >= self.configuration.maxClaims):
break
count += 1
url = "https://checkyourfact.com/page/" + str(count)
result = caching.get(url, headers=self.headers, timeout=10)
46 changes: 38 additions & 8 deletions claim_extractor/extractors/euvsdisinfo.py
@@ -14,12 +14,33 @@
class EuvsdisinfoFactCheckingSiteExtractor(FactCheckingSiteExtractor):

def retrieve_listing_page_urls(self) -> List[str]:
data = caching.get('https://euvsdisinfo.eu/disinformation-cases')
soup = BeautifulSoup(data, 'html.parser')
nb = self.find_page_count(soup)
links = []
for x in range(0, int(nb/10)):
links.append('https://euvsdisinfo.eu/disinformation-cases/?offset='+str(x*10))
different_categories_value = ["disinformation-cases"]
url_begins = [
"https://euvsdisinfo.eu/",
"https://euvsdisinfo.eu/ru/",
"https://euvsdisinfo.eu/it/",
"https://euvsdisinfo.eu/es/",
"https://euvsdisinfo.eu/fr/",
"https://euvsdisinfo.eu/de/"]


for url in url_begins:
for value in different_categories_value:
#different_urls.append(url + value + "/")
# data = caching.get(f""+ url + value + "/")
data = caching.get("https://euvsdisinfo.eu/disinformation-cases/", headers=self.headers, timeout=15)
soup = BeautifulSoup(data, 'html.parser')
nb = self.find_page_count(soup)
for x in range(0, int(nb/10)):
links.append(url + value + '/?offset=' + str(x*10))

# data = caching.get('https://euvsdisinfo.eu/disinformation-cases')
# soup = BeautifulSoup(data, 'html.parser')
# nb = self.find_page_count(soup)
# links = []
# for x in range(0, int(nb/10)):
# links.append('https://euvsdisinfo.eu/disinformation-cases/?offset='+str(x*10))
return links

def find_page_count(self, parsed_listing_page: BeautifulSoup) -> int:
@@ -30,10 +51,19 @@ def find_page_count(self, parsed_listing_page: BeautifulSoup) -> int:

def retrieve_urls(self, parsed_listing_page: BeautifulSoup, listing_page_url: str, number_of_pages: int) -> Set[str]:
urls = []
tmp_counter = 0
elems = parsed_listing_page.findAll('div', {'class':'disinfo-db-post'})
for elem in elems:
url = elem.find('a')
urls.append(url['href'])

if self.configuration.maxClaims >= 1:
for elem in elems:
if tmp_counter < self.configuration.maxClaims:
tmp_counter += 1
url = elem.find('a')
urls.append(url['href'])
else:
for elem in elems:
url = elem.find('a')
urls.append(url['href'])
return urls

def extract_claim_and_review(self, parsed_claim_review_page: BeautifulSoup, url: str) -> List[Claim]:
107 changes: 88 additions & 19 deletions claim_extractor/extractors/politifact.py
@@ -27,14 +27,24 @@ def retrieve_urls(self, parsed_listing_page: BeautifulSoup, listing_page_url: st
-> List[str]:
urls = self.extract_urls(parsed_listing_page)
page_number = 2
while True:
while True and ((page_number*30) <= self.configuration.maxClaims):
url = listing_page_url + "?page=" + str(page_number)
page = caching.get(url, headers=self.headers, timeout=5)
if not page:
if page is not None:
current_parsed_listing_page = BeautifulSoup(page, "lxml")
else:
break
current_parsed_listing_page = BeautifulSoup(page, "lxml")
urls += self.extract_urls(current_parsed_listing_page)
return urls

nav_buttons = current_parsed_listing_page.find_all("section", attrs={'class': 't-row'})
nav_buttons = nav_buttons[-1].find_all("li", attrs={'class': 'm-list__item'})

if len(nav_buttons) == 1:
break
else:
urls += self.extract_urls(current_parsed_listing_page)
page_number += 1
#print("\rr: " + url)
return urls

def extract_urls(self, parsed_listing_page: BeautifulSoup):
urls = list()
@@ -54,16 +64,17 @@ def extract_urls(self, parsed_listing_page: BeautifulSoup):
def extract_claim_and_review(self, parsed_claim_review_page: BeautifulSoup, url: str) -> List[Claim]:
claim = Claim()
claim.set_url(url)
claim.set_source("politifact")
#print("\r" + url)

claim.set_source("politifact")

# Claim
title = parsed_claim_review_page.find("div", {"class": "m-statement__quote"})
claim.set_claim(title.text)
claim.set_claim(title.text.strip())

# title
title = parsed_claim_review_page.find("h2", {"class": "c-title"})
claim.set_title(title.text)
claim.set_title(title.text.strip())

# date
date = parsed_claim_review_page.find('span', {"class": "m-author__date"})
@@ -72,24 +83,69 @@ def extract_claim_and_review(self, parsed_claim_review_page: BeautifulSoup, url:
claim.set_date(date_str)

# rating
# https://static.politifact.com/politifact/rulings/meter-mostly-false.jpg
statement_body=parsed_claim_review_page.find("div", {"class", "m-statement__body"})
statement_detail = statement_body.find("div", {"class", "c-image"})
statement_detail_image=statement_detail.find("picture")
statement_detail_image_alt=statement_detail_image.find("img",{"class", "c-image__original"})
if statement_detail_image_alt:
claim.alternate_name = statement_detail_image_alt['alt']
#claim.alternate_name = statement_detail_image_alt['src'].split("rulings/")[1].split(".jpg")[0]
if self.translate_rating_value(statement_detail_image_alt['alt']) != "":
claim.rating = self.translate_rating_value(statement_detail_image_alt['alt'])
else:
claim.rating = statement_detail_image_alt['alt']

# body
body = parsed_claim_review_page.find("article", {"class": "m-textblock"})
claim.set_body(body.get_text())
#body.find("div", {"class": "artembed"}).decompose()
#claim.set_body(body.get_text())





text =""
if parsed_claim_review_page.select( 'main > section > div.t-row__center > article.m-textblock' ):
for child in parsed_claim_review_page.select( 'main > section > div.t-row__center > article.m-textblock' ):
for element in child.contents:
if (element.name == "div"):
valid = True
# check for illegal JS element in artembed (tag):
if (hasattr( element, 'class' )):
try:
if ('class' in element.attrs):
if (element.attrs['class'][0] == "artembed"):
if (element.text.startswith("\r\nwindow.gciAnalyticsUAID")):
valid = False
except KeyError:
print("KeyError: Skip")
else:
valid = True
if hasattr( element, 'text' ):
#if (element.text == "We rate this claim False." and url == "https://www.politifact.com/staff/kelsey-tamakloe/"):
if (url == "https://www.politifact.com/staff/kelsey-tamakloe/"):
print("\r" + str(element.text))
if (valid == True):
if (element):
if (hasattr( element, 'text' )):
text += " " + str(element.text)
else:
text += " " + str(element)

body_description = text.strip()
claim.body = str(body_description).strip()

# author
statement_meta = parsed_claim_review_page.find("div", {"class": "m-statement__meta"})
if statement_meta:
author = statement_meta.find("a").text
author_meta = parsed_claim_review_page.find("div", {"class": "m-author__content"})
if author_meta:
author = author_meta.find("a").text
claim.set_author(author)
author_url = author_meta.find("a")
if author_url.attrs["href"] != "":
claim.author_url = "https://www.politifact.com" + author_url.attrs["href"]

# date published
statement_meta = parsed_claim_review_page.find("div", {"class": "m-statement__meta"})
if statement_meta:
meta_text = statement_meta.text
if "on" in meta_text:
@@ -100,13 +156,16 @@ def extract_claim_and_review(self, parsed_claim_review_page: BeautifulSoup, url:
date = search_dates(meta_text)
if date:
date = date[0][1].strftime("%Y-%m-%d")
claim.setDatePublished(date)
claim.date = date

# related links
div_tag = parsed_claim_review_page.find("article", {"class": "m-textblock"})
related_links = []
for link in body.find_all('a', href=True):
related_links.append(link['href'])
if (link['href'][0] == "/"):
related_links.append("https://www.politifact.com" + link['href'])
else:
related_links.append(link['href'])
claim.set_refered_links(related_links)

claim.set_claim(parsed_claim_review_page.find("div", {"class": "m-statement__quote"}).text.strip())
@@ -129,7 +188,17 @@ def extract_claim_and_review(self, parsed_claim_review_page: BeautifulSoup, url:

return [claim]





def translate_rating_value(self, initial_rating_value: str) -> str:
dictionary = {
"true": "True",
"mostly-true": "Mostly True",
"half-true": "Half False",
"barely-true": "Mostly False",
"false": "False",
"pants-fire": "Pants on Fire"
}

if initial_rating_value in dictionary:
return dictionary[initial_rating_value]
else:
return ""
2 changes: 2 additions & 0 deletions claim_extractor/extractors/snopes.py
@@ -141,6 +141,8 @@ def extract_claim_and_review(self, parsed_claim_review_page: BeautifulSoup, url:
if parsed_claim_review_page.select( 'article > div > div > div > div.media-body > span' ):
for rating_span in parsed_claim_review_page.select( 'article > div > div > div > div.media-body > span' ):
rating = rating_span.text.strip()
if (rating != ""):
break
claim.rating = str(rating).replace('"', "").strip()
# claim.set_rating_value( rating )

22 changes: 13 additions & 9 deletions claim_extractor/extractors/truthorfiction.py
@@ -18,16 +18,21 @@ def retrieve_listing_page_urls(self) -> List[str]:
return ["https://www.truthorfiction.com/category/fact-checks/"]

def find_page_count(self, parsed_listing_page: BeautifulSoup) -> int:
page_nav = parsed_listing_page.find("div", {"class": "nav-links"})
last_page_link = page_nav.findAll("a")[-2]['href']
page_nav = parsed_listing_page.find("div", {"class": "nav-previous"})
last_page_link = page_nav.findAll("a")[0]['href']
page_re = re.compile("https://www.truthorfiction.com/category/fact-checks/page/([0-9]+)/")
max_page = int(page_re.match(last_page_link).group(1))
if (max_page >= 2) and ((max_page*10) <= self.configuration.maxClaims):
page = caching.get(last_page_link, headers=self.headers, timeout=5)
if page:
parsed_listing_page = BeautifulSoup(page, self.configuration.parser_engine)
max_page = self.find_page_count(parsed_listing_page)
return max_page

def retrieve_urls(self, parsed_listing_page: BeautifulSoup, listing_page_url: str, number_of_pages: int) \
-> List[str]:
urls = self.extract_urls(parsed_listing_page)
for page_number in tqdm(range(2, number_of_pages)):
urls = [] #self.extract_urls(parsed_listing_page)
for page_number in tqdm(range(1, number_of_pages)):
url = "https://www.truthorfiction.com/category/fact-checks/page/" + str(page_number) + "/"
page = caching.get(url, headers=self.headers, timeout=20)
current_parsed_listing_page = BeautifulSoup(page, "lxml")
@@ -36,10 +41,9 @@ def retrieve_urls(self, parsed_listing_page: BeautifulSoup, listing_page_url: st

def extract_urls(self, parsed_listing_page: BeautifulSoup):
urls = list()
listing_container = parsed_listing_page.find("div", {"class": "ast-row"})
titles = listing_container.findAll("h2")
for title in titles:
anchor = title.find("a")
listing_container = parsed_listing_page.find_all("article", {"class": "post"})
for article in listing_container:
anchor = article.find("a")
url = str(anchor['href'])
max_claims = self.configuration.maxClaims
if 0 < max_claims <= len(urls):