
Commit

Merge pull request #44 from Slashdacoda/master
merge from master to update
Slashdacoda authored Jul 28, 2021
2 parents d1eab02 + 2d3d04e commit c31e526
Showing 20 changed files with 362 additions and 266 deletions.
4 changes: 4 additions & 0 deletions .gitignore
@@ -20,6 +20,8 @@ EuvsdisinfoFactCheckingSiteExtractor_extraction_failed.log
PolitifactFactCheckingSiteExtractor_extraction_failed.log
TruthorfictionFactCheckingSiteExtractor_extraction_failed.log
CheckyourfactFactCheckingSiteExtractor_extraction_failed.log
AfricacheckFactCheckingSiteExtractor_extraction_failed.log
AfpfactcheckFactCheckingSiteExtractor_extraction_failed.log
output_dev_fatabyyano.csv
output_dev_vishvasnews.csv
output_dev_aap.csv
@@ -28,3 +30,5 @@ output_dev_snopes.csv
output_dev_politifact.csv
output_dev_truthorfiction.csv
output_dev_checkyourfact.csv
output_dev_africacheck.csv
output_dev_afpfactcheck.csv
4 changes: 2 additions & 2 deletions Exporter.py
@@ -38,8 +38,8 @@ def main(argv):
if opt == '--maxclaims':
criteria.maxClaims = int(arg)
if criteria.website != "":
criteria.setOutputDev("output_dev_" + criteria.website + ".csv")
criteria.setOutputSample("output_sample_" + criteria.website + ".csv")
criteria.setOutputDev("samples/output_dev_" + criteria.website + ".csv")
criteria.setOutputSample("samples/output_sample_" + criteria.website + ".csv")
if opt == '--annotation-api':
criteria.annotator_uri = arg

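The relocated CSV paths write into a samples/ subdirectory, which this hunk assumes already exists: Python's open() does not create parent directories when writing samples/output_dev_&lt;website&gt;.csv. A minimal guard, sketched here and not part of this commit:

import os

# Hypothetical setup step (not in this commit): create samples/ before the
# exporter opens its CSV files there; exist_ok avoids an error when the
# directory is already present.
os.makedirs("samples", exist_ok=True)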
2 changes: 1 addition & 1 deletion claim_extractor/extractors/__init__.py
@@ -61,7 +61,7 @@ def __init__(self, configuration: Configuration = Configuration(), ignore_urls:
self.configuration = configuration
self.ignore_urls = configuration.avoid_urls
self.language = language
self.failed_log = open(self.__class__.__name__ + "_extraction_failed.log", "w")
self.failed_log = open("failed/" + self.__class__.__name__ + "_extraction_failed.log", "w")
self.annotator = EntityFishingAnnotator(configuration.annotator_uri)

def get_all_claims(self):
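As with samples/ above, the relocated failure log assumes a failed/ directory exists, while self.__class__.__name__ keeps one log per extractor subclass. A sketch of the resulting behavior; the makedirs guard is an assumption, not part of this commit:

import os

class AfricacheckFactCheckingSiteExtractor:  # illustrative subclass name from this repo
    def __init__(self):
        # Resolves to failed/AfricacheckFactCheckingSiteExtractor_extraction_failed.log
        log_path = "failed/" + self.__class__.__name__ + "_extraction_failed.log"
        os.makedirs("failed", exist_ok=True)  # hypothetical guard, not in the commit
        self.failed_log = open(log_path, "w")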
11 changes: 8 additions & 3 deletions claim_extractor/extractors/afpfactcheck.py
@@ -49,6 +49,8 @@ def retrieve_urls(self, parsed_listing_page: BeautifulSoup, listing_page_url: st
str]:
urls = self.extract_urls(parsed_listing_page)
for page_number in trange(1, number_of_pages):
if ((page_number*15) + 14 >= self.configuration.maxClaims):
break
url = listing_page_url + "?page=" + str(int(page_number))
page = caching.get(url, headers=self.headers, timeout=20)
current_parsed_listing_page = BeautifulSoup(page, "lxml")
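The new break stops pagination once the next page could exceed the --maxclaims budget, on the assumption (inferred from the arithmetic, not stated in the code) that each AFP listing page holds 15 claims. A worked check of the bound:

# With maxClaims = 50: page 1 covers items 15..29, page 2 covers 30..44,
# and page 3 would cover 45..59; 3*15 + 14 = 59 >= 50, so the loop stops
# before fetching page 3.
maxClaims = 50
for page_number in range(1, 10):
    if page_number * 15 + 14 >= maxClaims:
        break
    print("fetching page", page_number)  # prints pages 1 and 2 only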
@@ -102,6 +104,8 @@ def extract_claim_and_review(self, parsed_claim_review_page: BeautifulSoup, url:
claim.set_date(data['@graph'][0]['itemReviewed']['datePublished'])
except Exception:
pass
except KeyError:
pass

try:
date = data['@graph'][0]['datePublished']
Expand All @@ -118,9 +122,10 @@ def extract_claim_and_review(self, parsed_claim_review_page: BeautifulSoup, url:
try:
if child.name == 'aside':
continue
elems = child.findAll('a')
for elem in elems:
links.append(elem['href'])
if child != "\n" and child != " ":
elems = child.findAll('a')
for elem in elems:
links.append(elem['href'])
except Exception as e:
continue
claim.set_refered_links(links)
269 changes: 139 additions & 130 deletions claim_extractor/extractors/africacheck.py

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion claim_extractor/extractors/fullfact.py
@@ -219,7 +219,7 @@ def extract_claim_and_review(self, parsed_claim_review_page: BeautifulSoup, url:

# Create multiple claims from the main one, then change the claim text and verdict (rating) for each:
c = 0
while c < len(claim_text_list)-1:
while c <= len(claim_text_list)-1:
claims.append(claim)
claims[c].claim = claim_text_list[c]
claims[c].rating = claim_verdict_list[c]
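The loosened bound makes the loop visit every index of claim_text_list (c &lt;= len(...) - 1 is equivalent to c &lt; len(...)). Separately, and noted only as an observation since just this hunk is visible: claims.append(claim) appends the same object on every pass, so all entries alias one claim and end up holding the last text and verdict. A self-contained sketch of an aliasing-free variant, with Claim as a stand-in class:

import copy

class Claim:  # stand-in for the repo's Claim class, for illustration only
    claim = ""
    rating = ""

claim = Claim()
claim_text_list = ["claim A", "claim B"]
claim_verdict_list = ["correct", "incorrect"]

claims = []
c = 0
while c <= len(claim_text_list) - 1:
    claims.append(copy.deepcopy(claim))  # fresh object per entry
    claims[c].claim = claim_text_list[c]
    claims[c].rating = claim_verdict_list[c]
    c += 1

assert claims[0].claim == "claim A"  # earlier entries keep their own text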
23 changes: 18 additions & 5 deletions claim_extractor/extractors/truthorfiction.py
@@ -63,12 +63,22 @@ def extract_claim_and_review(self, parsed_claim_review_page: BeautifulSoup, url:
article = parsed_claim_review_page.find("article")

# date

date_ = parsed_claim_review_page.find('meta', {"property": "article:published_time"})['content']
if date_:
date_str = date_.split("T")[0]
claim.set_date(date_str)

# author
author_ = parsed_claim_review_page.find('meta', {"name": "author"})['content']
if author_:
claim.set_author(author_)

## auth link
author_url = parsed_claim_review_page.find('a', {"class": "url fn n"})['href']
if author_url:
claim.author_url = author_url

# body
content = [tag for tag in article.contents if not isinstance(tag, NavigableString)]
body = content[-1] # type: Tag
@@ -99,12 +109,15 @@ def extract_claim_and_review(self, parsed_claim_review_page: BeautifulSoup, url:
else:
claim.set_rating(rating_text)
claim.set_claim(claim_text)


# tags
tags = []
if parsed_claim_review_page.select('footer > span.tags-links > a'):
for link in parsed_claim_review_page.select('footer > span.tags-links > a'):
if hasattr(link, 'href'):
#tag_link = link['href']
tags.append(link.text)

for tag in parsed_claim_review_page.findAll("meta", {"property": "article:tags"}):
tag_str = tag['content']
tags.append(tag_str)
claim.set_tags(", ".join(tags))

return [claim]
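One fragility in the new metadata lookups, noted as an observation about the pattern rather than a change in this commit: find() returns None when a tag is absent, so indexing ['content'] raises TypeError before the if guard ever runs. A defensive variant:

from bs4 import BeautifulSoup

html = "<html><head><meta name='author' content='Team ToF'></head></html>"
page = BeautifulSoup(html, "lxml")

# Look the tag up first; index it only if it exists.
author_tag = page.find("meta", {"name": "author"})
author = author_tag["content"] if author_tag else None
if author:
    print(author)  # Team ToF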
69 changes: 30 additions & 39 deletions output_got.csv

Large diffs are not rendered by default.

54 changes: 0 additions & 54 deletions output_sample_fullfact.csv

This file was deleted.
