
Commit

Merge pull request #42 from Slashdacoda/master
merge from master to update
Slashdacoda authored Jul 28, 2021
2 parents ab950c8 + 1d64d34 commit afeb9c5
Showing 19 changed files with 374 additions and 223 deletions.
8 changes: 7 additions & 1 deletion .gitignore
@@ -18,11 +18,17 @@ VishvasnewsFactCheckingSiteExtractor_extraction_failed.log
 SnopesFactCheckingSiteExtractor_extraction_failed.log
 EuvsdisinfoFactCheckingSiteExtractor_extraction_failed.log
 PolitifactFactCheckingSiteExtractor_extraction_failed.log
 TruthorfictionFactCheckingSiteExtractor_extraction_failed.log
+CheckyourfactFactCheckingSiteExtractor_extraction_failed.log
+AfricacheckFactCheckingSiteExtractor_extraction_failed.log
+AfpfactcheckFactCheckingSiteExtractor_extraction_failed.log
 output_dev_fatabyyano.csv
 output_dev_vishvasnews.csv
 output_dev_aap.csv
 output_dev_fullfact.csv
 output_dev_snopes.csv
 output_dev_politifact.csv
-TruthorfictionFactCheckingSiteExtractor_extraction_failed.log
+output_dev_truthorfiction.csv
+output_dev_checkyourfact.csv
+output_dev_africacheck.csv
+output_dev_afpfactcheck.csv
4 changes: 2 additions & 2 deletions Exporter.py
@@ -38,8 +38,8 @@ def main(argv):
         if opt == '--maxclaims':
             criteria.maxClaims = int(arg)
         if criteria.website != "":
-            criteria.setOutputDev("output_dev_" + criteria.website + ".csv")
-            criteria.setOutputSample("output_sample_" + criteria.website + ".csv")
+            criteria.setOutputDev("samples/output_dev_" + criteria.website + ".csv")
+            criteria.setOutputSample("samples/output_sample_" + criteria.website + ".csv")
         if opt == '--annotation-api':
             criteria.annotator_uri = arg
2 changes: 1 addition & 1 deletion claim_extractor/extractors/__init__.py
@@ -61,7 +61,7 @@ def __init__(self, configuration: Configuration = Configuration(), ignore_urls:
         self.configuration = configuration
         self.ignore_urls = configuration.avoid_urls
         self.language = language
-        self.failed_log = open(self.__class__.__name__ + "_extraction_failed.log", "w")
+        self.failed_log = open("failed/" + self.__class__.__name__ + "_extraction_failed.log", "w")
         self.annotator = EntityFishingAnnotator(configuration.annotator_uri)

     def get_all_claims(self):
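
Note: this change and the Exporter.py change above both write into subdirectories (failed/ and samples/). Python's open(..., "w") raises FileNotFoundError when a parent directory is missing rather than creating it, so both directories must exist before a run. A minimal defensive sketch; the helper name and call site are hypothetical, not part of this commit:

    import os

    def ensure_output_dirs(*dirs: str) -> None:
        # open(path, "w") fails on a missing parent directory;
        # create each output directory up front instead.
        for d in dirs:
            os.makedirs(d, exist_ok=True)

    ensure_output_dirs("failed", "samples")
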
11 changes: 8 additions & 3 deletions claim_extractor/extractors/afpfactcheck.py
@@ -49,6 +49,8 @@ def retrieve_urls(self, parsed_listing_page: BeautifulSoup, listing_page_url: st
             str]:
         urls = self.extract_urls(parsed_listing_page)
         for page_number in trange(1, number_of_pages):
+            if ((page_number*15) + 14 >= self.configuration.maxClaims):
+                break
             url = listing_page_url + "?page=" + str(int(page_number))
             page = caching.get(url, headers=self.headers, timeout=20)
             current_parsed_listing_page = BeautifulSoup(page, "lxml")
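
Note: the added guard stops pagination once maxClaims is covered. Assuming 15 articles per AFP listing page (implied by the constant, not stated in the commit), page page_number contributes item indices page_number*15 through page_number*15 + 14. A sketch of the cutoff arithmetic:

    # With 15 items per page, the highest index on a page is
    # page_number * 15 + 14; stop before paging past maxClaims.
    max_claims = 100
    for page_number in range(1, 10):
        if page_number * 15 + 14 >= max_claims:
            break
    print(page_number)  # 6 -- pages 1-5 cover indices 15..89, under 100
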
@@ -102,6 +104,8 @@ def extract_claim_and_review(self, parsed_claim_review_page: BeautifulSoup, url:
                 claim.set_date(data['@graph'][0]['itemReviewed']['datePublished'])
             except Exception:
                 pass
+        except KeyError:
+            pass

         try:
             date = data['@graph'][0]['datePublished']
@@ -118,9 +122,10 @@ def extract_claim_and_review(self, parsed_claim_review_page: BeautifulSoup, url:
             try:
                 if child.name == 'aside':
                     continue
-                elems = child.findAll('a')
-                for elem in elems:
-                    links.append(elem['href'])
+                if (child != "\n" and not " " ):
+                    elems = child.findAll('a')
+                    for elem in elems:
+                        links.append(elem['href'])
             except Exception as e:
                 continue
         claim.set_refered_links(links)
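
Note: in the added condition, not " " negates a non-empty (hence truthy) string, so it is always False and the guard can never pass, which leaves links empty. If the intent was to skip the bare "\n" text nodes BeautifulSoup yields between tags, a hedged sketch of that check (the intent is inferred, not stated in the commit):

    from bs4 import BeautifulSoup, Tag

    html = ('<div><p><a href="https://example.org/a">a</a></p>\n'
            '<aside><a href="https://example.org/skip">skip</a></aside></div>')
    body = BeautifulSoup(html, "lxml").div
    links = []
    for child in body.children:
        # Tag instances are real elements; the "\n" between </p> and
        # <aside> is a NavigableString and fails the isinstance check.
        if not isinstance(child, Tag) or child.name == 'aside':
            continue
        for elem in child.findAll('a'):
            links.append(elem['href'])
    print(links)  # ['https://example.org/a']
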
269 changes: 139 additions & 130 deletions claim_extractor/extractors/africacheck.py

Large diffs are not rendered by default.

5 changes: 4 additions & 1 deletion claim_extractor/extractors/checkyourfact.py
@@ -19,11 +19,14 @@ def retrieve_listing_page_urls(self) -> List[str]:
         return ["https://checkyourfact.com/page/1/"]

     def find_page_count(self, parsed_listing_page: BeautifulSoup) -> int:
-        count = 26
+        count = 1
         url = "https://checkyourfact.com/page/" + str(count + 1)
         result = caching.get(url, headers=self.headers, timeout=10)
         if result:
             while result:
+                # each page 20 articles:
+                if (((count+1)*20)-20 >= self.configuration.maxClaims):
+                    break
                 count += 1
                 url = "https://checkyourfact.com/page/" + str(count)
                 result = caching.get(url, headers=self.headers, timeout=10)
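
Note: the new stop condition simplifies, since ((count + 1) * 20) - 20 equals count * 20, i.e. the number of articles covered by the first count pages under the 20-per-page assumption in the comment. A quick sanity check as a sketch:

    # The guard is equivalent to stopping once count pages cover maxClaims.
    for count in range(1, 100):
        assert ((count + 1) * 20) - 20 == count * 20
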
2 changes: 1 addition & 1 deletion claim_extractor/extractors/fullfact.py
@@ -219,7 +219,7 @@ def extract_claim_and_review(self, parsed_claim_review_page: BeautifulSoup, url:

         # Create multiple claims from the main one and add change then the claim text and verdict (rating):
         c = 0
-        while c < len(claim_text_list)-1:
+        while c <= len(claim_text_list)-1:
             claims.append(claim)
             claims[c].claim = claim_text_list[c]
             claims[c].rating = claim_verdict_list[c]
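
Note: this is an off-by-one fix. With c < len(claim_text_list)-1 the loop stops before the last index and silently drops the final claim; c <= len(claim_text_list)-1 (equivalently c < len(claim_text_list)) visits every index. A minimal illustration with hypothetical values:

    claim_text_list = ["claim A", "claim B", "claim C"]

    old_indices = [c for c in range(len(claim_text_list)) if c < len(claim_text_list) - 1]
    new_indices = [c for c in range(len(claim_text_list)) if c <= len(claim_text_list) - 1]
    print(old_indices)  # [0, 1]    -- "claim C" was dropped
    print(new_indices)  # [0, 1, 2] -- every claim is handled
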