
Merge pull request #38 from Slashdacoda/afpfactcheck_2021_NormalizeCredibility

merge to master to update
Slashdacoda authored Jul 28, 2021
2 parents acf1301 + b057d37 commit a05f794
Showing 4 changed files with 120 additions and 31 deletions.
2 changes: 2 additions & 0 deletions .gitignore
@@ -21,6 +21,7 @@ PolitifactFactCheckingSiteExtractor_extraction_failed.log
 TruthorfictionFactCheckingSiteExtractor_extraction_failed.log
 CheckyourfactFactCheckingSiteExtractor_extraction_failed.log
 AfricacheckFactCheckingSiteExtractor_extraction_failed.log
+AfpfactcheckFactCheckingSiteExtractor_extraction_failed.log
 output_dev_fatabyyano.csv
 output_dev_vishvasnews.csv
 output_dev_aap.csv
@@ -30,3 +31,4 @@ output_dev_politifact.csv
 output_dev_truthorfiction.csv
 output_dev_checkyourfact.csv
 output_dev_africacheck.csv
+output_dev_afpfactcheck.csv
11 changes: 8 additions & 3 deletions claim_extractor/extractors/afpfactcheck.py
@@ -49,6 +49,8 @@ def retrieve_urls(self, parsed_listing_page: BeautifulSoup, listing_page_url: str
             str]:
         urls = self.extract_urls(parsed_listing_page)
         for page_number in trange(1, number_of_pages):
+            if ((page_number*15) + 14 >= self.configuration.maxClaims):
+                break
             url = listing_page_url + "?page=" + str(int(page_number))
             page = caching.get(url, headers=self.headers, timeout=20)
             current_parsed_listing_page = BeautifulSoup(page, "lxml")
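
The new early-stop check assumes 15 claim teasers per AFP Fact Check listing page, so paging halts as soon as a page would extend to or past the configured claim cap (self.configuration.maxClaims). A minimal standalone sketch of that arithmetic; the pages_to_fetch helper, the CLAIMS_PER_PAGE constant and the max_claims parameter are illustrative names, not part of the extractor:

from typing import List

CLAIMS_PER_PAGE = 15  # assumed page size implied by the check above

def pages_to_fetch(number_of_pages: int, max_claims: int) -> List[int]:
    """Listing pages that can still contribute claims under the cap."""
    pages = []
    for page_number in range(1, number_of_pages):
        # Page N covers claim offsets N*15 .. N*15 + 14; once a page would reach
        # or cross the cap, it and every later page are skipped.
        if (page_number * CLAIMS_PER_PAGE) + 14 >= max_claims:
            break
        pages.append(page_number)
    return pages

print(pages_to_fetch(number_of_pages=10, max_claims=50))  # [1, 2]; page 3 spans offsets 45-59
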
@@ -102,6 +104,8 @@ def extract_claim_and_review(self, parsed_claim_review_page: BeautifulSoup, url:
             claim.set_date(data['@graph'][0]['itemReviewed']['datePublished'])
         except Exception:
             pass
+        except KeyError:
+            pass

         try:
             date = data['@graph'][0]['datePublished']
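
The added KeyError handler backstops the ld+json lookups: the claim date is read from the page's JSON-LD graph, where the nested itemReviewed entry may be absent. A self-contained sketch of that defensive lookup; the sample payload and the claim_date helper are invented for illustration:

import json
from typing import Optional

sample = json.loads(
    '{"@graph": [{"datePublished": "2021-07-28",'
    ' "itemReviewed": {"datePublished": "2021-07-20"}}]}'
)

def claim_date(data: dict) -> Optional[str]:
    try:
        # Prefer the reviewed item's own publication date when the keys exist.
        return data['@graph'][0]['itemReviewed']['datePublished']
    except (KeyError, IndexError):
        pass
    try:
        # Fall back to the fact-check article's own date.
        return data['@graph'][0]['datePublished']
    except (KeyError, IndexError):
        return None

print(claim_date(sample))  # 2021-07-20
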
@@ -118,9 +122,10 @@ def extract_claim_and_review(self, parsed_claim_review_page: BeautifulSoup, url:
             try:
                 if child.name == 'aside':
                     continue
-                elems = child.findAll('a')
-                for elem in elems:
-                    links.append(elem['href'])
+                if (child != "\n" and not " " ):
+                    elems = child.findAll('a')
+                    for elem in elems:
+                        links.append(elem['href'])
             except Exception as e:
                 continue
         claim.set_refered_links(links)
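
The reworked link collection aims to skip the bare text nodes (stray newlines) that BeautifulSoup yields between an article's child tags before harvesting hrefs; note that the expression not " " is always False in Python, so this kind of whitespace test is often written against the node type instead. A self-contained illustration of that pattern; the sample HTML is invented, and the NavigableString check is one conventional way to express the guard rather than the extractor's exact test:

from bs4 import BeautifulSoup, NavigableString

html = ("<article><p><a href='https://example.org/a'>a</a></p>\n"
        "<aside><a href='https://example.org/skip'>skip</a></aside></article>")
body = BeautifulSoup(html, "lxml").find("article")

links = []
for child in body.children:
    if isinstance(child, NavigableString):  # a bare "\n" between tags has no findAll()
        continue
    if child.name == 'aside':               # same aside skip as the extractor
        continue
    for elem in child.findAll('a'):
        links.append(elem['href'])

print(links)  # ['https://example.org/a']
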
