
Merge pull request #38 from Slashdacoda/afpfactcheck_2021_NormalizeCredibility

merge to master to update
Slashdacoda authored Jul 28, 2021
2 parents acf1301 + b057d37 commit a05f794
Showing 4 changed files with 120 additions and 31 deletions.
2 changes: 2 additions & 0 deletions .gitignore
@@ -21,6 +21,7 @@ PolitifactFactCheckingSiteExtractor_extraction_failed.log
 TruthorfictionFactCheckingSiteExtractor_extraction_failed.log
 CheckyourfactFactCheckingSiteExtractor_extraction_failed.log
 AfricacheckFactCheckingSiteExtractor_extraction_failed.log
+AfpfactcheckFactCheckingSiteExtractor_extraction_failed.log
 output_dev_fatabyyano.csv
 output_dev_vishvasnews.csv
 output_dev_aap.csv
@@ -30,3 +31,4 @@ output_dev_politifact.csv
 output_dev_truthorfiction.csv
 output_dev_checkyourfact.csv
 output_dev_africacheck.csv
+output_dev_afpfactcheck.csv
11 changes: 8 additions & 3 deletions claim_extractor/extractors/afpfactcheck.py
@@ -49,6 +49,8 @@ def retrieve_urls(self, parsed_listing_page: BeautifulSoup, listing_page_url: str
             str]:
         urls = self.extract_urls(parsed_listing_page)
         for page_number in trange(1, number_of_pages):
+            if ((page_number*15) + 14 >= self.configuration.maxClaims):
+                break
             url = listing_page_url + "?page=" + str(int(page_number))
             page = caching.get(url, headers=self.headers, timeout=20)
             current_parsed_listing_page = BeautifulSoup(page, "lxml")
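
The new early-stop check assumes 15 claim teasers per AFP Fact Check listing page, so paging halts as soon as a page would extend to or past the configured claim cap (self.configuration.maxClaims). A minimal standalone sketch of that arithmetic; the pages_to_fetch helper, the CLAIMS_PER_PAGE constant and the max_claims parameter are illustrative names, not part of the extractor:

from typing import List

CLAIMS_PER_PAGE = 15  # assumed page size implied by the check above

def pages_to_fetch(number_of_pages: int, max_claims: int) -> List[int]:
    """Listing pages that can still contribute claims under the cap."""
    pages = []
    for page_number in range(1, number_of_pages):
        # Page N covers claim offsets N*15 .. N*15 + 14; once a page would reach
        # or cross the cap, it and every later page are skipped.
        if (page_number * CLAIMS_PER_PAGE) + 14 >= max_claims:
            break
        pages.append(page_number)
    return pages

print(pages_to_fetch(number_of_pages=10, max_claims=50))  # [1, 2]; page 3 spans offsets 45-59
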
@@ -102,6 +104,8 @@ def extract_claim_and_review(self, parsed_claim_review_page: BeautifulSoup, url:
             claim.set_date(data['@graph'][0]['itemReviewed']['datePublished'])
         except Exception:
             pass
+        except KeyError:
+            pass

         try:
             date = data['@graph'][0]['datePublished']
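
The added KeyError handler backstops the ld+json lookups: the claim date is read from the page's JSON-LD graph, where the nested itemReviewed entry may be absent. A self-contained sketch of that defensive lookup; the sample payload and the claim_date helper are invented for illustration:

import json
from typing import Optional

sample = json.loads(
    '{"@graph": [{"datePublished": "2021-07-28",'
    ' "itemReviewed": {"datePublished": "2021-07-20"}}]}'
)

def claim_date(data: dict) -> Optional[str]:
    try:
        # Prefer the reviewed item's own publication date when the keys exist.
        return data['@graph'][0]['itemReviewed']['datePublished']
    except (KeyError, IndexError):
        pass
    try:
        # Fall back to the fact-check article's own date.
        return data['@graph'][0]['datePublished']
    except (KeyError, IndexError):
        return None

print(claim_date(sample))  # 2021-07-20
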
@@ -118,9 +122,10 @@ def extract_claim_and_review(self, parsed_claim_review_page: BeautifulSoup, url:
             try:
                 if child.name == 'aside':
                     continue
-                elems = child.findAll('a')
-                for elem in elems:
-                    links.append(elem['href'])
+                if (child != "\n" and not " " ):
+                    elems = child.findAll('a')
+                    for elem in elems:
+                        links.append(elem['href'])
             except Exception as e:
                 continue
         claim.set_refered_links(links)
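
The reworked link collection aims to skip the bare text nodes (stray newlines) that BeautifulSoup yields between an article's child tags before harvesting hrefs; note that the expression not " " is always False in Python, so this kind of whitespace test is often written against the node type instead. A self-contained illustration of that pattern; the sample HTML is invented, and the NavigableString check is one conventional way to express the guard rather than the extractor's exact test:

from bs4 import BeautifulSoup, NavigableString

html = ("<article><p><a href='https://example.org/a'>a</a></p>\n"
        "<aside><a href='https://example.org/skip'>skip</a></aside></article>")
body = BeautifulSoup(html, "lxml").find("article")

links = []
for child in body.children:
    if isinstance(child, NavigableString):  # a bare "\n" between tags has no findAll()
        continue
    if child.name == 'aside':               # same aside skip as the extractor
        continue
    for elem in child.findAll('a'):
        links.append(elem['href'])

print(links)  # ['https://example.org/a']
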
