
Commit

Merge pull request #42 from Slashdacoda/master
merge from master to update
Slashdacoda authored Jul 28, 2021
2 parents ab950c8 + 1d64d34 commit afeb9c5
Showing 19 changed files with 374 additions and 223 deletions.
8 changes: 7 additions & 1 deletion .gitignore
@@ -18,11 +18,17 @@ VishvasnewsFactCheckingSiteExtractor_extraction_failed.log
 SnopesFactCheckingSiteExtractor_extraction_failed.log
 EuvsdisinfoFactCheckingSiteExtractor_extraction_failed.log
 PolitifactFactCheckingSiteExtractor_extraction_failed.log
 TruthorfictionFactCheckingSiteExtractor_extraction_failed.log
+CheckyourfactFactCheckingSiteExtractor_extraction_failed.log
+AfricacheckFactCheckingSiteExtractor_extraction_failed.log
+AfpfactcheckFactCheckingSiteExtractor_extraction_failed.log
 output_dev_fatabyyano.csv
 output_dev_vishvasnews.csv
 output_dev_aap.csv
 output_dev_fullfact.csv
 output_dev_snopes.csv
 output_dev_politifact.csv
-TruthorfictionFactCheckingSiteExtractor_extraction_failed.log
+output_dev_truthorfiction.csv
+output_dev_checkyourfact.csv
+output_dev_africacheck.csv
+output_dev_afpfactcheck.csv
4 changes: 2 additions & 2 deletions Exporter.py
@@ -38,8 +38,8 @@ def main(argv):
         if opt == '--maxclaims':
             criteria.maxClaims = int(arg)
         if criteria.website != "":
-            criteria.setOutputDev("output_dev_" + criteria.website + ".csv")
-            criteria.setOutputSample("output_sample_" + criteria.website + ".csv")
+            criteria.setOutputDev("samples/output_dev_" + criteria.website + ".csv")
+            criteria.setOutputSample("samples/output_sample_" + criteria.website + ".csv")
         if opt == '--annotation-api':
             criteria.annotator_uri = arg
2 changes: 1 addition & 1 deletion claim_extractor/extractors/__init__.py
@@ -61,7 +61,7 @@ def __init__(self, configuration: Configuration = Configuration(), ignore_urls:
         self.configuration = configuration
         self.ignore_urls = configuration.avoid_urls
         self.language = language
-        self.failed_log = open(self.__class__.__name__ + "_extraction_failed.log", "w")
+        self.failed_log = open("failed/" + self.__class__.__name__ + "_extraction_failed.log", "w")
         self.annotator = EntityFishingAnnotator(configuration.annotator_uri)

     def get_all_claims(self):
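
Note: this change and the Exporter.py change above both write into subdirectories (failed/ and samples/). Python's open(..., "w") raises FileNotFoundError when a parent directory is missing rather than creating it, so both directories must exist before a run. A minimal defensive sketch; the helper name and call site are hypothetical, not part of this commit:

    import os

    def ensure_output_dirs(*dirs: str) -> None:
        # open(path, "w") fails on a missing parent directory;
        # create each output directory up front instead.
        for d in dirs:
            os.makedirs(d, exist_ok=True)

    ensure_output_dirs("failed", "samples")
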
11 changes: 8 additions & 3 deletions claim_extractor/extractors/afpfactcheck.py
@@ -49,6 +49,8 @@ def retrieve_urls(self, parsed_listing_page: BeautifulSoup, listing_page_url: st
             str]:
         urls = self.extract_urls(parsed_listing_page)
         for page_number in trange(1, number_of_pages):
+            if ((page_number*15) + 14 >= self.configuration.maxClaims):
+                break
             url = listing_page_url + "?page=" + str(int(page_number))
             page = caching.get(url, headers=self.headers, timeout=20)
             current_parsed_listing_page = BeautifulSoup(page, "lxml")
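
Note: the added guard stops pagination once maxClaims is covered. Assuming 15 articles per AFP listing page (implied by the constant, not stated in the commit), page page_number contributes item indices page_number*15 through page_number*15 + 14. A sketch of the cutoff arithmetic:

    # With 15 items per page, the highest index on a page is
    # page_number * 15 + 14; stop before paging past maxClaims.
    max_claims = 100
    for page_number in range(1, 10):
        if page_number * 15 + 14 >= max_claims:
            break
    print(page_number)  # 6 -- pages 1-5 cover indices 15..89, under 100
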
@@ -102,6 +104,8 @@ def extract_claim_and_review(self, parsed_claim_review_page: BeautifulSoup, url:
                 claim.set_date(data['@graph'][0]['itemReviewed']['datePublished'])
             except Exception:
                 pass
+        except KeyError:
+            pass

         try:
             date = data['@graph'][0]['datePublished']
@@ -118,9 +122,10 @@ def extract_claim_and_review(self, parsed_claim_review_page: BeautifulSoup, url:
             try:
                 if child.name == 'aside':
                     continue
-                elems = child.findAll('a')
-                for elem in elems:
-                    links.append(elem['href'])
+                if (child != "\n" and not " " ):
+                    elems = child.findAll('a')
+                    for elem in elems:
+                        links.append(elem['href'])
             except Exception as e:
                 continue
         claim.set_refered_links(links)
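
Note: in the added condition, not " " negates a non-empty (hence truthy) string, so it is always False and the guard can never pass, which leaves links empty. If the intent was to skip the bare "\n" text nodes BeautifulSoup yields between tags, a hedged sketch of that check (the intent is inferred, not stated in the commit):

    from bs4 import BeautifulSoup, Tag

    html = ('<div><p><a href="https://example.org/a">a</a></p>\n'
            '<aside><a href="https://example.org/skip">skip</a></aside></div>')
    body = BeautifulSoup(html, "lxml").div
    links = []
    for child in body.children:
        # Tag instances are real elements; the "\n" between </p> and
        # <aside> is a NavigableString and fails the isinstance check.
        if not isinstance(child, Tag) or child.name == 'aside':
            continue
        for elem in child.findAll('a'):
            links.append(elem['href'])
    print(links)  # ['https://example.org/a']
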
269 changes: 139 additions & 130 deletions claim_extractor/extractors/africacheck.py

Large diffs are not rendered by default.

5 changes: 4 additions & 1 deletion claim_extractor/extractors/checkyourfact.py
@@ -19,11 +19,14 @@ def retrieve_listing_page_urls(self) -> List[str]:
         return ["https://checkyourfact.com/page/1/"]

     def find_page_count(self, parsed_listing_page: BeautifulSoup) -> int:
-        count = 26
+        count = 1
         url = "https://checkyourfact.com/page/" + str(count + 1)
         result = caching.get(url, headers=self.headers, timeout=10)
         if result:
             while result:
+                # each page 20 articles:
+                if (((count+1)*20)-20 >= self.configuration.maxClaims):
+                    break
                 count += 1
                 url = "https://checkyourfact.com/page/" + str(count)
                 result = caching.get(url, headers=self.headers, timeout=10)
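
Note: the new stop condition simplifies, since ((count + 1) * 20) - 20 equals count * 20, i.e. the number of articles covered by the first count pages under the 20-per-page assumption in the comment. A quick sanity check as a sketch:

    # The guard is equivalent to stopping once count pages cover maxClaims.
    for count in range(1, 100):
        assert ((count + 1) * 20) - 20 == count * 20
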
2 changes: 1 addition & 1 deletion claim_extractor/extractors/fullfact.py
@@ -219,7 +219,7 @@ def extract_claim_and_review(self, parsed_claim_review_page: BeautifulSoup, url:

         # Create multiple claims from the main one and add change then the claim text and verdict (rating):
         c = 0
-        while c < len(claim_text_list)-1:
+        while c <= len(claim_text_list)-1:
             claims.append(claim)
             claims[c].claim = claim_text_list[c]
             claims[c].rating = claim_verdict_list[c]
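
Note: this is an off-by-one fix. With c < len(claim_text_list)-1 the loop stops before the last index and silently drops the final claim; c <= len(claim_text_list)-1 (equivalently c < len(claim_text_list)) visits every index. A minimal illustration with hypothetical values:

    claim_text_list = ["claim A", "claim B", "claim C"]

    old_indices = [c for c in range(len(claim_text_list)) if c < len(claim_text_list) - 1]
    new_indices = [c for c in range(len(claim_text_list)) if c <= len(claim_text_list) - 1]
    print(old_indices)  # [0, 1]    -- "claim C" was dropped
    print(new_indices)  # [0, 1, 2] -- every claim is handled
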