feat: Add X profiles scraping functionality and unstar. + Project structure.
Errahum · Aug 22, 2024 · f6817aa · f6817aa
1 parent 9425185
commit f6817aa
Show file tree

Hide file tree

Showing 13 changed files with 280 additions and 31 deletions.
diff --git a/.gitignore b/.gitignore
@@ -169,3 +169,8 @@ __pycache__
 job_ids.txt
 
 followings.jsonl
+X_profiles.jsonl
+linkedin_profiles.jsonl
+github_usernames.txt
+config_unfollow.json
+config_unfollow.json
diff --git a/main_console_follow_unfollow.py b/main_console_follow_unfollow.py
@@ -8,12 +8,14 @@ def main_console_follow_unfollow():
 
     main_app = MainFollowUnfollow(config)
     while True:
-        logger.info("\nChoose an action:")
-        logger.info("1. Follow people")
-        logger.info("2. Unfollow people")
-        logger.info("3. Follow back people")
-        logger.info("4. Linkedin profiles")
-        logger.info("5. Exit")
+        print("\nChoose an action:")
+        print("1. Follow people")
+        print("2. Unfollow people")
+        print("3. Follow back people")
+        print("4. Linkedin profiles")
+        print("5. X profiles")
+        print("6. Unstar non followers")
+        print("7. Exit")
 
         choice = input("Enter your choice: ")
 
@@ -26,6 +28,10 @@ def main_console_follow_unfollow():
         elif choice == '4':
             main_app.linkedin_profiles()
         elif choice == '5':
+            main_app.x_profiles()
+        elif choice == '6':
+            main_app.unstar_non_followers_repos()
+        elif choice == '7':
             logger.info("Exiting the program.")
             break
         else:

diff --git a/src/core/follow/__init__.py b/src/core/follow/__init__.py
diff --git a/src/core/follow.py → src/core/follow/follow.py b/src/core/follow.py → src/core/follow/follow.py
diff --git a/src/core/follow_back.py → src/core/follow/follow_back.py b/src/core/follow_back.py → src/core/follow/follow_back.py
diff --git a/src/core/get_following.py → src/core/follow/get_following.py b/src/core/get_following.py → src/core/follow/get_following.py
diff --git a/src/core/scrapper/__init__.py b/src/core/scrapper/__init__.py
diff --git a/src/core/linkedin.py → src/core/scrapper/linkedin.py b/src/core/linkedin.py → src/core/scrapper/linkedin.py
@@ -42,26 +42,6 @@ def _get_paginated_data(self, url):
 
         return data
 
-    def _extract_linkedin_profiles(self, followers):
-        linkedin_profiles = []
-        if followers is None:
-            logger.error("Followers list is None, cannot scrape LinkedIn profiles.")
-            return linkedin_profiles
-
-        for follower in followers:
-            if follower is None:
-                logger.warning("Encountered None follower, skipping.")
-                continue
-
-            social_accounts_url = f"https://api.github.com/users/{follower['login']}/social_accounts"
-            response = self._make_request('GET', social_accounts_url)
-            if response.status_code == 200:
-                social_accounts = response.json()
-                linkedin_url = next((account['url'] for account in social_accounts if 'linkedin.com' in account['url']), None)
-                if linkedin_url:
-                    linkedin_profiles.append({'github_username': follower['login'], 'linkedin_url': linkedin_url})
-        return linkedin_profiles
-
     def _update_jsonl_file(self, linkedin_profiles):
         valid_profiles = [profile for profile in linkedin_profiles if self._is_valid_linkedin(profile['linkedin_url'])]
         with open(self.jsonl_file, 'w') as file:

diff --git a/src/core/scrapper/x.py b/src/core/scrapper/x.py
@@ -0,0 +1,167 @@
+import json
+import time
+import requests
+from src.utils.logger import logger
+
+class XScraper:
+    """Collect X (x.com) profile links for the followers of a GitHub user.
+
+    Profiles are stored one JSON object per line in ``X_profiles.jsonl``; each
+    object has the keys ``github_username`` and ``X_url``.
+    """
+
+    # Number of attempts made for every HTTP request.
+    MAX_ATTEMPTS = 3
+    # Seconds to wait before retrying a failed or throttled request.
+    RETRY_DELAY = 10
+    # Seconds after which a single HTTP request is abandoned.
+    REQUEST_TIMEOUT = 30
+    # Hostnames accepted as X profile links.
+    # NOTE(review): legacy twitter.com links are not matched - confirm whether
+    # they should be accepted as well.
+    X_HOSTS = ("x.com", "www.x.com")
+
+    def __init__(self, config, max_accounts):
+        """Store the API credentials and the follower limit.
+
+        Args:
+            config: Object exposing ``get_api_key()``; the returned token is
+                sent in the ``Authorization`` header of every request.
+            max_accounts: Upper bound on the number of followers fetched and
+                on the number of profiles added by ``scrape_X_profiles``.
+        """
+        self.github_token = config.get_api_key()
+        self.headers = {
+            'Authorization': f'token {self.github_token}',
+            'Accept': 'application/vnd.github.v3+json'
+        }
+        self.jsonl_file = "X_profiles.jsonl"
+        self.max_accounts = max_accounts
+
+    def get_github_followers(self, username, max_accounts):
+        """Fetch followers of ``username`` and record their linked X accounts.
+
+        The JSONL file is rewritten with the X links found in the followers'
+        GitHub social accounts.
+
+        Args:
+            username: GitHub login whose followers are fetched.
+            max_accounts: New value for ``self.max_accounts``.
+
+        Returns:
+            The list of follower objects, so that callers can pass it on to
+            ``scrape_X_profiles``.
+
+        Raises:
+            Exception: If a page of followers cannot be fetched.
+        """
+        url = f'https://api.github.com/users/{username}/followers'
+        self.max_accounts = max_accounts
+        followers = self._get_paginated_data(url)
+        X_profiles = self._extract_X_profiles(followers)
+        self._update_jsonl_file(X_profiles)
+        return followers
+
+    def _get_paginated_data(self, url):
+        """Return at most ``self.max_accounts`` items from a paginated endpoint.
+
+        Raises:
+            Exception: If a page yields no response after all retries or a
+                status code other than 200.
+        """
+        page = 1
+        data = []
+        while True:
+            paginated_url = f'{url}?page={page}'
+            response = self._make_request('GET', paginated_url)
+            # _make_request returns None once every attempt has failed.
+            if response is None:
+                raise Exception(f"Error fetching data from {paginated_url}. No response after retries.")
+            if response.status_code != 200:
+                raise Exception(f"Error fetching data from {paginated_url}. Status code: {response.status_code}")
+
+            page_data = response.json()
+            if not page_data:
+                break
+
+            data.extend(page_data)
+            if len(data) >= self.max_accounts:
+                data = data[:self.max_accounts]
+                break
+
+            page += 1
+
+        return data
+
+    def _is_X_host(self, url):
+        """Return True when the hostname of ``url`` is one of ``X_HOSTS``.
+
+        The hostname is compared exactly; a substring test would also accept
+        unrelated domains that merely end in ``x.com``.
+        """
+        from urllib.parse import urlparse
+
+        if not url:
+            return False
+        hostname = urlparse(url).hostname
+        return hostname is not None and hostname.lower() in self.X_HOSTS
+
+    def _extract_X_profiles(self, followers):
+        """Look up each follower's social accounts and keep the first X link.
+
+        Followers whose social accounts cannot be fetched are skipped.
+
+        Returns:
+            A list of ``{'github_username': ..., 'X_url': ...}`` dicts.
+        """
+        X_profiles = []
+        for follower in followers or []:
+            if not follower:
+                continue
+            login = follower['login']
+            social_accounts_url = f"https://api.github.com/users/{login}/social_accounts"
+            response = self._make_request('GET', social_accounts_url)
+            if response is None or response.status_code != 200:
+                continue
+
+            account_urls = (account.get('url') for account in response.json())
+            X_url = next((url for url in account_urls if self._is_X_host(url)), None)
+            if X_url:
+                X_profiles.append({'github_username': login, 'X_url': X_url})
+        return X_profiles
+
+    def _update_jsonl_file(self, X_profiles):
+        """Overwrite the JSONL file with the profiles that pass validation."""
+        valid_profiles = [profile for profile in X_profiles if self._is_valid_X(profile['X_url'])]
+        with open(self.jsonl_file, 'w', encoding='utf-8') as file:
+            for profile in valid_profiles:
+                file.write(json.dumps(profile) + '\n')
+
+    def _is_valid_X(self, X_url):
+        """Validation hook for X URLs; currently accepts every URL."""
+        # Placeholder for actual X URL validation logic
+        return True
+
+    def _make_request(self, method, url):
+        """Send an authenticated request, retrying on errors, 500 and 429.
+
+        Returns:
+            The response object, or ``None`` when every attempt failed or was
+            answered with status 500 or 429.
+        """
+        for _ in range(self.MAX_ATTEMPTS):
+            try:
+                response = requests.request(
+                    method, url, headers=self.headers, timeout=self.REQUEST_TIMEOUT
+                )
+            except requests.RequestException as e:
+                logger.error(f"Request to {url} failed: {e}")
+                time.sleep(self.RETRY_DELAY)
+                continue
+            if response.status_code in (500, 429):
+                time.sleep(self.RETRY_DELAY)
+                continue
+            return response
+        return None
+
+    def scrape_X_profiles(self, followers):
+        """Append a candidate X profile for every follower not yet stored.
+
+        The candidate URL is built from the follower's GitHub login
+        (``https://x.com/<login>``); it is not checked against X.
+
+        Args:
+            followers: Follower objects with a ``login`` key, or ``None``.
+        """
+        if followers is None:
+            logger.error("Followers list is None, cannot scrape X profiles.")
+            return
+
+        X_profiles = []
+        existing_profiles = self.load_existing_profiles()
+
+        for follower in followers:
+            username = follower['login']
+            X_url = f"https://x.com/{username}"
+
+            if X_url in existing_profiles:
+                logger.info(f"X profile for {username} already exists, skipping.")
+                continue
+
+            X_profiles.append({
+                "github_username": username,
+                "X_url": X_url
+            })
+
+            if len(X_profiles) >= self.max_accounts:
+                break
+
+        self.save_profiles_to_jsonl(X_profiles)
+
+    def load_existing_profiles(self):
+        """Return the set of ``X_url`` values already present in the JSONL file.
+
+        A missing file yields an empty set. Blank lines are ignored and
+        malformed lines are skipped with a warning instead of aborting.
+        """
+        existing_profiles = set()
+        try:
+            with open(self.jsonl_file, 'r', encoding='utf-8') as file:
+                for line in file:
+                    line = line.strip()
+                    if not line:
+                        continue
+                    try:
+                        existing_profiles.add(json.loads(line)['X_url'])
+                    except (json.JSONDecodeError, KeyError, TypeError):
+                        logger.warning(f"Skipping malformed line in {self.jsonl_file}.")
+        except FileNotFoundError:
+            pass
+        return existing_profiles
+
+    def save_profiles_to_jsonl(self, profiles):
+        """Append ``profiles`` to the JSONL file, one JSON object per line."""
+        with open(self.jsonl_file, 'a', encoding='utf-8') as file:
+            for profile in profiles:
+                json.dump(profile, file)
+                file.write('\n')
+        logger.info(f"Saved {len(profiles)} new X profiles to {self.jsonl_file}.")
diff --git a/src/core/undo/__init__.py b/src/core/undo/__init__.py
diff --git a/src/core/unfollow.py → src/core/undo/unfollow.py b/src/core/unfollow.py → src/core/undo/unfollow.py
diff --git a/src/core/undo/unstar.py b/src/core/undo/unstar.py
@@ -0,0 +1,74 @@
+import json
+import time
+from src.utils.logger import logger
+import requests
+
+class GitHubClientUnstar:
+    """Unstar repositories whose owners do not follow the configured user."""
+
+    # Number of attempts made for every HTTP request.
+    MAX_RETRIES = 3
+    # Seconds after which a single HTTP request is abandoned.
+    REQUEST_TIMEOUT = 30
+    # Page size requested from paginated endpoints.
+    PER_PAGE = 100
+
+    def __init__(self, config, username, protected_owners=("Errahum",)):
+        """Store credentials, the target user and the owners to never unstar.
+
+        Args:
+            config: Object exposing ``get_api_key()``; the returned token is
+                sent in the ``Authorization`` header of every request.
+            username: GitHub login whose stars and followers are inspected.
+            protected_owners: Logins whose repositories are never unstarred.
+        """
+        self.token = config.get_api_key()
+        self.headers = {
+            'Authorization': f'token {self.token}',
+            'Accept': 'application/vnd.github.v3+json'
+        }
+        self.username = username
+        self.protected_owners = frozenset(protected_owners)
+
+    def get_starred_repos(self):
+        """Return every repository starred by ``self.username``."""
+        return self._get_paginated_data(f'https://api.github.com/users/{self.username}/starred')
+
+    def get_followers(self):
+        """Return every follower of ``self.username``."""
+        return self._get_paginated_data(f'https://api.github.com/users/{self.username}/followers')
+
+    def _get_paginated_data(self, url):
+        """Collect the items of all pages of ``url``.
+
+        Raises:
+            Exception: If a page is answered with a status other than 200 or
+                the request fails after all retries.
+        """
+        page = 1
+        data = []
+        while True:
+            paginated_url = f'{url}?per_page={self.PER_PAGE}&page={page}'
+            response = self._make_request('GET', paginated_url)
+
+            if response.status_code != 200:
+                raise Exception(f"Error fetching data from {paginated_url}. Status code: {response.status_code}")
+
+            page_data = response.json()
+            if not page_data:
+                break  # No more data to fetch
+
+            data.extend(page_data)
+            page += 1
+
+        return data
+
+    def _make_request(self, method, url):
+        """Send a GET or DELETE request, retrying with exponential backoff.
+
+        Raises:
+            ValueError: If ``method`` is neither ``'GET'`` nor ``'DELETE'``.
+            Exception: If every attempt raised a ``requests`` error; the last
+                error is attached as the cause.
+        """
+        senders = {'GET': requests.get, 'DELETE': requests.delete}
+        if method not in senders:
+            raise ValueError("Unsupported HTTP method")
+        send = senders[method]
+
+        last_error = None
+        for attempt in range(self.MAX_RETRIES):
+            try:
+                return send(url, headers=self.headers, timeout=self.REQUEST_TIMEOUT)
+            except requests.RequestException as e:
+                last_error = e
+                logger.error(f"Request failed: {e}")
+                time.sleep(2 ** attempt)
+        raise Exception(f"Failed to {method} {url} after {self.MAX_RETRIES} attempts") from last_error
+
+    def unstar_repo(self, owner, repo):
+        """Remove the authenticated user's star from ``owner/repo``.
+
+        Status 204 is logged as success; any other status is logged as an
+        error and does not raise.
+        """
+        url = f'https://api.github.com/user/starred/{owner}/{repo}'
+        response = self._make_request('DELETE', url)
+        if response.status_code == 204:
+            logger.info(f"Successfully unstarred {owner}/{repo}")
+        else:
+            logger.error(f"Failed to unstar {owner}/{repo}. Status code: {response.status_code}")
+
+    def unstar_non_followers_repos(self):
+        """Unstar every starred repository whose owner is not a follower.
+
+        Repositories owned by a login in ``self.protected_owners`` are
+        always kept.
+        """
+        followers = {follower['login'] for follower in self.get_followers()}
+        starred_repos = self.get_starred_repos()
+
+        for repo in starred_repos:
+            owner = repo['owner']['login']
+            if owner in self.protected_owners:
+                logger.info(f"Skipping {owner}'s repo as it should never be unstarred")
+                continue
+            if owner not in followers:
+                self.unstar_repo(owner, repo['name'])
diff --git a/src/manager_follow_unfollow.py b/src/manager_follow_unfollow.py
@@ -1,11 +1,14 @@
 import json
 
-from src.core.follow import GitHubClientFollow, FollowerManager, extract_username_from_url
-from src.core.follow_back import GitHubClientFollowBack, FollowBackFollowers
-from src.core.get_following import GitHubClientGetFollowings
-from src.core.unfollow import UnfollowBot, GitHubClientUnfollow
+from src.core.follow.follow import GitHubClientFollow, FollowerManager, extract_username_from_url
+from src.core.follow.follow_back import GitHubClientFollowBack, FollowBackFollowers
+from src.core.follow.get_following import GitHubClientGetFollowings
+from src.core.undo.unfollow import UnfollowBot, GitHubClientUnfollow
 from src.utils.logger import logger
-from src.core.linkedin import GitHubLinkedInScraper
+from src.core.scrapper.linkedin import GitHubLinkedInScraper
+from src.core.scrapper.x import XScraper
+from src.core.undo.unstar import GitHubClientUnstar
+
 
 class MainFollowUnfollow:
     def __init__(self, config):
@@ -19,6 +22,8 @@ def __init__(self, config):
         self.github_client_follow_back = GitHubClientFollowBack(config)
         self.github_client_unfollow = UnfollowBot(self.github_client_unfollow, self.username)
         self.linkedin_scraper = GitHubLinkedInScraper(config, max_accounts=0)
+        self.x_scraper = XScraper(config, max_accounts=0)
+        self.GitHubClientUnstar = GitHubClientUnstar(config, self.username)
 
     def follow_people(self):
         profile_url = input("Enter the GitHub profile URL: ")
@@ -89,3 +94,15 @@ def linkedin_profiles(self):
         else:
             followers = self.linkedin_scraper.get_github_followers(self.username, max_accounts)
             self.linkedin_scraper.scrape_linkedin_profiles(followers)
+
+    def x_profiles(self):
+        """Ask for a follower limit and run the X profile scraper.
+
+        A non-numeric or non-positive answer is reported through the logger
+        and the method returns without contacting GitHub. Otherwise the value
+        returned by ``get_github_followers`` is handed to
+        ``scrape_X_profiles``.
+        """
+        answer = input("Enter the maximum number of X accounts: ")
+        try:
+            max_accounts = int(answer)
+        except ValueError:
+            # A typo must not crash the interactive menu loop.
+            logger.error("Invalid number of accounts.")
+            return
+
+        if max_accounts <= 0:
+            logger.error("Invalid number of accounts.")
+            return
+
+        followers = self.x_scraper.get_github_followers(self.username, max_accounts)
+        self.x_scraper.scrape_X_profiles(followers)
+
+    def unstar_non_followers_repos(self):
+        """Run the unstar client, then log that the process has finished."""
+        unstar_client = self.GitHubClientUnstar
+        unstar_client.unstar_non_followers_repos()
+        logger.info("Unstar process complete.")