Commit

feat: Add X profiles scraping functionality and unstar. + Project structure.
Errahum committed Aug 22, 2024
1 parent 9425185 commit f6817aa
Showing 13 changed files with 280 additions and 31 deletions.
5 changes: 5 additions & 0 deletions .gitignore
@@ -169,3 +169,8 @@ __pycache__
job_ids.txt

followings.jsonl
+X_profiles.jsonl
+linkedin_profiles.jsonl
+github_usernames.txt
+config_unfollow.json
+config_unfollow.json
18 changes: 12 additions & 6 deletions main_console_follow_unfollow.py
@@ -8,12 +8,14 @@ def main_console_follow_unfollow():

main_app = MainFollowUnfollow(config)
while True:
-        logger.info("\nChoose an action:")
-        logger.info("1. Follow people")
-        logger.info("2. Unfollow people")
-        logger.info("3. Follow back people")
-        logger.info("4. Linkedin profiles")
-        logger.info("5. Exit")
+        print("\nChoose an action:")
+        print("1. Follow people")
+        print("2. Unfollow people")
+        print("3. Follow back people")
+        print("4. Linkedin profiles")
+        print("5. X profiles")
+        print("6. Unstar non followers")
+        print("7. Exit")

choice = input("Enter your choice: ")

@@ -26,6 +28,10 @@ def main_console_follow_unfollow():
elif choice == '4':
main_app.linkedin_profiles()
elif choice == '5':
+            main_app.x_profiles()
+        elif choice == '6':
+            main_app.unstar_non_followers_repos()
+        elif choice == '7':
logger.info("Exiting the program.")
break
else:
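
The elif chain above has to be kept in sync with the printed menu by hand. A dict-based dispatch table is a common alternative; the sketch below is not part of this commit, and the handler names for choices 1-3 (unfollow_people, follow_back_people) are assumptions, since the diff collapses those branches (follow_people is visible further down).

    def run_menu(main_app):
        # Keep each menu label next to its handler so the printed menu
        # cannot drift out of sync with the dispatch logic.
        actions = {
            '1': ('Follow people', main_app.follow_people),
            '2': ('Unfollow people', main_app.unfollow_people),        # assumed name
            '3': ('Follow back people', main_app.follow_back_people),  # assumed name
            '4': ('Linkedin profiles', main_app.linkedin_profiles),
            '5': ('X profiles', main_app.x_profiles),
            '6': ('Unstar non followers', main_app.unstar_non_followers_repos),
        }
        while True:
            print("\nChoose an action:")
            for key, (label, _) in actions.items():
                print(f"{key}. {label}")
            print("7. Exit")
            choice = input("Enter your choice: ")
            if choice == '7':
                break
            if choice in actions:
                actions[choice][1]()
            else:
                print("Invalid choice. Please try again.")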
Empty file added src/core/follow/__init__.py
Empty file.
File renamed without changes.
File renamed without changes.
File renamed without changes.
Empty file added src/core/scrapper/__init__.py
Empty file.
20 changes: 0 additions & 20 deletions src/core/linkedin.py → src/core/scrapper/linkedin.py
@@ -42,26 +42,6 @@ def _get_paginated_data(self, url):

return data

-    def _extract_linkedin_profiles(self, followers):
-        linkedin_profiles = []
-        if followers is None:
-            logger.error("Followers list is None, cannot scrape LinkedIn profiles.")
-            return linkedin_profiles
-
-        for follower in followers:
-            if follower is None:
-                logger.warning("Encountered None follower, skipping.")
-                continue
-
-            social_accounts_url = f"https://api.github.com/users/{follower['login']}/social_accounts"
-            response = self._make_request('GET', social_accounts_url)
-            if response.status_code == 200:
-                social_accounts = response.json()
-                linkedin_url = next((account['url'] for account in social_accounts if 'linkedin.com' in account['url']), None)
-                if linkedin_url:
-                    linkedin_profiles.append({'github_username': follower['login'], 'linkedin_url': linkedin_url})
-        return linkedin_profiles
-
def _update_jsonl_file(self, linkedin_profiles):
valid_profiles = [profile for profile in linkedin_profiles if self._is_valid_linkedin(profile['linkedin_url'])]
with open(self.jsonl_file, 'w') as file:
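
The method removed here is nearly identical to the _extract_X_profiles added in src/core/scrapper/x.py below; only the matched domain and the output key differ. A shared helper would avoid carrying two copies. A sketch, not part of this commit, assuming the retrying _make_request helper and the logger import that both scraper modules already have:

    def _extract_social_profiles(self, followers, domain, url_key):
        # Generic form of _extract_linkedin_profiles / _extract_X_profiles:
        # fetch each follower's GitHub social accounts and keep the first
        # URL that contains the given domain.
        profiles = []
        if followers is None:
            logger.error("Followers list is None, cannot scrape profiles.")
            return profiles
        for follower in followers:
            if follower is None:
                continue
            url = f"https://api.github.com/users/{follower['login']}/social_accounts"
            response = self._make_request('GET', url)
            if response is not None and response.status_code == 200:
                match = next((a['url'] for a in response.json() if domain in a['url']), None)
                if match:
                    profiles.append({'github_username': follower['login'], url_key: match})
        return profiles

With this in place, _extract_X_profiles(followers) reduces to return self._extract_social_profiles(followers, 'x.com', 'X_url').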
167 changes: 167 additions & 0 deletions src/core/scrapper/x.py
@@ -0,0 +1,167 @@
import json
import time
import requests
from src.utils.logger import logger

class XScraper:
def __init__(self, config, max_accounts):
self.github_token = config.get_api_key()
self.headers = {
'Authorization': f'token {self.github_token}',
'Accept': 'application/vnd.github.v3+json'
}
self.jsonl_file = "X_profiles.jsonl"
self.max_accounts = max_accounts

    def get_github_followers(self, username, max_accounts):
        url = f'https://api.github.com/users/{username}/followers'
        self.max_accounts = max_accounts
        followers = self._get_paginated_data(url)
        X_profiles = self._extract_X_profiles(followers)
        self._update_jsonl_file(X_profiles)
        return followers  # callers such as MainFollowUnfollow.x_profiles consume this list

def _get_paginated_data(self, url):
page = 1
data = []
while True:
paginated_url = f'{url}?page={page}'
response = self._make_request('GET', paginated_url)
            if response is None or response.status_code != 200:
                status = response.status_code if response is not None else 'no response'
                raise Exception(f"Error fetching data from {paginated_url}. Status code: {status}")

page_data = response.json()
if not page_data:
break

data.extend(page_data)
if len(data) >= self.max_accounts:
data = data[:self.max_accounts]
break

page += 1

return data

def _update_jsonl_file(self, X_profiles):
valid_profiles = [profile for profile in X_profiles if self._is_valid_X(profile['X_url'])]
with open(self.jsonl_file, 'w') as file:
for profile in valid_profiles:
file.write(json.dumps(profile) + '\n')

def _is_valid_X(self, X_url):
# Placeholder for actual X URL validation logic
return True

def _make_request(self, method, url):
for _ in range(3):
try:
response = requests.request(method, url, headers=self.headers)
if response.status_code in [500, 429]:
time.sleep(10)
continue
return response
except requests.RequestException as e:
logger.error(f"Request to {url} failed: {e}")
time.sleep(10)
return None

def _extract_X_profiles(self, followers):
X_profiles = []
for follower in followers:
social_accounts_url = f"https://api.github.com/users/{follower['login']}/social_accounts"
response = self._make_request('GET', social_accounts_url)
            if response is not None and response.status_code == 200:
social_accounts = response.json()
X_url = next((account['url'] for account in social_accounts if 'x.com' in account['url']), None)
if X_url:
X_profiles.append({'github_username': follower['login'], 'X_url': X_url})
return X_profiles

def scrape_X_profiles(self, followers):
if followers is None:
logger.error("Followers list is None, cannot scrape X profiles.")
return

X_profiles = []
existing_profiles = self.load_existing_profiles()

for follower in followers:
username = follower['login']
X_url = f"https://x.com/{username}"

if X_url in existing_profiles:
logger.info(f"X profile for {username} already exists, skipping.")
continue

X_profiles.append({
"github_username": username,
"X_url": X_url
})

if len(X_profiles) >= self.max_accounts:
break

self.save_profiles_to_jsonl(X_profiles)

def load_existing_profiles(self):
existing_profiles = set()
try:
with open(self.jsonl_file, 'r') as file:
for line in file:
data = json.loads(line.strip())
existing_profiles.add(data['X_url'])
except FileNotFoundError:
pass
return existing_profiles

def save_profiles_to_jsonl(self, profiles):
with open(self.jsonl_file, 'a') as file:
for profile in profiles:
json.dump(profile, file)
file.write('\n')
logger.info(f"Saved {len(profiles)} new X profiles to {self.jsonl_file}.")
Empty file added src/core/undo/__init__.py
Empty file.
File renamed without changes.
74 changes: 74 additions & 0 deletions src/core/undo/unstar.py
@@ -0,0 +1,74 @@
import json
import time
from src.utils.logger import logger
import requests

class GitHubClientUnstar:
def __init__(self, config, username):
self.token = config.get_api_key()
self.headers = {
'Authorization': f'token {self.token}',
'Accept': 'application/vnd.github.v3+json'
}
self.username = username

def get_starred_repos(self):
return self._get_paginated_data(f'https://api.github.com/users/{self.username}/starred')

def get_followers(self):
return self._get_paginated_data(f'https://api.github.com/users/{self.username}/followers')

def _get_paginated_data(self, url):
page = 1
data = []
while True:
paginated_url = f'{url}?page={page}'
response = self._make_request('GET', paginated_url)

if response.status_code != 200:
raise Exception(f"Error fetching data from {paginated_url}. Status code: {response.status_code}")

page_data = response.json()
if not page_data:
break # No more data to fetch

data.extend(page_data)
page += 1

return data

def _make_request(self, method, url):
max_retries = 3
for attempt in range(max_retries):
try:
if method == 'GET':
response = requests.get(url, headers=self.headers)
elif method == 'DELETE':
response = requests.delete(url, headers=self.headers)
else:
raise ValueError("Unsupported HTTP method")
return response
except requests.RequestException as e:
logger.error(f"Request failed: {e}")
time.sleep(2 ** attempt)
raise Exception(f"Failed to {method} {url} after {max_retries} attempts")

def unstar_repo(self, owner, repo):
url = f'https://api.github.com/user/starred/{owner}/{repo}'
response = self._make_request('DELETE', url)
if response.status_code == 204:
logger.info(f"Successfully unstarred {owner}/{repo}")
else:
logger.error(f"Failed to unstar {owner}/{repo}. Status code: {response.status_code}")

def unstar_non_followers_repos(self):
followers = {follower['login'] for follower in self.get_followers()}
starred_repos = self.get_starred_repos()

for repo in starred_repos:
owner = repo['owner']['login']
            if owner == "Errahum":  # hardcoded safeguard: the project author's repos stay starred
                logger.info(f"Skipping {owner}'s repo as it should never be unstarred")
                continue
if owner not in followers:
self.unstar_repo(owner, repo['name'])
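
_get_paginated_data above fetches GitHub's default page size of 30 items. The REST API accepts per_page values up to 100, which cuts the number of requests for large follower or star lists roughly threefold. A sketch of that variant, behavior otherwise unchanged:

    def _get_paginated_data(self, url):
        # Same pagination loop as above, but requests the API maximum of
        # 100 items per page instead of the implicit default of 30.
        page = 1
        data = []
        while True:
            response = self._make_request('GET', f'{url}?per_page=100&page={page}')
            if response.status_code != 200:
                raise Exception(f"Error fetching {url} page {page}. Status code: {response.status_code}")
            page_data = response.json()
            if not page_data:
                break
            data.extend(page_data)
            page += 1
        return data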
27 changes: 22 additions & 5 deletions src/manager_follow_unfollow.py
@@ -1,11 +1,14 @@
import json

-from src.core.follow import GitHubClientFollow, FollowerManager, extract_username_from_url
-from src.core.follow_back import GitHubClientFollowBack, FollowBackFollowers
-from src.core.get_following import GitHubClientGetFollowings
-from src.core.unfollow import UnfollowBot, GitHubClientUnfollow
+from src.core.follow.follow import GitHubClientFollow, FollowerManager, extract_username_from_url
+from src.core.follow.follow_back import GitHubClientFollowBack, FollowBackFollowers
+from src.core.follow.get_following import GitHubClientGetFollowings
+from src.core.undo.unfollow import UnfollowBot, GitHubClientUnfollow
from src.utils.logger import logger
-from src.core.linkedin import GitHubLinkedInScraper
+from src.core.scrapper.linkedin import GitHubLinkedInScraper
+from src.core.scrapper.x import XScraper
+from src.core.undo.unstar import GitHubClientUnstar


class MainFollowUnfollow:
def __init__(self, config):
@@ -19,6 +22,8 @@ def __init__(self, config):
self.github_client_follow_back = GitHubClientFollowBack(config)
self.github_client_unfollow = UnfollowBot(self.github_client_unfollow, self.username)
self.linkedin_scraper = GitHubLinkedInScraper(config, max_accounts=0)
+        self.x_scraper = XScraper(config, max_accounts=0)
+        self.github_client_unstar = GitHubClientUnstar(config, self.username)

def follow_people(self):
profile_url = input("Enter the GitHub profile URL: ")
@@ -89,3 +94,15 @@ def linkedin_profiles(self):
else:
followers = self.linkedin_scraper.get_github_followers(self.username, max_accounts)
self.linkedin_scraper.scrape_linkedin_profiles(followers)

+    def x_profiles(self):
+        try:
+            max_accounts = int(input("Enter the maximum number of X accounts: "))
+        except ValueError:
+            logger.error("Invalid number of accounts.")
+            return
+        if max_accounts <= 0:
+            logger.error("Invalid number of accounts.")
+        else:
+            followers = self.x_scraper.get_github_followers(self.username, max_accounts)
+            self.x_scraper.scrape_X_profiles(followers)
+
+    def unstar_non_followers_repos(self):
+        self.github_client_unstar.unstar_non_followers_repos()
+        logger.info("Unstar process complete.")
