diff --git a/models/github_repo.py b/models/github_repo.py
index 2a3c7794..e20fbb93 100644
--- a/models/github_repo.py
+++ b/models/github_repo.py
@@ -20,6 +20,7 @@
 from util import elapsed
 from util import safe_commit
 from time import time
+import datetime
 import ast
 import subprocess
 import re
@@ -48,6 +49,9 @@ class GithubRepo(db.Model):
     lib_matches_raw = deferred(db.Column(JSONB))
     lib_matches_final = deferred(db.Column(JSONB))
 
+    created = db.Column(db.DateTime)
+    updated = db.Column(db.DateTime)
+
     # old, and removed from current database.  only in backups of database.
     # requirements = db.Column(JSONB)
     # reqs_file = deferred(db.Column(db.Text))
@@ -62,6 +66,18 @@ class GithubRepo(db.Model):
     # setup_py_no_forks = deferred(db.Column(db.Text))
 
 
+    def __init__(self, login=None, repo_name=None, language=None):
+        """Build a repo row keyed by "login:repo_name", stamped with the current UTC time."""
+        self.login = login
+        self.repo_name = repo_name
+        self.language = language
+        self.id = u'{}:{}'.format(self.login, repo_name)
+        self.created = datetime.datetime.utcnow()
+        # reuse the same instant so a brand-new row has created == updated
+        self.updated = self.created
+        super(GithubRepo, self).__init__()
+
+
     def __repr__(self):
         return u''.format(
             language=self.language, login=self.login, repo_name=self.repo_name)
diff --git a/refresh.py b/refresh.py
index 2b0fb112..2b32367c 100644
--- a/refresh.py
+++ b/refresh.py
@@ -1,10 +1,12 @@
 import requests
-from lxml import html
 import argparse
+import datetime
 
 from app import db
 from models.cran_package import CranPackage
 from models.pypi_package import PypiPackage
+from models.github_repo import GithubRepo
+from models.github_api import make_ratelimited_call
 import update
 from util import safe_commit
 
@@ -29,15 +31,43 @@ def add_all_new_packages(package_class):
 
 
 def add_all_new_github_repos(language):
+    """Add GitHub repos for `language` that are not yet in the database.
+
+    Walks backwards one day at a time from now to 2015-11-01, asks the
+    GitHub search API for repos created in each one-day window, and adds
+    every returned repo whose id is not already known.
+    """
+    all_current_github_repo_rows = db.session.query(GithubRepo.id).filter(GithubRepo.language==language).all()
+    # a set keeps the per-repo membership test below O(1)
+    all_current_github_repo_ids = set(row[0] for row in all_current_github_repo_rows)
+
+    end_date = datetime.datetime(2015, 11, 1)
+    start_date = datetime.datetime.utcnow()
     date = start_date
-    while date <= end_date:
-        next_date = date + timdelta(days=1)
-        url_template = "https://api.github.com/search/repositories?q=created:%22{date}%20..%20{next_date}%22%20language:{language}&per_page=1000&sort=forks&order=desc"
+    while date >= end_date:
+        prev_date = date - datetime.timedelta(days=1)
+        # The sort field. One of stars, forks, or updated.
+        # max of 100 returned per page, so ask for exactly that
+        # authenticated rate limit: 30/min
+        url_template = "https://api.github.com/search/repositories?q=created:%22{prev_date}%20..%20{date}%22%20language:{language}&per_page=100&sort=stars&order=desc"
         url = url_template.format(
-            language=language, date=date, next_date=next_date)
-        r = requests.get(url)
-        data = r.json()
-        date = next_date
+            language=language, date=date.isoformat()[0:10], prev_date=prev_date.isoformat()[0:10])
+        print url
+        data = make_ratelimited_call(url)
+        print date.isoformat()[0:10], data["total_count"], data["incomplete_results"]
+        for repo_dict in data["items"]:
+            new_repo = GithubRepo(login=repo_dict["owner"]["login"], repo_name=repo_dict["name"], language=language)
+            new_repo.api_raw = repo_dict
+            print "new_repo:", new_repo
+            if new_repo.id not in all_current_github_repo_ids:
+                print "added new repo from {}: {}\n".format(date.isoformat()[0:10], new_repo.id)
+                db.session.add(new_repo)
+                all_current_github_repo_ids.add(new_repo.id)
+        safe_commit(db)
+        # step back only after logging, so messages name the window just queried
+        date = prev_date
+
+
 
 
 def recalculate_everything(parsed_args):
@@ -51,6 +81,18 @@ def recalculate_everything(parsed_args):
     update.run_update(parsed_args)
 
 
+def refresh(parsed_args):
+    """Set parsed_args.fn to "<PackageClass>.refresh" for the chosen language and run the update."""
+    if parsed_args.language=="r":
+        package_class = CranPackage
+    else:
+        package_class = PypiPackage
+
+    parsed_args.fn = u"{}.refresh".format(package_class.__name__)
+    print "parsed_args.fn", parsed_args.fn
+    update.run_update(parsed_args)
+
+
 if __name__ == '__main__':
 
     parser = argparse.ArgumentParser(description="Run stuff.")
@@ -58,8 +100,12 @@ def recalculate_everything(parsed_args):
 
     parsed_args = update.parse_update_optional_args(parser)
 
+    add_all_new_github_repos(parsed_args.language)
+
+
+    # add_all_new_packages(PypiPackage)
     # add_all_new_packages(CranPackage)
-    add_all_new_packages(PypiPackage)
+
 
     # start_date = ""
     # end_date = ""
diff --git a/test/utils.py b/test/utils.py
index 50265c13..d4b1991a 100644
--- a/test/utils.py
+++ b/test/utils.py
@@ -1,5 +1,5 @@
 from sqlalchemy.exc import OperationalError
-from sqlalchemy.sql import text
+from sqlalchemy.sql import text
 import redis
 import os
 
diff --git a/util.py b/util.py
index b57a1f19..be274a7b 100644
--- a/util.py
+++ b/util.py
@@ -1,5 +1,6 @@
 import time
 import bisect
+import sqlalchemy
 
 from app import db
 
@@ -135,8 +136,8 @@ def safe_commit(db):
     except sqlalchemy.exc.DataError:
         db.session.rollback()
         print u"sqlalchemy.exc.DataError on commit. rolling back."
-    except Exception:
-        db.session.rollback()
+    except Exception as e:
+        print "error", e
         print u"generic exception in commit. rolling back."
-        logging.exception("commit error")
+        db.session.rollback()
     return False
\ No newline at end of file