-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathgenerator.py
143 lines (111 loc) · 5.67 KB
/
generator.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
import argparse
import asyncio
import http
import logging
from typing import Sequence
import re
import aiohttp
from tqdm import tqdm
from more_itertools import divide
# Module-level logger; basicConfig() configures the root logger at INFO level
# as an import-time side effect.
log = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)
# ClickHouse HTTP endpoint that is queried for the PyPI download ranking.
# NOTE(review): `user=play` is given both in this URL's query string and in
# CLICK_PARAMS below - confirm whether both are required.
CLICK_HOST = 'https://clickpy-clickhouse.clickhouse.com/?user=play'
CLICK_PARAMS = {'user': 'play'}
# SQL template sent as the request body; `%s` is replaced with the row limit.
# Projects are ordered by their summed download count, highest first, and the
# result is requested in ClickHouse's JSONCompactColumns format.
CLICK_QUERY = 'SELECT project FROM pypi.pypi_downloads GROUP BY project ORDER BY sum(count) DESC LIMIT %s FORMAT JSONCompactColumns'
# Packages that must not be installed, mapped to a human-readable reason.
# A key is either an exact package name (str) or a compiled regular expression
# that is applied to the package name with `.match()`. consumer() writes every
# matching package to the requirements file as a commented-out line.
BROKEN_MODULES = {
'apache-beam': 'https://github.com/astral-sh/uv/issues/3078',
'awscli': 'The pip is having problems with awscli, see '
'https://github.com/alphavector/all/actions/runs/11203830224/job/31141517215',
'awscli-cwlogs': 'The pip is having problems with awscli, see '
'https://github.com/alphavector/all/actions/runs/11203830224/job/31141517215',
'thrift': 'The uv has problems with thrift at a certain dependency order '
'(https://github.com/alphavector/all/actions/runs/11204390601/job/31142712430)'
', but the 1 time pass executed successfully '
'https://github.com/alphavector/all/actions/runs/11203830224/job/31141517284#step:5:569',
'pycrypto': 'pip: src/_fastmath.c:33:10: fatal error: longintrepr.h: No such file or directory',
'backports-zoneinfo': 'pip: lib/zoneinfo_module.c:600:19: error: ‘_PyLong_One’ undeclared (first use in this function); did you mean ‘_PyLong_New’?',
'sklearn': "The 'sklearn' PyPI package is deprecated, use 'scikit-learn' rather than 'sklearn' for pip commands.",
'pathtools': "ModuleNotFoundError: No module named 'imp'",
'functools32': 'This backport is for Python 2.7 only.',
'tfx-bsl': 'Ignored the following versions that require a different python version: ... Requires-Python',
'tensorflow-data-validation': 'Ignored the following versions that require a different python version: ... Requires-Python',
'pywin32': 'ignore windows modules',
# tensorflow: regex key, covers every package whose name starts with "tensorflow"
re.compile(r'^tensorflow.*'): ' tensorflow-text: No matching distribution found for tensorflow-text<=2.17.0',
'flask-appbuilder': 'flask-appbuilder==2.1.4 has a bug',
'bokeh': "AttributeError: module 'configparser' has no attribute 'SafeConfigParser'. Did you mean: 'RawConfigParser'?, "
"uv did it, but the others didn't, very strange.",
# Azure: regex key, covers every package whose name starts with "azure"
re.compile(r'^azure.*'): 'azure-cli: jsmin==2.2.2: error in jsmin setup command: use_2to3 is invalid.',
'great-expectations': "AttributeError: module 'configparser' has no attribute 'SafeConfigParser'. Did you mean: 'RawConfigParser'?",
'tornado': "tornado-5.1.1: Tornado requires an up-to-date SSL module. This means Python 2.7.9+ or 3.4+",
}
# Packages that get a hand-written version specifier instead of the default
# `<=<latest version>`; consumer() appends the value directly after the name.
EXTERNAL_MODULES = {
'pyyaml': '!=6.0.0,!=5.4.0,!=5.4.1', # these pyyaml releases are broken with Cython 3
}
# Base URL of PyPI; get_latest_version() requests `/pypi/<name>/json` relative to it.
PYPI_API_BASE_URL = 'https://pypi.org'
async def get_latest_version(pbar: tqdm, q: asyncio.Queue, batch: Sequence[str]):
    """Look up every package of *batch* on PyPI and enqueue its latest version.

    For each name the JSON document at ``/pypi/<name>/json`` is requested.
    When the response status is 200 and the document's ``urls`` list is not
    empty, the tuple ``(name, latest_version)`` is put on *q*. Packages that
    fail either check are skipped without being reported.

    :param pbar: progress bar; advanced once per package and closed at the end.
    :param q: queue that receives the ``(name, version)`` tuples.
    :param batch: package names handled by this worker.
    """
    async with aiohttp.ClientSession(base_url=PYPI_API_BASE_URL) as session:
        for package in batch:
            async with session.get(f'/pypi/{package}/json') as response:
                if response.status == http.HTTPStatus.OK:
                    payload = await response.json()
                    # An empty `urls` list means there is nothing to install
                    # for this project (e.g. 2c.py), so it is ignored.
                    if payload['urls']:
                        version = payload['info']['version']
                        await q.put((package, version))
                # Every package counts towards progress, skipped or not.
                pbar.update()
    pbar.close()
def _find_broken_reason(name, broken_modules):
    """Return the reason of the first entry of *broken_modules* matching *name*, or None."""
    for key, reason in broken_modules.items():
        if isinstance(key, re.Pattern):
            if key.match(name):
                return reason
        elif name == key:
            return reason
    return None


async def consumer(q: asyncio.Queue, requirement: str, *, broken_modules=None, external_modules=None):
    """Write one requirements line for every ``(name, latest_version)`` taken from *q*.

    The coroutine loops forever and is meant to be cancelled by its owner once
    the queue has been drained; the output file is closed on cancellation.

    Each package produces exactly one line:

    * ``# <name><=<version> # <reason>`` if the package matches an entry of
      *broken_modules* (the first matching entry, in mapping order, supplies
      the reason);
    * ``<name><specifier>`` if *external_modules* has a specifier for it;
    * ``<name><=<version>`` otherwise.

    :param q: queue of ``(name, latest_version)`` tuples; ``task_done()`` is
        called after each line has been written.
    :param requirement: path of the requirements file to (over)write.
    :param broken_modules: mapping of exact name or compiled regex to reason;
        defaults to the module-level ``BROKEN_MODULES``.
    :param external_modules: mapping of name to version specifier; defaults
        to the module-level ``EXTERNAL_MODULES``.
    """
    if broken_modules is None:
        broken_modules = BROKEN_MODULES
    if external_modules is None:
        external_modules = EXTERNAL_MODULES

    # The reasons contain non-ASCII characters, so do not depend on the
    # locale's default encoding.
    with open(requirement, 'w', encoding='utf-8') as fd:
        while True:
            name, latest_version = await q.get()

            reason = _find_broken_reason(name, broken_modules)
            if reason is not None:
                fd.write(f'# {name}<={latest_version} # {reason}\n')
            elif (custom_version := external_modules.get(name)) is not None:
                fd.write(f'{name}{custom_version}\n')
            else:
                fd.write(f'{name}<={latest_version}\n')

            q.task_done()
async def generator(workers: int, requirement: str, limit: int = 100):
    """Build a requirements file from the most-downloaded PyPI projects.

    The project ranking is fetched from ClickHouse, split into *workers*
    batches that look up the latest versions concurrently, and a single
    consumer task writes the resulting lines to *requirement*.

    :param workers: number of concurrent lookup workers (and progress bars).
    :param requirement: path of the requirements file to write.
    :param limit: number of top projects requested; the query asks for
        ``limit + len(BROKEN_MODULES)`` rows.
    :raises RuntimeError: if the ClickHouse query does not return HTTP 200.
    """
    log.info('Download top %s modules', limit)

    # Over-fetch by the size of the exclusion table - presumably so that about
    # `limit` installable entries remain after the broken ones are commented out.
    actual_limit = limit + len(BROKEN_MODULES)

    async with aiohttp.ClientSession() as session:
        async with session.get(CLICK_HOST, params=CLICK_PARAMS, data=CLICK_QUERY % (actual_limit,)) as resp:
            # Explicit check instead of `assert`, which is stripped under -O.
            if resp.status != http.HTTPStatus.OK:
                raise RuntimeError(f'ClickHouse query failed with HTTP status {resp.status}')
            data: list = await resp.json(content_type='text/plain')

    # The first element of the response holds the project names.
    packages = data[0]
    log.info('total: %s', len(packages))

    batches: tuple[tuple[str, ...], ...] = tuple(tuple(x) for x in divide(workers, packages))
    q = asyncio.Queue()
    pbars = [
        tqdm(total=len(batch), position=i, desc=f'worker {i}', unit='pkgs')
        for i, batch in enumerate(batches)
    ]

    # Keep a reference to the consumer task: the event loop only holds weak
    # references to tasks, and the task has to be cancelled and awaited at the
    # end so that the output file is closed before this coroutine returns.
    writer = asyncio.create_task(consumer(q, requirement=requirement))
    drained = None
    try:
        fetchers = (get_latest_version(pbar, q, batch) for batch, pbar in zip(batches, pbars))
        await asyncio.gather(*fetchers)

        # Wait for the queue to be drained, but also wake up if the consumer
        # terminated (e.g. the file could not be opened); otherwise q.join()
        # would block forever and the error would be lost.
        drained = asyncio.create_task(q.join())
        await asyncio.wait({writer, drained}, return_when=asyncio.FIRST_COMPLETED)
        if writer.done():
            writer.result()  # re-raises the consumer's exception
    finally:
        cleanup = [task for task in (writer, drained) if task is not None]
        for task in cleanup:
            task.cancel()
        # Collect the outcomes (including the expected cancellations).
        await asyncio.gather(*cleanup, return_exceptions=True)

    log.info('Done')
if __name__ == "__main__":
    # Command-line entry point: read the options and run the async pipeline.
    cli = argparse.ArgumentParser()
    cli.add_argument('-w', '--workers', type=int, default=100)
    cli.add_argument('-r', '--requirement', type=str, default='requirements.txt')
    cli.add_argument('-l', '--limit', type=int, default=100)
    options = cli.parse_args()

    asyncio.run(
        generator(
            workers=options.workers,
            requirement=options.requirement,
            limit=options.limit,
        )
    )