Skip to content

Commit

Permalink
add macos intel and macos m1 support
Browse files Browse the repository at this point in the history
  • Loading branch information
邹九 authored and 邹九 committed Dec 24, 2024
1 parent 1fd3f56 commit 65fae68
Show file tree
Hide file tree
Showing 2 changed files with 34 additions and 10 deletions.
2 changes: 2 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,8 @@
3、保存**想法**到text并保存相应的图片,最后将所有text汇总到一个档案<br>

## LOG
-----------20241224 add macOS Intel and macOS M1 support

-----------20230729 使用beautifulsoup库来解析网页,使用起来更加稳定,而且排版更加贴近网页;之前使用的是selenium

-----------202306 上传
Expand Down
42 changes: 32 additions & 10 deletions crawler.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@
from zipfile import ZipFile
from bs4 import BeautifulSoup
import re
import platform

abspath = os.path.abspath(__file__)
filename = abspath.split(os.sep)[-1]
Expand Down Expand Up @@ -280,6 +281,7 @@ def crawl_think_links(driver:webdriver, username:str):
clock = clock[3 + 1:].replace(":", "_")
dirthink = os.path.join(thinkdir, clock)
if os.path.exists(dirthink):
print(f"{dirthink}已经爬取过了,不再重复爬取")
continue
os.makedirs(dirthink, exist_ok=True)
try:
Expand Down Expand Up @@ -357,8 +359,8 @@ def cleartxt(kkk):
def parser_beautiful(innerHTML, article, number, dircrea, bk=False):
if not innerHTML:
return article, number
if bk:
article += "**"
# if bk:
# article += "**"
if isinstance(innerHTML, str):
article += innerHTML.text
return article, number
Expand Down Expand Up @@ -504,8 +506,8 @@ def parser_beautiful(innerHTML, article, number, dircrea, bk=False):
else:
article, number = parser_beautiful(chi, article, number, dircrea, bk)
article += "\n\n"
if bk:
article += "**"
# if bk:
# article += "**"
article = article.replace("\n\n\n\n\n", "\n\n")
article = article.replace("\n\n\n\n", "\n\n")
article = article.replace("\n\n\n", "\n\n")
Expand Down Expand Up @@ -694,6 +696,7 @@ def crawl_article_detail(driver:webdriver):
if kkk > 0:
break
if kkk > 0:
print(f"{os.path.join(dircol, j)}已经爬取过了,不再重复爬取")
continue
dircrea = os.path.join(articledir, temp_name)
os.makedirs(dircrea, exist_ok = True)
Expand Down Expand Up @@ -911,6 +914,7 @@ def crawl_answer_detail(driver:webdriver):
if kkk > 0:
break
if kkk > 0:
print(f"{os.path.join(dircol, j)}已经爬取过了,不再重复爬取")
continue

dircrea = os.path.join(answerdir, temp_name)
Expand Down Expand Up @@ -1097,14 +1101,26 @@ def downloaddriver():
ret = BeautifulSoup(ret.content, 'html.parser')
# divall = ret.find_all('div', class_=r'common-card--lightblue')
ddl = ret.find_all('a')
name = "msedgedriver.exe"
for k in ddl:
key = k.attrs.keys()
if 'href' not in key:
continue
href = k.attrs['href']
if 'href' in key and "win64" in href and ".zip" in href:
url = href
break
if 'darwin' not in sys.platform:
if 'href' in key and "win64" in href and ".zip" in href:
url = href
break
elif 'darwin' in sys.platform and 'arm' not in platform.processor():
if 'href' in key and "mac64" in href and "m1" not in href and ".zip" in href:
url = href
name = "msedgedriver"
break
elif 'darwin' in sys.platform and 'arm' in platform.processor():
if 'href' in key and "mac64_m1" in href and ".zip" in href:
url = href
name = "msedgedriver"
break
response = requests.get(url)
if response.status_code==200:
with open(os.path.join(abspath, 'msedgedriver/edgedriver.zip'), 'wb') as obj:
Expand All @@ -1115,12 +1131,15 @@ def downloaddriver():
for r, d, f in os.walk(nth):
kk = 6
for i in f:
if 'driver' in i and '.exe' in i:
if 'driver' in i and '.zip' not in i:
try:
shutil.move(os.path.join(r, i), os.path.join(nth, i))
except:
pass
os.rename(os.path.join(nth, i), os.path.join(nth, "msedgedriver.exe"))
os.rename(os.path.join(nth, i), os.path.join(nth, name))
if 'darwin' in sys.platform:
print(f"\n\n请执行权限操作再继续执行:\nchmod +x {os.path.join(nth, name)}\n")
exit(0)
kk = -6
break
if kk < 0:
Expand Down Expand Up @@ -1188,7 +1207,10 @@ def zhihu():

if __name__ == "__main__":
#version four.one_zero.zero
driverpath = os.path.join(abspath, 'msedgedriver\msedgedriver.exe')
if 'darwin' not in sys.platform:
driverpath = os.path.join(abspath, 'msedgedriver' +os.sep + 'msedgedriver.exe')
else:
driverpath = os.path.join(abspath, 'msedgedriver' +os.sep + 'msedgedriver')
savepath = deepcopy(abspath)
cookiedir = os.path.join(savepath, 'cookie')
thinkdir = os.path.join(savepath, 'think')
Expand Down

0 comments on commit 65fae68

Please sign in to comment.