-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathChinaz.py
124 lines (108 loc) · 4.39 KB
/
Chinaz.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
import requests as req
from fake_useragent import UserAgent
from bs4 import BeautifulSoup as bs4
# 爬取Chinaz的SEO综合查询信息
def query_chinaz(domain):
url = f"https://seo.chinaz.com/{domain}"
ua = UserAgent()
headers = {
"User-Agent": ua.random
}
try:
resp = req.get(url, headers=headers)
if resp.status_code == 200:
result = get_data(resp.text)
return result
except Exception as e:
print(e)
# 获取icp备案号
def get_ICPNumber(domain):
url = f"https://icp.chinaz.com/index/api/queryPermit"
headers = {
'accept': 'application/json, text/javascript, */*; q=0.01',
'accept-language': 'zh-CN,zh;q=0.9,en;q=0.8',
'content-type': 'application/json',
'cookie': 'cz_statistics_visitor=bde7196a-8a5d-4143-61f3-425596f9eac1; HMACCOUNT=8E13A9A559359C69; qHistory=Ly9pY3AuY2hpbmF6LmNvbS93d3cuZ3h1c3QuZWR1LmNuX+e9keermeWkh+ahiOafpeivog==; Hm_lvt_ca96c3507ee04e182fb6d097cb2a1a4c=1723623453,1725541986; Hm_lpvt_ca96c3507ee04e182fb6d097cb2a1a4c=1725541986; Hm_lvt_32e161892d770dca4a9d436a5764a01a=1725541986; Hm_lpvt_32e161892d770dca4a9d436a5764a01a=1725541986; _clck=n2xwl8%7C2%7Cfox%7C0%7C1687; _clsk=iz77x5%7C1725541987765%7C1%7C1%7Ct.clarity.ms%2Fcollect; JSESSIONID=1C80D90ECE0A4894D1DB1C873DA5D5DF',
'key': '482,896,846,2109',
'origin': 'https://icp.chinaz.com',
'priority': 'u=1, i',
'rd': '482',
'referer': 'https://icp.chinaz.com/gxust.edu.cn',
'sec-ch-ua': '"Chromium";v="128", "Not;A=Brand";v="24", "Google Chrome";v="128"',
'sec-ch-ua-mobile': '?0',
'sec-ch-ua-platform': '"Windows"',
'sec-fetch-dest': 'empty',
'sec-fetch-mode': 'cors',
'sec-fetch-site': 'same-origin',
'token': '0a9de165ee93259c1ce72d44e2fc591b',
'ts': '1725541998465',
'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36',
'x-requested-with': 'XMLHttpRequest'
}
payload = {
"keyword": "gxust.edu.cn"
}
try:
resp = req.post(url, json=payload, headers=headers)
print(resp.status_code)
if resp.status_code == 200:
print(resp.json())
except Exception as e:
print(e)
# 获取HTML内容中的域名备案信息、所属单位等内容
def get_data(html):
# 解析 HTML 文档
soup = bs4(html, 'lxml')
try:
# 查找所有 class 为 color-63 的 <i> 元素
color_63_is = soup.find_all('i', class_='color-63')
all_texts = [i.get_text(strip=True) for i in color_63_is]
if len(all_texts) == 8:
# 域名所属单位
department = all_texts[1]
# 域名所属单位类型
department_type = all_texts[2]
else:
# 域名所属单位
department = all_texts[4]
# 域名所属单位类型
department_type = all_texts[5]
except Exception as e:
print(e)
# 域名所属单位
department = "暂无"
# 域名所属单位类型
department_type = "暂无"
try:
# IP地址 运营商
# <i class="color-63"><a href="//ip.tool.chinaz.com/?ip=202.103.240.11" target="_blank">202.103.240.11[中国广西南宁 电信]</a></i>
if color_63_is[-4]:
ip_location = str(color_63_is[-4]).split(
"</a>")[0].split('"_blank">')[1].split("[")[1].split("]")[0]
else:
ip_location = ""
except Exception as e:
ip_location = "暂无"
try:
span_color_2f87c1 = soup.find_all('i', class_='color-2f87c1')
# <i class="color-2f87c1"><a href="//icp.chinaz.com/nnsw.nanning.gov.cn" target="_blank">桂ICP备18008337号</a></i>
ICPNumber = str(span_color_2f87c1[1]).split(
"</a>")[0].split('"_blank">')[1]
except Exception as e:
ICPNumber = "暂无"
result = {
"department": department,
"department_type": department_type,
"ip_location": ip_location,
"ICPNumber": ICPNumber,
}
return result
if __name__ == "__main__":
# result = query_chinaz("www.tsinghua.edu.cn")
# print(result)
# result = query_chinaz("nnsw.nanning.gov.cn")
# print(result)
# result = query_chinaz("www.bilibili.com")
# print(result)
result = get_ICPNumber("www.10086.cn")
print(result)