[Small Tool] Scraping vendor domains from the Butian public-welfare SRC

The code is taken from https://github.com/saucer-man/penetration-script/blob/master/butian_spider.py

I made a small change so that the output contains only each vendor's root domain, which makes the follow-up domain collection easier.
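Concretely, the change is in parse_page: instead of writing out the vendor name and full URL, the scraped website is passed through tldextract and only the registered (root) domain is saved. A minimal sketch of that behavior, using a made-up domain for illustration:

import tldextract

# tldextract splits a URL into subdomain / domain / public suffix,
# so registered_domain gives back just the vendor's root domain
val = tldextract.extract("https://bbs.example.com.cn/forum.php")
print(val.registered_domain)  # -> example.com.cn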

#!/usr/bin/python3
# -*- coding: utf-8 -*-


import ssl
import time
import warnings
import tldextract

import requests
from bs4 import BeautifulSoup

warnings.filterwarnings("ignore")
ssl._create_default_https_context = ssl._create_unverified_context

headers = {
    'Host': 'www.butian.net',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:89.0) Gecko/20100101 Firefox/89.0',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
    'Accept-Language': 'zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2',
    'Accept-Encoding': 'gzip, deflate',
    'Connection': 'close',
    'Referer': 'https://www.butian.net/Reward/plan/2',
    'Cookie': 'YourCookie',
    'Upgrade-Insecure-Requests': '1',
    'Pragma': 'no-cache',
    'Cache-Control': 'no-cache'
}


def parse_page(data):
    datas = data['data']['list']
    for d in datas:
        # Build the vendor detail URL and scrape the vendor's website from it
        url = "https://www.butian.net/Loo/submit?cid=" + str(d['company_id'])
        website = parse_submit(url)
        # Keep only the registered (root) domain of the vendor's website
        val = tldextract.extract(website)
        # save(url + " " + d['company_name'] + " " + website)
        save(val.registered_domain)
        time.sleep(1)


def parse_submit(url):
    try:
        # Fetch the submission form and read the vendor URL out of the second input field
        webdata = requests.get(url=url, headers=headers)
        webdata.encoding = webdata.apparent_encoding
        soup = BeautifulSoup(webdata.text, 'html.parser')
        website = soup.find_all('input', class_='input-xlarge')[1]['value']
        return website
    except Exception as e:
        # On failure, wait briefly and retry the same URL
        print(f"Error scraping {url}: {e}")
        time.sleep(2)
        return parse_submit(url)


def save(msg):
    with open("result.txt", "a", encoding='utf-8') as f:
        f.write(msg)
        f.write("\n")


def get_end_page():
    """
    Check how many pages there are in total
    :return: int
    """
    url = 'https://www.butian.net/Reward/pub'
    data = {
        's': 1,
        'p': 1
    }
    r = requests.post(url=url, data=data)
    return int(r.json()["data"]["count"])


def main():
    end_page = get_end_page()
    for i in range(1, end_page + 1):
        print(f'Fetching page {i}')
        url = 'https://www.butian.net/Reward/pub'
        data = {
            's': 1,
            'p': i
        }
        r = requests.post(url=url, data=data)
        parse_page(r.json())
        time.sleep(3)


if __name__ == '__main__':
    main()
    print("Finished fetching Butian public SRC vendor root domains...")