import ssl
import time
import warnings

import requests
import tldextract
from bs4 import BeautifulSoup

# Suppress warnings and disable default TLS certificate verification
# (insecure, but kept from the original script).
warnings.filterwarnings("ignore")
ssl._create_default_https_context = ssl._create_unverified_context

# Request headers for butian.net; 'YourCookie' is a placeholder for the
# Cookie value of a logged-in session.
headers = {
    'Host': 'www.butian.net',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:89.0) Gecko/20100101 Firefox/89.0',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
    'Accept-Language': 'zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2',
    'Accept-Encoding': 'gzip, deflate',
    'Connection': 'close',
    'Referer': 'https://www.butian.net/Reward/plan/2',
    'Cookie': 'YourCookie',
    'Upgrade-Insecure-Requests': '1',
    'Pragma': 'no-cache',
    'Cache-Control': 'no-cache',
}

def parse_page(data):
    """Walk one page of the vendor list and save each registered domain."""
    datas = data['data']['list']
    for d in datas:
        # str() guards against company_id arriving as a number in the JSON.
        url = "https://www.butian.net/Loo/submit?cid=" + str(d['company_id'])
        website = parse_submit(url)
        val = tldextract.extract(website)
        save(val.registered_domain)
        time.sleep(1)

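# tldextract resolves domains against the Public Suffix List, so
# .registered_domain reduces a full URL to its registrable domain even for
# multi-part suffixes. An illustrative (made-up) example:
#   tldextract.extract("http://bbs.example.com.cn/forum").registered_domain
#   -> 'example.com.cn'
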
def parse_submit(url):
    """Fetch a company's submit page and pull its website out of the form."""
    try:
        webdata = requests.get(url=url, headers=headers)
        webdata.encoding = webdata.apparent_encoding
        soup = BeautifulSoup(webdata.text, 'html.parser')
        # The second 'input-xlarge' field on the submit form holds the site URL.
        website = soup.find_all('input', class_='input-xlarge')[1]['value']
        return website
    except Exception as e:
        print(f"Error crawling {url}: {e}")
        time.sleep(2)
        # Retry indefinitely by recursing on the same URL.
        return parse_submit(url)

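# parse_submit retries forever through unbounded recursion, which can overflow
# the stack if a page never loads. A minimal bounded variant, as a sketch: the
# name parse_submit_safe and the retry/timeout values are mine, not the
# original script's.
def parse_submit_safe(url, retries=3):
    for attempt in range(1, retries + 1):
        try:
            webdata = requests.get(url=url, headers=headers, timeout=10)
            webdata.encoding = webdata.apparent_encoding
            soup = BeautifulSoup(webdata.text, 'html.parser')
            return soup.find_all('input', class_='input-xlarge')[1]['value']
        except Exception as e:
            print(f"Error crawling {url} (attempt {attempt}/{retries}): {e}")
            time.sleep(2)
    return None  # caller should skip this entry when None is returned
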
def save(msg):
    """Append one line to result.txt."""
    with open("result.txt", "a", encoding='utf-8') as f:
        f.write(msg)
        f.write("\n")

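# save() appends blindly, so duplicate domains accumulate when companies share
# a registrable domain. A small post-processing pass (a sketch, assuming
# result.txt fits in memory; dedupe_results is my own helper):
def dedupe_results(path="result.txt"):
    with open(path, encoding='utf-8') as f:
        unique = sorted({line.strip() for line in f if line.strip()})
    with open(path, "w", encoding='utf-8') as f:
        f.write("\n".join(unique) + "\n")
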
def get_end_page():
    """
    Check how many pages the vendor list has in total.
    :return: int
    """
    url = 'https://www.butian.net/Reward/pub'
    data = {
        's': 1,
        'p': 1
    }
    r = requests.post(url=url, data=data)
    return int(r.json()["data"]["count"])

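# The code assumes /Reward/pub returns JSON shaped roughly like
#   {"data": {"count": <last page>, "list": [{"company_id": ...}, ...]}}
# with "count" read as the page total that main() iterates over; the exact
# schema is inferred from the calls above, not from API documentation.
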
def main():
    end_page = get_end_page()
    for i in range(1, end_page + 1):
        print(f'Fetching page {i}')
        url = 'https://www.butian.net/Reward/pub'
        data = {
            's': 1,
            'p': i
        }
        r = requests.post(url=url, data=data)
        parse_page(r.json())
        time.sleep(3)

if __name__ == '__main__':
    main()
    print("Finished collecting Butian public-welfare SRC vendor URLs...")
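
# Usage (assumed; the filename is illustrative): fill in the Cookie header
# above, then run
#   python butian_src.py
# and the registered domains are appended to result.txt, one per line.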