import os
import urllib
import urllib.request
import bs4
from bs4 import BeautifulSoup
def download_html(url):
    """Download the page at *url* and save it as ``data.html`` in the current directory.

    The raw response bytes are written unchanged, so the saved copy can be
    parsed offline while debugging BeautifulSoup code, e.g.::

        soup = BeautifulSoup(open("data.html", encoding="utf-8"), 'lxml')

    :param url: address of the web page to download
    :return: None
    :raises urllib.error.URLError: if the request cannot be completed
    """
    req = urllib.request.Request(url)
    # Send a browser-like User-Agent header with the request.
    req.add_header('User-Agent',
                   'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.93 Safari/537.36')
    # req.add_header('Cookie', 'UM_dis3aa97')  # override the cookie if needed

    # os.path.join picks the right separator for the platform; the previous
    # hard-coded '\\' only produced a file inside the cwd on Windows.
    path = os.path.join(os.getcwd(), 'data.html')

    # Context managers close both the HTTP response and the output file,
    # even when reading or writing fails part-way through.
    with urllib.request.urlopen(req) as response:
        with open(path, "wb") as fo:
            fo.write(response.read())
#
def to_html_by_response(response, name="data.html"):
    """Write the body of *response* to a file named *name* in the current directory.

    :param response: object exposing ``read()`` that returns bytes
        (typically an ``http.client.HTTPResponse``); it is not closed here
    :param name: file name to create inside the current working directory
    :return: None
    """
    # Join with a real path separator: plain string concatenation glued the
    # file name onto the directory name and wrote into the parent directory.
    path = os.path.join(os.getcwd(), name)
    # The context manager guarantees the file is closed even if the write fails.
    with open(path, "wb") as fo:
        fo.write(response.read())
def bs4_local_html(filename) -> bs4.BeautifulSoup:
    """Parse a local HTML file and return it as a BeautifulSoup document.

    :param filename: path of the HTML file to read
    :return: the document parsed with the built-in ``html.parser`` backend
    :raises OSError: if *filename* cannot be opened
    """
    # Open the file the caller asked for; the previous version ignored
    # *filename* and always read 'baike.html'.
    # Read as bytes so BeautifulSoup performs the encoding detection itself.
    with open(filename, 'rb') as f:
        doc_html = f.read()
    return BeautifulSoup(doc_html, "html.parser")
if __name__ == "__main__":
    # Script entry point: download one sample page so that it can be
    # inspected and parsed locally afterwards.
    url = "https://lishi.tianqi.com/wuhan/202105.html"
    download_html(url=url)