# HTML download and conversion helpers

import os
import urllib
import urllib.request
import bs4
from bs4 import BeautifulSoup


def download_html(url):
    """
    Download the page at ``url`` and save it as ``data.html`` in the
    current working directory, so it can be parsed locally with
    BeautifulSoup while debugging.

    Parsing the saved file afterwards::

        soup = BeautifulSoup(open("data.html", encoding="utf-8"), 'lxml')

    :param url: URL of the page to download
    :return: None
    :raises urllib.error.URLError: propagated from ``urlopen`` on failure
    """
    req = urllib.request.Request(url)
    # Send a browser-like User-Agent (presumably because the target site
    # rejects urllib's default agent -- confirm against the server).
    req.add_header('User-Agent',
                   'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.93 Safari/537.36')
    # req.add_header('Cookie', 'UM_dis3aa97')  # override the cookie if needed

    # os.path.join picks the right separator for the platform; the previous
    # hard-coded '\\' only produced a correct path on Windows.
    path = os.path.join(os.getcwd(), 'data.html')

    # The URL is opened first so that no output file is created when the
    # request fails; both handles are closed even if read/write raises.
    with urllib.request.urlopen(req) as response, open(path, "wb") as fo:
        fo.write(response.read())


#
def to_html_by_response(response, name="data.html"):
    """
    Write the body of an already-opened response to a file in the current
    working directory.

    :param response: object with a ``read()`` method returning bytes,
        e.g. ``http.client.HTTPResponse``
    :param name: output file name, resolved against the current working
        directory (an absolute path is used as-is by ``os.path.join``)
    :return: None
    """
    # Join with a real path separator; plain string concatenation used to
    # glue the file name onto the directory name ("<cwd>data.html").
    path = os.path.join(os.getcwd(), name)
    with open(path, "wb") as fo:
        fo.write(response.read())


def bs4_local_html(filename) -> bs4.BeautifulSoup:
    """
    Parse a local HTML file into a BeautifulSoup tree using ``html.parser``.

    :param filename: path of the HTML file to read
    :return: the parsed document
    :raises OSError: if the file cannot be opened
    """
    # Use the caller's file name; it was previously ignored in favour of a
    # hard-coded 'baike.html'.
    # Read raw bytes and hand them to BeautifulSoup undecoded.
    with open(filename, 'rb') as f:
        doc_html = f.read()
    return BeautifulSoup(doc_html, "html.parser")


if __name__ == '__main__':
    # Script entry point: fetch one sample page and store it as data.html
    # in the current working directory.
    download_html("https://lishi.tianqi.com/wuhan/202105.html")