Python爬虫案例七：抓取南京公交信息数据并将其保存成excel多表形式

测试链接:

https://nanjing.8684.cn/line4

思路：先抓取某个类型下某一条线路的所有数据，然后实现批量抓取；列举三个类型作为代表即可。

源码：

from lxml import etree
from xlutils.copy import copy
import requests, os, xlrd, xlwt

def get_all(timeout=10):
    """Download the listing page of every route category and parse it.

    For each (category name, category id) pair the page
    ``https://nanjing.8684.cn/line<id>`` is fetched and its decoded HTML is
    passed to ``parse_all`` together with the category name.

    Args:
        timeout: Seconds to wait for the server before ``requests`` gives up.
            Without a timeout a stalled connection would block the whole
            crawl indefinitely.

    Raises:
        requests.exceptions.RequestException: on connection errors or when
            the timeout expires.
    """
    # Category display names and the numeric suffix used in the listing URL.
    categories = (
        ('江南线路(1-399)', 2),
        ('江北线路(400-699)', 3),
        ('江宁线路(700-999)', 4),
    )
    # Headers and cookies do not depend on the category, so build them once
    # instead of on every loop iteration.
    headers = {
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
        "Accept-Language": "zh-CN,zh;q=0.9",
        "Cache-Control": "no-cache",
        "Connection": "keep-alive",
        "Pragma": "no-cache",
        "Sec-Fetch-Dest": "document",
        "Sec-Fetch-Mode": "navigate",
        "Sec-Fetch-Site": "none",
        "Sec-Fetch-User": "?1",
        "Upgrade-Insecure-Requests": "1",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36",
        "sec-ch-ua-mobile": "?0"
    }
    # NOTE(review): these look like values captured from one browser session;
    # confirm whether the site actually requires them and whether they expire.
    cookies = {
        "JSESSIONID": "48304F9E8D55A9F2F8ACC14B7EC5A02D",
        "wbf__voiceplg-is": "false",
        "tongue": "1",
        "Hm_lvt_c31f95cd1f1c01c74f44d211939ceb8c": "1712659199",
        "__gads": "ID",
        "__gpi": "UID",
        "__eoi": "ID",
        "SECKEY_ABVK": "2DPSFBW+PxohRgE9br/PasK97Oo+bbbcKQgQu9uxadc%3D",
        "BMAP_SECKEY": "XCSGTS0HVG9MJBd7qjmcuIneKymhvMCOXLg1JoYhcHTYNyZi_ZD1PkQ8wHX0_ycxbyF1QTuQWF68O-J3hMNYeSVrLdplIVuNxTyW1OaKt18bXNTDHrBSmsZ7DEMwNaY3o1qfZ-Gy932UGgUlRkYaQLMujMyT2eGMlHUKElpXgb3WIdgV2i4dGkFfMutvbhUKyxkjaWZMOhimPI5uGe2Zow",
        "Hm_lpvt_c31f95cd1f1c01c74f44d211939ceb8c": "1712671763"
    }
    for tynm, tyid in categories:
        list_url = 'https://nanjing.8684.cn/line{}'.format(tyid)
        response = requests.get(
            list_url, headers=headers, cookies=cookies, timeout=timeout
        ).content.decode()
        parse_all(response, tynm)

def parse_all(response, tynm):
    """Extract every route link from a category listing page and fetch each.

    Args:
        response: Decoded HTML text of a category listing page.
        tynm: Category name, forwarded unchanged to ``get_one``.
    """
    page = etree.HTML(response)
    # Select the href attributes directly: anchors that carry no href are
    # skipped instead of raising IndexError on an empty attribute list.
    hrefs = page.xpath('//div[@class="list clearfix"]/a/@href')
    for xlid in hrefs:
        get_one(xlid, tynm)

def get_one(xlid, tynm, timeout=10):
    """Download the detail page of a single route and parse it.

    Args:
        xlid: Route path as found in the listing page's href; it is appended
            verbatim to ``https://nanjing.8684.cn``.
        tynm: Category name, forwarded unchanged to ``parse_one``.
        timeout: Seconds to wait for the server before ``requests`` gives up.
            Without a timeout a stalled connection would block the whole
            crawl indefinitely.

    Raises:
        requests.exceptions.RequestException: on connection errors or when
            the timeout expires.
    """
    one_url = 'https://nanjing.8684.cn{}'.format(xlid)
    headers = {
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
        "Accept-Language": "zh-CN,zh;q=0.9",
        "Cache-Control": "no-cache",
        "Connection": "keep-alive",
        "Pragma": "no-cache",
        "Sec-Fetch-Dest": "document",
        "Sec-Fetch-Mode": "navigate",
        "Sec-Fetch-Site": "none",
        "Sec-Fetch-User": "?1",
        "Upgrade-Insecure-Requests": "1",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36",
        "sec-ch-ua-mobile": "?0"
    }
    # NOTE(review): these look like values captured from one browser session;
    # confirm whether the site actually requires them and whether they expire.
    cookies = {
        "JSESSIONID": "48304F9E8D55A9F2F8ACC14B7EC5A02D",
        "wbf__voiceplg-is": "false",
        "tongue": "1",
        "Hm_lvt_c31f95cd1f1c01c74f44d211939ceb8c": "1712659199",
        "__gads": "ID",
        "__gpi": "UID",
        "__eoi": "ID",
        "Hm_lpvt_c31f95cd1f1c01c74f44d211939ceb8c": "1712667896",
        "SECKEY_ABVK": "2DPSFBW+PxohRgE9br/PahPpT7wKZzGpOzUoWKrHE14%3D",
        "BMAP_SECKEY": "XCSGTS0HVG9MJBd7qjmcuNCVpgwunmx3HOykd-nz4D-iFhbshz31f4mcmp3_W2DuaWoxnWstpA8--nKAgM_oHpmeq9I4YTbb3qlNksDhm1p8aAgMLY_JmRsPghK_5Cz-OHHnXHh16-fsX6GY9TW5yRhSOnFDrBnVc4V5LysnCzkEjrJ4OArZaTA6rA9Gid8tLBOeKUHh-nAGPdfN_KgAnw"
    }
    response = requests.get(
        one_url, headers=headers, cookies=cookies, timeout=timeout
    ).content.decode()
    parse_one(response, tynm)

def parse_one(response, tynm):
    """Pull the fields of one route out of its detail page and save them.

    The eight extracted values are stored, in column order, as a list under
    the key ``tynm`` and handed to ``save_data`` together with the category
    name and the route name.

    Args:
        response: Decoded HTML text of a route detail page.
        tynm: Category name; used as the key of the saved record.
    """
    tree = etree.HTML(response)

    def joined(path, sep=''):
        # Concatenate every text node matched by the XPath expression.
        return sep.join(tree.xpath(path))

    # Route name
    route_name = joined('//h1[@class="title"]/span/text()')
    # Route type; the first and last characters are dropped
    # (presumably surrounding brackets - confirm against the page markup)
    route_type = joined('//h1[@class="title"]/a/text()')[1:-1]
    # Operating hours: keep only what follows the label
    hours = joined('//ul[@class="bus-desc"]/li[1]/text()').split('时间：')[-1]
    # Reference fare: keep only what follows the label
    fare = joined('//ul[@class="bus-desc"]/li[2]/text()').split('票价：')[-1]
    # Bus company
    company = joined('//ul[@class="bus-desc"]/li[3]/a/text()')
    # Last update: keep only what follows the label
    updated = joined('//ul[@class="bus-desc"]/li[4]/span/text()').split('更新：')[-1]
    # Stops in the forward direction, separated by '/'
    stops_forward = joined(
        '//div[@class="service-area"]/div[@class="bus-lzlist mb15"][1]/ol/li/a/text()',
        '/',
    )
    # Stops in the reverse direction, separated by '/'
    stops_reverse = joined(
        '//div[@class="service-area"]/div[@class="bus-lzlist mb15"][2]/ol/li/a/text()',
        '/',
    )

    record = [
        route_name, route_type, hours, fare,
        company, updated, stops_forward, stops_reverse,
    ]
    save_data({tynm: record}, tynm, route_name)

def save_data(data, tynm, xlmc):
    """Append route records to the per-category sheets of the .xls workbook.

    The workbook is created on first use. A sheet named ``tynm`` is added
    (with a header row) when it does not exist yet. Each list in ``data`` is
    then written as one new row at the end of the sheet whose name equals its
    key; keys without a matching sheet are ignored. The file is read once and
    written once per call.

    Args:
        data: Mapping of sheet name to the list of cell values for one row.
        tynm: Category name; also the name of the sheet that must exist.
        xlmc: Route name, used only in the progress message.
    """
    path = r'./公交线路数据.xls'
    header = ('线路名称', '线路类型', '运行时间', '参考票价', '公交公司', '最后更新', '公交路线-正向', '公交路线-反向')

    if os.path.exists(path):
        # formatting_info=True makes xlrd read column widths so that
        # xlutils.copy carries them over; without it the widths set on the
        # header are dropped the next time the workbook is copied.
        source = xlrd.open_workbook(path, formatting_info=True)
        sheet_index = {name: idx for idx, name in enumerate(source.sheet_names())}
        next_row = {sheet.name: sheet.nrows for sheet in source.sheets()}
        book = copy(source)
    else:
        sheet_index = {}
        next_row = {}
        book = xlwt.Workbook(encoding='utf-8')

    if tynm not in sheet_index:
        sheet = book.add_sheet(tynm, cell_overwrite_ok=True)
        for col, title in enumerate(header):
            sheet.col(col).width = 2560 * 3
            sheet.write(0, col, title)
        # A newly added sheet is appended after the existing ones.
        sheet_index[tynm] = len(sheet_index)
        next_row[tynm] = 1

    for name, row in data.items():
        if name not in sheet_index:
            continue
        sheet = book.get_sheet(sheet_index[name])
        for col, value in enumerate(row):
            sheet.write(next_row[name], col, value)
        next_row[name] += 1

    book.save(path)
    print(r'***ok: 公交线路数据: {} - {}'.format(tynm, xlmc))


if __name__ == '__main__':
    # Script entry point: crawl every configured category.
    get_all()