Python爬虫案例七:抓取南京公交信息数据并将其保存成excel多表形式

news/2024/9/18 23:17:47 标签: python, 爬虫, excel

测试链接:

        https://nanjing.8684.cn/line4 

思路:先抓取某个类型下的某一条线路所有数据,然后实现批量,,列举出三个类型代表既可

源码:

from lxml import etree
from xlutils.copy import copy
import requests, os, xlrd, xlwt

def get_all():
    # 获取所有
    tynm_list = ['江南线路(1-399)', '江北线路(400-699)', '江宁线路(700-999)']
    tyid_list = [2, 3, 4]
    for tynm, tyid in zip(tynm_list, tyid_list):
        list_url = 'https://nanjing.8684.cn/line{}'.format(tyid)
        headers = {
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
            "Accept-Language": "zh-CN,zh;q=0.9",
            "Cache-Control": "no-cache",
            "Connection": "keep-alive",
            "Pragma": "no-cache",
            "Sec-Fetch-Dest": "document",
            "Sec-Fetch-Mode": "navigate",
            "Sec-Fetch-Site": "none",
            "Sec-Fetch-User": "?1",
            "Upgrade-Insecure-Requests": "1",
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36",
            "sec-ch-ua-mobile": "?0"
        }
        cookies = {
            "JSESSIONID": "48304F9E8D55A9F2F8ACC14B7EC5A02D",
            "wbf__voiceplg-is": "false",
            "tongue": "1",
            "Hm_lvt_c31f95cd1f1c01c74f44d211939ceb8c": "1712659199",
            "__gads": "ID",
            "__gpi": "UID",
            "__eoi": "ID",
            "SECKEY_ABVK": "2DPSFBW+PxohRgE9br/PasK97Oo+bbbcKQgQu9uxadc%3D",
            "BMAP_SECKEY": "XCSGTS0HVG9MJBd7qjmcuIneKymhvMCOXLg1JoYhcHTYNyZi_ZD1PkQ8wHX0_ycxbyF1QTuQWF68O-J3hMNYeSVrLdplIVuNxTyW1OaKt18bXNTDHrBSmsZ7DEMwNaY3o1qfZ-Gy932UGgUlRkYaQLMujMyT2eGMlHUKElpXgb3WIdgV2i4dGkFfMutvbhUKyxkjaWZMOhimPI5uGe2Zow",
            "Hm_lpvt_c31f95cd1f1c01c74f44d211939ceb8c": "1712671763"
        }
        response = requests.get(list_url, headers=headers, cookies=cookies).content.decode()
        # print(response)
        parse_all(response, tynm)

def parse_all(response, tynm):
    # 解析所有的线路ID
    A = etree.HTML(response)
    a_list = A.xpath('//div[@class="list clearfix"]/a')
    for a in a_list:
        xlid = a.xpath('./@href')[0]
        get_one(xlid, tynm)

def get_one(xlid, tynm):
    # 某一条线路
    one_url = 'https://nanjing.8684.cn{}'.format(xlid)
    headers = {
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
        "Accept-Language": "zh-CN,zh;q=0.9",
        "Cache-Control": "no-cache",
        "Connection": "keep-alive",
        "Pragma": "no-cache",
        "Sec-Fetch-Dest": "document",
        "Sec-Fetch-Mode": "navigate",
        "Sec-Fetch-Site": "none",
        "Sec-Fetch-User": "?1",
        "Upgrade-Insecure-Requests": "1",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36",
        "sec-ch-ua-mobile": "?0"
    }
    cookies = {
        "JSESSIONID": "48304F9E8D55A9F2F8ACC14B7EC5A02D",
        "wbf__voiceplg-is": "false",
        "tongue": "1",
        "Hm_lvt_c31f95cd1f1c01c74f44d211939ceb8c": "1712659199",
        "__gads": "ID",
        "__gpi": "UID",
        "__eoi": "ID",
        "Hm_lpvt_c31f95cd1f1c01c74f44d211939ceb8c": "1712667896",
        "SECKEY_ABVK": "2DPSFBW+PxohRgE9br/PahPpT7wKZzGpOzUoWKrHE14%3D",
        "BMAP_SECKEY": "XCSGTS0HVG9MJBd7qjmcuNCVpgwunmx3HOykd-nz4D-iFhbshz31f4mcmp3_W2DuaWoxnWstpA8--nKAgM_oHpmeq9I4YTbb3qlNksDhm1p8aAgMLY_JmRsPghK_5Cz-OHHnXHh16-fsX6GY9TW5yRhSOnFDrBnVc4V5LysnCzkEjrJ4OArZaTA6rA9Gid8tLBOeKUHh-nAGPdfN_KgAnw"
    }
    response = requests.get(one_url, headers=headers, cookies=cookies).content.decode()
    # print(response)
    parse_one(response, tynm)

def parse_one(response, tynm):
    # 解析某一条线路
    A = etree.HTML(response)
    # 线路名称
    xlmc = A.xpath('//h1[@class="title"]/span/text()')
    xlmc = ''.join(xlmc)
    # 线路类型
    xllx = A.xpath('//h1[@class="title"]/a/text()')
    xllx = ''.join(xllx)[1:-1]
    # 运行时间
    yxsj = A.xpath('//ul[@class="bus-desc"]/li[1]/text()')
    yxsj = ''.join(yxsj).split('时间:')[-1]
    # 参考票价
    ckpj = A.xpath('//ul[@class="bus-desc"]/li[2]/text()')
    ckpj = ''.join(ckpj).split('票价:')[-1]
    # 公交公司
    gjgs = A.xpath('//ul[@class="bus-desc"]/li[3]/a/text()')
    gjgs = ''.join(gjgs)
    # 最后更新
    zhgx = A.xpath('//ul[@class="bus-desc"]/li[4]/span/text()')
    zhgx = ''.join(zhgx).split('更新:')[-1]
    # 公交路线-正向
    gjxl_zx = A.xpath('//div[@class="service-area"]/div[@class="bus-lzlist mb15"][1]/ol/li/a/text()')
    gjxl_zx = '/'.join(gjxl_zx)
    # 公交路线-反向
    gjxl_fx = A.xpath('//div[@class="service-area"]/div[@class="bus-lzlist mb15"][2]/ol/li/a/text()')
    gjxl_fx = '/'.join(gjxl_fx)
    data = {
        tynm: [xlmc, xllx, yxsj, ckpj, gjgs, zhgx, gjxl_zx, gjxl_fx]
    }
    save_data(data, tynm, xlmc)

def save_data(data, tynm, xlmc):
    # 保存数据
    sheet_name = tynm
    if not os.path.exists(r'公交线路数据.xls'):
        wb = xlwt.Workbook(encoding='utf-8')
        sheet = wb.add_sheet(sheet_name, cell_overwrite_ok=True)
        header = ('线路名称', '线路类型', '运行时间', '参考票价', '公交公司', '最后更新', '公交路线-正向', '公交路线-反向')
        for i in range(0, len(header)):
            sheet.col(i).width = 2560 * 3
            sheet.write(0, i, header[i])
        wb.save(r'./公交线路数据.xls')
    wb = xlrd.open_workbook(r'公交线路数据.xls')
    sheets_list = wb.sheet_names()
    if sheet_name not in sheets_list:
        work = copy(wb)
        sh = work.add_sheet(sheet_name)
        header_new = ('线路名称', '线路类型', '运行时间', '参考票价', '公交公司', '最后更新', '公交路线-正向', '公交路线-反向')
        for index in range(0, len(header_new)):
            sh.col(index).width = 2560 * 3
            sh.write(0, index, header_new[index])
        work.save(r'./公交线路数据.xls')
    if os.path.exists(r'公交线路数据.xls'):
        wb = xlrd.open_workbook(r'公交线路数据.xls')
        sheets = wb.sheet_names()
        for i in range(len(sheets)):
            for name in data.keys():
                worksheet = wb.sheet_by_name(sheets[i])
                if worksheet.name == name:
                    rows_old = worksheet.nrows
                    new_workbook = copy(wb)
                    new_worksheet = new_workbook.get_sheet(i)
                    for num in range(0, len(data[name])):
                        new_worksheet.write(rows_old, num, data[name][num])
                    new_workbook.save(r'./公交线路数据.xls')
    print(r'***ok: 公交线路数据: {} - {}'.format(tynm, xlmc))


if __name__ == '__main__':
    get_all()y

运行效果:

1)running中:

2) ending:


http://www.niftyadmin.cn/n/5664684.html

相关文章

在Ubuntu中编译含有JSON的文件出现报错

在ubuntu中进行JSON相关学习的时候,我发现了一些小问题,决定与大家进行分享,减少踩坑时候出现不必要的时间耗费 截取部分含有JSON部分的代码进行展示 char *str "{ \"title\":\"JSON Example\", \"author\&…

QGis二次开发 —— 3、程序加载栅格tif与矢量shp文件可进行切换控制,可进行导出/导入工程(附源码)

效果 功能说明 软件可同时加载.tif栅格图片与.shp矢量图片、加载图片后可进行自由切换查看图层、可对加载的图片进行关闭 关闭后清空图层、可对加载的图片进行导出.qgs的QGIS工程、可对.qgs的QGis工程导入并导入后可进行自由切换查看图层。 源码 注意: 在加载tif栅格文件后会在…

基于SpringBoot+Vue的房屋租赁平台

作者:计算机学姐 开发技术:SpringBoot、SSM、Vue、MySQL、JSP、ElementUI、Python、小程序等,“文末源码”。 专栏推荐:前后端分离项目源码、SpringBoot项目源码、SSM项目源码 系统展示 【2025最新】基于JavaSpringBootVueMySQL的…

Mobile net V系列详解 理论+实战(2)

Mobilenet 系列 实践部分一、数据集介绍二、模型整体框架三、模型代码详解四、总结 实践部分 本章针对实践通过使用pytorch一个实例对这部分内容进行吸收分析。本章节采用的源代码在这里感兴趣的读者可以自行下载操作。 一、数据集介绍 可以看到数据集本身被存放在了三个文件…

Day02Day03

1. 为什么拦截器不会去拦截/admin/login上,是因为在SpringMvc中清除了这种可能。 2.使用自己定义注解,实现AOP(insert ,update) 3.使用update最好使用动态语句,可以使用多次 4.使用阿里云的OSS存储。用common类 5.在写…

动手学深度学习(pytorch)学习记录28-使用块的网络(VGG)[学习记录]

目录 VGG块VGG网络训练模型 VGG块 定义了一个名为vgg_block的函数来实现一个VGG块 import torch from torch import nn from d2l import torch as d2ldef vgg_block(num_convs, in_channels, out_channels):layers []for _ in range(num_convs):layers.append(nn.Conv2d(in_…

Go 1.19.4 路径和目录-Day 15

1. 路径介绍 存储设备保存着数据,但是得有一种方便的模式让用户可以定位资源位置,操作系统采用一种路径字符 串的表达方式,这是一棵倒置的层级目录树,从根开始。 相对路径:不是以根目录开始的路径,例如 a/b…

ant-design表格自动合并相同内容的单元格

表格自动合并相同内容的单元格 合并hooks import { TableColumnProps } from antdexport const useAutoMergeTableCell <T extends object>(dataSource: Array<T>,columns: Array<TableColumnProps> | Array<keyof T> ): Map<keyof T, Array<…