CSV 存储:抓取淘宝搜索结果并通过 CsvItemExporter 导出为 CSV 文件

import requests
import re
import json
from scrapy.exporters import CsvItemExporter
from concurrent.futures import ThreadPoolExecutor, as_completed, ProcessPoolExecutor


class TBSpider(object):
    """Fetch Taobao search-result pages for a keyword and export item rows to CSV.

    Pages are downloaded concurrently on a shared thread pool; each page's
    embedded ``g_page_config`` JSON payload is parsed and every auction item
    is written through a Scrapy ``CsvItemExporter``.
    """

    # Shared pool for the I/O-bound page downloads.
    executor = ThreadPoolExecutor(max_workers=8)

    def __init__(self):
        # Keyword to search for.
        self.user_input = '香水'

        # Request headers. NOTE(review): left empty in the original —
        # Taobao search requires valid cookies / User-Agent to return data.
        self.headers = {

        }

        # Running row number stamped into each exported record
        # (replaces the fragile module-level `global num` counter).
        self.num = 1

        # Output file; CsvItemExporter writes CSV bytes (GBK-encoded so
        # Excel on zh-CN systems opens it correctly) despite the .xls name.
        self.file = open(f'{self.user_input}.xls', 'wb')
        self.exporter = CsvItemExporter(file=self.file, include_headers_line=False, encoding='gbk')
        self.exporter.start_exporting()

    def parse_start_url(self):
        """Submit one download-and-parse task per result page and wait for all.

        Bug fixed: the original submitted ``parse_start_url`` itself with two
        extra arguments (a guaranteed TypeError inside every future) and only
        called ``parse`` when a future *had* an exception, re-scanning the
        growing task list on every page. Now each page gets its own worker
        and the futures are awaited once.
        """
        all_tasks = [self.executor.submit(self._fetch_page, page)
                     for page in range(1, 100)]
        for task in as_completed(all_tasks):
            exc = task.exception()
            if exc:
                # Best-effort: report the failed page and keep going.
                print(exc)

    def _fetch_page(self, page):
        """Download one search-result page (44 items per page) and parse it."""
        start_url = (
            'https://s.taobao.com/search?spm=a21bo.jianhua.201867-main.5.1c1611d9bwOdR9'
            f'&q={self.user_input}&imgfile=&commend=all&ssid=s5-e&search_type=item'
            '&sourceId=tb.index&ie=utf8&initiative_id=tbindexz_20170306'
            f'&bcoffset=-8&ntoffset=-8&p4ppushleft=2%2C48&s={page * 44}'
        )
        response = requests.get(url=start_url, headers=self.headers).content.decode('utf-8')
        self.parse(response)

    def parse(self, response):
        """Extract auction items from the page's embedded JSON and export them.

        :param response: decoded HTML text of one search-result page.
        """
        # The item data lives in a JS assignment between `g_page_config =`
        # and the `g_srp_loadCss` call. Guard against pages without it —
        # the original indexed `re.findall(...)[0]` and crashed on a miss.
        match = re.search('g_page_config = (.*?)g_srp_loadCss', response, re.S)
        if not match:
            return
        # Drop the trailing `;` plus whitespace (6 chars) after the JSON blob.
        data = match.group(1)[0:-6]
        json_dict = json.loads(data)

        li = json_dict['mods']['itemlist']['data']['auctions']
        if li:
            for item in li:
                # Shop links come protocol-relative (`//...`); normalize.
                shop_link = item['shopLink']
                if 'https:' not in shop_link:
                    shop_link = 'https:' + shop_link

                detail_data = {
                    'num': self.num,                        # running row number
                    'raw_title': item['raw_title'],         # item title
                    'view_price': item['view_price'],       # price
                    'item_loc': item['item_loc'],           # ships-from location
                    'comment_count': item['comment_count'], # payer/comment count
                    'nick': item['nick'],                   # shop name
                    'shopLink': shop_link,                  # shop URL
                }
                self.num += 1
                print(detail_data)

                self.exporter.export_item(detail_data)

    def close(self):
        """Finish the export and close the output file; safe to call twice."""
        if self.file and not self.file.closed:
            self.exporter.finish_exporting()
            self.file.close()

    def __del__(self):
        # __del__ is not guaranteed to run at interpreter shutdown;
        # prefer calling close() explicitly. Swallow errors here because
        # raising from a finalizer is never useful.
        try:
            self.close()
        except Exception:
            pass


if __name__ == '__main__':
    # Module-level row counter, consumed via `global num` in TBSpider.parse.
    num = 1
    # Build the spider and crawl all search-result pages for the keyword.
    s = TBSpider()
    s.parse_start_url()

 

原文地址:http://www.cnblogs.com/modly/p/16907576.html

1. 本站所有资源来源于用户上传和网络,如有侵权请邮件联系站长! 2. 分享目的仅供大家学习和交流,请勿用于商业用途! 3. 如果你也有好源码或者教程,可以到用户中心发布,分享有积分奖励和额外收入! 4. 本站提供的源码、模板、插件等等其他资源,都不包含技术服务请大家谅解! 5. 如有链接无法下载、失效或广告,请联系管理员处理! 6. 本站资源售价只是赞助,收取费用仅维持本站的日常运营所需! 7. 如遇到加密压缩包,默认解压密码为"gltf",如遇到无法解压的请联系管理员! 8. 因为资源和程序源码均为可复制品,所以不支持任何理由的退款兑现,请斟酌后支付下载 声明:如果标题没有注明"已测试"或者"测试可用"等字样的资源源码均未经过站长测试.特别注意没有标注的源码不保证任何可用性