csv 存储
import requests
import re
import json
from scrapy.exporters import CsvItemExporter
from concurrent.futures import ThreadPoolExecutor, as_completed, ProcessPoolExecutor


class TBSpider(object):
    """Taobao search-result spider.

    Fetches search pages for a keyword concurrently, extracts product rows
    from the embedded ``g_page_config`` JSON blob, and appends them to a
    GBK-encoded CSV file via scrapy's CsvItemExporter.
    """

    # Shared thread pool for the I/O-bound page fetches.
    executor = ThreadPoolExecutor(max_workers=8)

    def __init__(self):
        # Search keyword ("perfume"); also used as the output file name.
        self.user_input = '香水'
        # Request headers. NOTE(review): Taobao search requires a valid
        # Cookie / User-Agent here, otherwise it serves a login page —
        # fill these in before running.
        self.headers = {}
        # Running row counter. Fix: was a module-level `global num` defined
        # only under __main__; an instance attribute removes that coupling.
        self.num = 1
        # CSV data written with GBK encoding; the original used a .xls
        # extension (presumably so Excel opens it cleanly) — kept as-is.
        self.file = open(f'{self.user_input}.xls', 'wb')
        self.exporter = CsvItemExporter(file=self.file, include_headers_line=False, encoding='gbk')
        self.exporter.start_exporting()

    def parse_start_url(self):
        """Submit one fetch-and-parse task per result page and wait for all.

        Fixes two bugs in the original:
        * it submitted ``parse_start_url`` itself back onto the executor
          (unbounded recursive task spawning, with arguments the method
          does not accept);
        * it called ``self.parse(response)`` only when a task had raised
          an exception — the success/failure test was inverted.
        """
        all_tasks = []
        for i in range(1, 100):
            # `s` is the pagination offset: 44 items per result page.
            start_url = f'https://s.taobao.com/search?spm=a21bo.jianhua.201867-main.5.1c1611d9bwOdR9&q={self.user_input}&imgfile=&commend=all&ssid=s5-e&search_type=item&sourceId=tb.index&ie=utf8&initiative_id=tbindexz_20170306&bcoffset=-8&ntoffset=-8&p4ppushleft=2%2C48&s={i*44}'
            all_tasks.append(self.executor.submit(self._fetch_and_parse, start_url))
        for task in as_completed(all_tasks):
            exc = task.exception()
            if exc is not None:
                # Log and continue: one failed page should not abort the crawl.
                print(exc)

    def _fetch_and_parse(self, start_url):
        """Download one search page and hand its decoded HTML to parse()."""
        response = requests.get(url=start_url, headers=self.headers).content.decode('utf-8')
        self.parse(response)

    def parse(self, response):
        """Extract product fields from the page's g_page_config JSON and export each row."""
        matches = re.findall('g_page_config = (.*?)g_srp_loadCss', response, re.S)
        if not matches:
            # No data blob on the page (e.g. an anti-bot / login page).
            # Fix: the original indexed [0] unconditionally (IndexError)
            # and only checked truthiness afterwards.
            return
        # Trim the trailing script text between the JSON and the next statement.
        data = matches[0][0:-6]
        json_dict = json.loads(data)
        li = json_dict['mods']['itemlist']['data']['auctions']
        if not li:
            return
        for item in li:
            # Shop links in the payload are protocol-relative; normalize them.
            shop_link = item['shopLink']
            if 'https:' not in shop_link:
                shop_link = 'https:' + shop_link
            detail_data = {
                'num': self.num,                         # row counter
                'raw_title': item['raw_title'],          # product title
                'view_price': item['view_price'],        # price
                'item_loc': item['item_loc'],            # ship-from location
                'comment_count': item['comment_count'],  # number of buyers/comments
                'nick': item['nick'],                    # shop name
                'shopLink': shop_link,                   # shop URL
            }
            self.num += 1
            print(detail_data)
            self.exporter.export_item(detail_data)

    def __del__(self):
        # Best-effort cleanup: __del__ may run during interpreter shutdown
        # where attributes/modules can already be gone, so never raise here.
        try:
            self.exporter.finish_exporting()
            self.file.close()
        except Exception:
            pass


if __name__ == '__main__':
    s = TBSpider()
    s.parse_start_url()
原文地址:http://www.cnblogs.com/modly/p/16907576.html
1. 本站所有资源来源于用户上传和网络,如有侵权请邮件联系站长!
2. 分享目的仅供大家学习和交流,请勿用于商业用途!
3. 如果你也有好源码或者教程,可以到用户中心发布,分享有积分奖励和额外收入!
4. 本站提供的源码、模板、插件等等其他资源,都不包含技术服务请大家谅解!
5. 如有链接无法下载、失效或广告,请联系管理员处理!
6. 本站资源售价只是赞助,收取费用仅维持本站的日常运营所需!
7. 如遇到加密压缩包,默认解压密码为"gltf",如遇到无法解压的请联系管理员!
8. 因为资源和程序源码均为可复制品,所以不支持任何理由的退款兑现,请斟酌后支付下载
声明:如果标题没有注明"已测试"或者"测试可用"等字样的资源源码均未经过站长测试.特别注意没有标注的源码不保证任何可用性