Scraping images from the xiaohua (campus beauty) site with Scrapy

小豬窩969 2019-01-03
The spider code:

import scrapy
from scrapy.selector import Selector
from scrapy.http import Request
from ..items import MysiteItem
# from scrapy.dupefilter import RFPDupeFilter


class XiaohuaSpider(scrapy.Spider):
    name = 'xiaohua1'
    allowed_domains = ["wx.dxs6.cn", "www.", "www.dxsabc.com"]
    start_urls = ['http://www./hua/']
    # page_set = set()  # manual dedup set, unused -- see the note after parse

    def parse(self, response):
        info_list = Selector(response=response).\
            xpath('//div[starts-with(@class,"item_list")]/div')
        for obj in info_list:
            name = obj.xpath(".//a/img/@alt").extract_first()
            img = obj.xpath(".//a/img/@src").extract_first()
            # response.urljoin completes a relative URL against the page URL:
            # a scraped src like update/18883004004.jpg is expanded to a full
            # http://... address, while an already-absolute URL is returned
            # unchanged (response.follow resolves relative paths the same way;
            # a standalone demo follows the spider listing below)
            img_request = response.urljoin(img)
            item = MysiteItem()  # build a fresh item for each image
            item["url_address"] = img_request
            item["name"] = name
            # hand the item to the pipelines for persistence
            yield item
            # yield response.follow(img, callback=self.parse)  # auto-completes relative paths



        # collect the pagination links so the whole site gets crawled
        page_num = Selector(response=response).xpath('//*[@id="page"]/div/a/@href').extract()
        for url in page_num:
            # manual dedup was tried here with page_set, but it is redundant:
            # the scheduler's dupefilter already drops requests it has seen
            # if url in self.page_set:
            #     pass  # print(u"url already seen")
            # else:
            #     self.page_set.add(url)
            # each Request goes back to the scrapy engine, which hands it to the scheduler
            yield Request(url=url, callback=self.parse)
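
For reference, the deduplication that page_set was reimplementing is built into Scrapy: the scheduler fingerprints every request and drops repeats through its dupefilter. A minimal sketch of the setting involved; note that in modern Scrapy the module is scrapy.dupefilters, so the scrapy.dupefilter path in the commented import above reflects an older release's layout:

# settings.py (sketch) -- this is already the default, spelled out explicitly:
DUPEFILTER_CLASS = 'scrapy.dupefilters.RFPDupeFilter'

A single request can still opt out of the filter by passing dont_filter=True, i.e. Request(url=url, callback=self.parse, dont_filter=True), for pages that must be re-fetched.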

The post also keeps an earlier, commented-out draft of a detail-page parser (it references self.img_url, which is never defined, so it would not run as written):

    # def parse_datile(self, response):
    #     print("request------>", response.url)
    #     info_list = Selector(response=response).\
    #         xpath('//div[starts-with(@class,"item_list")]/div')
    #     for obj in info_list:
    #         name = obj.xpath(".//a/img/@alt").extract_first()
    #         img = obj.xpath(".//a/img/@src").extract_first()
    #     for url in self.img_url:
    #         # img_request = response.follow(url, callback=self.parse)
    #         img_request = response.urljoin(url)
    #         yield Request(url=img_request, callback=self.parse)
    #         item = MysiteItem()
    #         item["url_address"] = img_request
    #         yield item
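
The urljoin completion described in the parse comments is ordinary relative-URL resolution: Scrapy's response.urljoin is essentially urllib.parse.urljoin with the response's own URL as the base. A standalone sketch, using example.com as a placeholder since the real domain is truncated in the post:

from urllib.parse import urljoin

base = "http://www.example.com/hua/"  # placeholder for the page URL
print(urljoin(base, "update/18883004004.jpg"))        # http://www.example.com/hua/update/18883004004.jpg
print(urljoin(base, "/update/18883004004.jpg"))       # http://www.example.com/update/18883004004.jpg
print(urljoin(base, "http://img.example.com/a.jpg"))  # absolute URLs pass through unchanged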
The item definition (items.py) gives the scraped data a fixed structure:

import scrapy


class MysiteItem(scrapy.Item):
    # define the fields for your item here, like:
    name = scrapy.Field()
    url_address = scrapy.Field()
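
MysiteItem behaves like a dict restricted to its declared fields, which is what lets the spider write item["name"] and the pipeline read it back; an undeclared field is rejected. A quick illustration:

item = MysiteItem()
item["name"] = "demo"
item["url_address"] = "http://www.example.com/a.jpg"  # placeholder value
print(dict(item))    # {'name': 'demo', 'url_address': 'http://www.example.com/a.jpg'}
item["title"] = "x"  # KeyError: MysiteItem does not support field: title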
The pipeline (pipelines.py) handles persistence. Several pipeline classes can be registered at once; each gets a priority number in settings, and the smaller the number, the earlier it runs:

# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc./en/latest/topics/item-pipeline.html

import os
import requests


class MyPipeline(object):
    def __init__(self, picture):
        # self.picture = os.path.join(os.path.dirname(os.path.abspath(__file__)), "img_picture")
        self.picture = picture

    # handles each scraped item: download the image and write it to disk
    def process_item(self, item, spider):
        response = requests.get(item["url_address"])
        picture_img = os.path.join(self.picture, item["name"] + ".jpg")
        with open(picture_img, "wb") as f_write:
            f_write.write(response.content)
        return item  # a pipeline must return the item (or raise DropItem)

    # runs once before the crawl starts
    def open_spider(self, spider):
        print("crawl starting ...")
        if not os.path.exists(self.picture):
            os.mkdir(self.picture)

    # runs once after the crawl ends
    def close_spider(self, spider):
        print("crawl finished")

    @classmethod
    def from_crawler(cls, crawler):
        # read the custom IMG_PICTURE value from the settings file
        picture = crawler.settings.get("IMG_PICTURE")
        return cls(picture)  # cls is the class itself, so this instantiates MyPipeline
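
The post never shows the settings side of this wiring, so here is a minimal sketch; the module path mysite.pipelines is an assumption inferred from the `from ..items import MysiteItem` import, not something the post states:

# settings.py (sketch; 'mysite.pipelines.MyPipeline' is an assumed module path)
import os

ITEM_PIPELINES = {
    'mysite.pipelines.MyPipeline': 300,  # lower number = runs earlier in the chain
}

# custom key read back by MyPipeline.from_crawler via crawler.settings.get("IMG_PICTURE")
IMG_PICTURE = os.path.join(os.path.dirname(os.path.abspath(__file__)), "img_picture")

With that in place the crawl runs as scrapy crawl xiaohua1. One design note: requests.get inside process_item blocks Scrapy's event loop for the whole download, so for heavier jobs the bundled ImagesPipeline, which downloads through the engine itself, is the usual choice.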
