python3通過selenium爬蟲獲取到jd商品的實例代碼

更新時間：2019年04月25日 09:08:05 作者：朱春雨

這篇文章主要介紹了python3通過selenium爬蟲獲取到jd（京東）商品的實例代碼，需要的朋友可以參考下

先給大家介紹下python3 selenium使用

其實這個就相當於模擬人的點擊事件來連續地操作瀏覽器。如果你玩過王者榮耀的話，在2016年一月份的版本裡面就有一個bug。

安卓手機下載一個按鍵精靈就可以在冒險模式里面設置按鍵，讓手機自動玩闖關，一局19個金幣，一晚上就一個英雄了。不過

程序員也不是吃素的。給一個星期設置了大概4000金幣上限。有興趣的可以去試試。（注：手機需要root）

進入正題：

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait

在寫之前需要先安裝selenium模塊（例如：pip install selenium），並準備好與Chrome版本對應的chromedriver

# Demo: search Baidu for "python" and print the resulting URL, cookies and HTML.
brguge = webdriver.Chrome()  # start a Chrome session
try:
  brguge.get('https://www.baidu.com')  # open the Baidu home page
  # find_element(By.ID, ...) replaces find_element_by_id, which Selenium 4 removed.
  search_input = brguge.find_element(By.ID, 'kw')  # the search box

  search_input.send_keys('python')  # type the keyword
  search_input.send_keys(Keys.ENTER)  # submit with Enter
  wait = WebDriverWait(brguge, 10)  # wait at most 10 seconds
  # The expected condition takes ONE locator tuple, not two positional arguments.
  wait.until(EC.presence_of_element_located((By.ID, 'content_left')))
  print(brguge.current_url)  # URL of the result page
  # get_cookie() requires a cookie name; get_cookies() returns all of them.
  print(brguge.get_cookies())
  print(brguge.page_source)  # HTML source of the result page
finally:
  # quit() ends the whole session; close() would only close the current window.
  brguge.quit()

下面是一些selenium模塊的基本用法

查找元素

單個元素

(from selenium import webdriver)


    brguge.find_element_by_id('q')用這個元素找id是q的元素
    brguge.find_element_by_css_selector('#q')用css選擇器找id是q的元素
    brguge.find_element_by_xpath('//*[ @id="q"]')三個效果一樣
    brguge.find_element_by_name()通過name來查找
    brguge.find_element_by_link_text()通過link來查找
    brguge.find_element_by_partial_link_text()
    brguge.find_element_by_tag_name()
    brguge.find_element_by_class_name()通過class查找
    
    from selenium import webdriver
    from selenium.webdriver.common.by import By
    brguge.find_element(By.ID,'Q')通用查找方式

    多個元素（find_elements）加了個s
        他會以列表的形式打印出來
        brguge.find_elements_by_css_selector('.service-bd li')css樣式為li的元素
        brguge.find_elements(By.CSS_SELECTOR,'.service-bd li')兩個作用一樣
        (利用索引就可以獲取單個或多個元素了)
    元素交互操作（獲取元素然後再給他指令）
        選擇輸入框 --》send_keys('輸入文字')--》clear()清空輸入框--在輸入別的--》找到搜索--》click(點擊)
        input.clear()清空按鈕
    交互動作（將動作附加到動作鏈中串行執(zhí)行）
        switch_to_frame('iframeResult')
        用css樣式分別找到兩個要交互
        調用ActionChains(調用谷歌的)
        drag_and_drop(source,target)第一個到第二個上面
        perform()

下面看下python3通過selenium爬蟲獲取到jd商品的實例代碼。

具體代碼如下所示：

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.options import Options
from selenium.common.exceptions import NoSuchElementException
from lxml import etree
import time, json
# URL the crawler loads right after the browser starts (the root of www.jd.com).
JD_URL_Login = "https://www.jd.com/"
class CustomizeException(Exception):
  '''Crawler-specific error that carries a numeric status code and a message.

  Attributes:
    status: integer code identifying the condition.
    msg: human-readable description of the condition.
  '''

  def __init__(self, status, msg):
    # Both values are stored as plain attributes for the handler to inspect.
    self.status, self.msg = status, msg
class JD:
  '''Headless-Chrome crawler that stores JD search results as JSON lines.

  For every keyword the crawler types it into the search box, walks the
  result pages one by one and appends one JSON object per product to
  ``jd-<keyword>.json`` in the current working directory.
  '''

  def __init__(self):
    self.browser = None
    # Defined up front so close()/__del__ are safe even if run() never ran.
    self.file = None
    self.wait = None
    self.__init_browser()

  def __init_browser(self):
    '''Start headless Chrome, open the JD page and create the shared wait.'''
    options = Options()
    options.add_argument("--headless")
    options.add_experimental_option('excludeSwitches', ['enable-automation'])
    # No-image mode
    options.add_experimental_option("prefs", {"profile.managed_default_content_settings.images": 2})
    self.browser = webdriver.Chrome(options=options)
    # Maximize the browser window
    self.browser.maximize_window()
    # Implicit wait of 3 seconds for element lookups
    self.browser.implicitly_wait(3)
    self.browser.get(JD_URL_Login)
    # Explicit waits time out after 10 seconds
    self.wait = WebDriverWait(self.browser, 10)

  def __close_file(self):
    '''Close the current output file, if one is open.'''
    handle = getattr(self, "file", None)
    if handle is not None:
      self.file = None
      handle.close()

  def __search_goods(self, goods):
    '''Open the output file for *goods* and submit it in the search box.'''
    # Close the previous keyword's file first; the original leaked one
    # open handle per keyword.
    self.__close_file()
    self.file = open("jd-{}.json".format(goods), "a", encoding="utf-8")
    self.wait.until(EC.presence_of_all_elements_located((By.ID, "key")))
    # find_element(By.ID, ...) replaces find_element_by_id (removed in Selenium 4).
    search_input = self.browser.find_element(By.ID, "key")
    search_input.clear()
    search_input.send_keys(goods, Keys.ENTER)

  def __get_goods_info(self, page_source):
    '''Yield one dict per product parsed from the result page HTML.

    Each dict has the keys ``goods_name``, ``goods_price``, ``comment_num``
    and ``shop_name``.
    '''
    selector_html = etree.HTML(page_source)
    # The XPath expressions must start with "//"; the "http://" prefix found
    # in the original text is not valid XPath.
    # Product name, taken from the link's title attribute.
    goods_name = selector_html.xpath("//div[@class='gl-i-wrap']//div[contains(@class,'p-name')]/a/@title")
    # Product price
    goods_price = selector_html.xpath("//div[@class='gl-i-wrap']//div[@class='p-price']/strong/i/text()")
    # Number of product reviews (full text content of each <strong>)
    comment_nodes = selector_html.xpath("//div[@class='p-commit']/strong")
    comment_num = [node.xpath("string(.)") for node in comment_nodes]
    # Shop name
    shop_name = selector_html.xpath("//a[@class='curr-shop']/text()")
    # NOTE(review): the four lists are paired purely by position, so a product
    # missing one of the fields would shift every later row - confirm against
    # the live page markup.
    for name, price, comments, shop in zip(goods_name, goods_price, comment_num, shop_name):
      yield {
        "goods_name": name,
        "goods_price": price,
        "comment_num": comments,
        "shop_name": shop,
      }

  def __swipe_page(self):
    '''Scroll down until the page height stops growing; return the HTML.'''
    height = self.browser.execute_script("return document.body.scrollHeight;")
    self.browser.execute_script("window.scrollTo(0, {});".format(height))
    while True:
      # Wait a second before measuring the page height again.
      time.sleep(1)
      now_height = self.browser.execute_script("return document.body.scrollHeight;")
      if height == now_height:
        return self.browser.page_source
      # Scroll vertically only; the original passed the old height as the
      # x coordinate here.
      self.browser.execute_script("window.scrollTo(0, {});".format(now_height))
      height = now_height

  def __is_element_exists(self, xpath):
    '''Return True when *xpath* matches an element on the current page.'''
    try:
      self.browser.find_element(By.XPATH, xpath)
    except NoSuchElementException:
      return False
    return True

  def __click_next_page(self):
    '''Click the "next page" link.

    Raises:
      CustomizeException: status 10000 when no link whose class is exactly
        ``pn-next`` exists.
    The timeout raised by the explicit wait is not caught and propagates.
    '''
    self.wait.until(EC.presence_of_all_elements_located((By.CLASS_NAME, "pn-next")))
    xpath = "//a[@class='pn-next']"
    if not self.__is_element_exists(xpath):
      raise CustomizeException(10000, "該商品訪問完畢")
    self.browser.find_element(By.XPATH, xpath).click()

  def __write_to_json(self, dic: dict):
    '''Append *dic* to the current output file as one JSON line.'''
    data_json = json.dumps(dic, ensure_ascii=False)
    self.file.write(data_json + "\n")

  def __crawl_goods(self, goods):
    '''Search for *goods* and save every result page until the last one.'''
    self.__search_goods(goods)
    page = 1
    while True:
      print("正在爬取商品 <{}>---第{}頁......".format(goods, page))
      time.sleep(3)
      html = self.__swipe_page()
      for dic in self.__get_goods_info(html):
        self.__write_to_json(dic)
      try:
        self.__click_next_page()
      except CustomizeException:
        # No further page for this keyword.
        return
      page += 1

  def run(self, goods, pending=None):
    '''Crawl *goods*, then every keyword remaining in *pending*.

    Args:
      goods: first search keyword.
      pending: list of further keywords, consumed from the front with
        ``pop(0)``. When omitted, the module-level ``goods_list`` is used
        if it exists, which keeps existing ``run(goods)`` callers working.

    The keywords are processed in a loop. The original recursed once per
    keyword and re-crawled the final page while the recursion unwound.
    '''
    if pending is None:
      pending = globals().get("goods_list", [])
    while True:
      self.__crawl_goods(goods)
      if not pending:
        return
      goods = pending.pop(0)

  def close(self):
    '''Close the output file and end the browser session; safe to repeat.'''
    self.__close_file()
    browser = getattr(self, "browser", None)
    if browser is not None:
      self.browser = None
      # quit() ends the whole session; close() only closes the current window.
      browser.quit()

  def __del__(self):
    self.close()
if __name__ == '__main__':
  jd = JD()
  # Search keywords, crawled in order. JD.run() reads this module-level list
  # and removes entries from its front.
  # Two keywords carried text-conversion residue ("聯(lián)想筆記本",
  # "洗發(fā)露") that would have been typed into the search box and used in
  # the output file names as-is.
  goods_list = ["純牛奶", "酸奶", "奶茶", "床上用品", "電磁爐", "電視", "小米筆記本", "華碩筆記本", "聯想筆記本", "男士洗面奶", "女士洗面奶", "沐浴露", "洗發露",
         "牙刷", "牙膏", "拖鞋", "剃須刀", "水手服", "運動服", "紅龍果", "蘋果", "香蕉", "洗衣液", "電飯煲"]
  try:
    goods = goods_list.pop(0)
  except IndexError:
    raise CustomizeException(20000, "goods_list不能為空")
  try:
    jd.run(goods)
  finally:
    # Drop the reference so the crawler's __del__ cleanup can run.
    del jd

總結(jié)

以上所述是小編給大家介紹的python3通過selenium爬蟲獲取到jd商品的實例代碼，希望對大家有所幫助，如果大家有任何疑問請給我留言，小編會及時回覆大家的。在此也非常感謝大家對腳本之家網站的支持！
如果你覺得本文對你有幫助，歡迎轉(zhuǎn)載，煩請注明出處，謝謝！

您可能感興趣的文章: