快捷導航

python按綜合、銷量排序抓取100頁的淘寶商品列表信息

更新時間：2018年02月24日 14:15:15 作者：sisteryaya

這篇文章主要為大家詳細介紹了python按綜合、銷量排序抓取100頁的淘寶商品列表信息，具有一定的參考價值，感興趣的小伙伴們可以參考一下

進入淘寶網，分別按綜合、銷量排序抓取100頁的所有商品的列表信息。

1、按綜合

import re 
from selenium import webdriver 
from selenium.common.exceptions import TimeoutException 
from selenium.webdriver.common.by import By 
from selenium.webdriver.support.ui import WebDriverWait 
from selenium.webdriver.support import expected_conditions as EC 
from pyquery import PyQuery as pq #獲取整個網(wǎng)頁的源代碼 
 
from config import * #可引用congif的所有變量 
import pymongo 
import pymysql 
 
# MongoDB storage is disabled in this script; rows are written to MySQL instead.
# client=pymongo.MongoClient(MONGO_URL)
# db = client[MONGO_DB]

# Scrape 100 result pages sorted by the default ("comprehensive") ranking.


# Flow: open Taobao, type the keyword (e.g. '美食') into the search box and search.
# Paging: read the total page count first, then type each page number into the
# pager's "jump to page" box and confirm.
#

# Alternative drivers, left disabled:
# browser = webdriver.PhantomJS(service_args=SERVICE_ARGS)
# browser =webdriver.Chrome()
browser = webdriver.Firefox()
# Shared explicit wait used by every wait.until(...) call below (timeout argument: 10).
wait = WebDriverWait(browser,10)
 
def search(max_retries=3):
    """Open the Taobao home page, search for ``KEYWORD`` and return the pager text.

    The previous implementation retried by calling itself on every
    ``TimeoutException``, which never terminated when the page layout changed
    or the site was unreachable. Retries are now bounded.

    Args:
        max_retries: Number of additional attempts made after a timeout
            before the error is propagated.

    Returns:
        The text of the pager's ``div.total`` element, i.e. the string that
        contains the total number of result pages.

    Raises:
        TimeoutException: If the final attempt also timed out.
    """
    attempts = max(1, max_retries + 1)
    for attempt in range(1, attempts + 1):
        print('正在搜索...')
        try:
            # The home page is loaded on purpose: per the original author, the
            # keyword could not be typed when starting from https://s.taobao.com.
            browser.get('https://www.taobao.com')
            # '#q' is the CSS selector of the search box on the home page.
            search_box = wait.until(
                EC.presence_of_element_located((By.CSS_SELECTOR, '#q'))
            )
            submit = wait.until(
                EC.element_to_be_clickable(
                    (By.CSS_SELECTOR, '#J_TSearchForm > div.search-button > button')
                )
            )
            search_box.send_keys(KEYWORD)  # type the keyword
            submit.click()                 # submit the search form
            # Element in the pager that holds the total page count.
            total = wait.until(
                EC.presence_of_element_located(
                    (By.CSS_SELECTOR, '#mainsrp-pager > div > div > div > div.total')
                )
            )
            return total.text
        except TimeoutException:
            if attempt == attempts:
                raise
 
# 翻頁 
def next_page(page_number, max_retries=3):
    """Jump to result page ``page_number`` and scrape it.

    Types the page number into the pager's jump box, clicks the confirm
    button, waits until the highlighted page number matches, then calls
    ``get_products()``. A ``TimeoutException`` raised anywhere in that sequence
    triggers a retry; retries are bounded instead of recursing forever.

    Args:
        page_number: 1-based number of the page to open.
        max_retries: Number of additional attempts made after a timeout
            before the error is propagated.

    Raises:
        TimeoutException: If the final attempt also timed out.
    """
    attempts = max(1, max_retries + 1)
    for attempt in range(1, attempts + 1):
        print('正在翻頁',page_number)
        try:
            # Page-number input box inside the pager.
            page_box = wait.until(
                EC.presence_of_element_located(
                    (By.CSS_SELECTOR, '#mainsrp-pager > div > div > div > div.form > input')
                )
            )
            # Confirm button next to the input box.
            submit = wait.until(
                EC.element_to_be_clickable(
                    (By.CSS_SELECTOR, '#mainsrp-pager > div > div > div > div.form > span.btn.J_Submit')
                )
            )
            page_box.clear()
            page_box.send_keys(page_number)  # type the target page number
            submit.click()
            # The jump succeeded once the highlighted pager item shows the number.
            wait.until(
                EC.text_to_be_present_in_element(
                    (By.CSS_SELECTOR, '#mainsrp-pager > div > div > div > ul > li.item.active > span'),
                    str(page_number),
                )
            )
            get_products()
            return
        except TimeoutException:
            if attempt == attempts:
                raise
 
# 解析，獲取每頁的商品并輸出 
def get_products():
    """Parse the result page currently shown in the browser and store each product.

    Waits until the product cards are present, parses ``browser.page_source``
    with PyQuery, builds one dict per card and hands it to ``save_to_mysql``.
    """
    card_selector = '#mainsrp-itemlist .items .item'
    # Block until the product cards are present in the DOM.
    wait.until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, card_selector)))
    page = pq(browser.page_source)
    for card in page(card_selector).items():
        # Anchor inside the shop cell; it carries both id attributes.
        shop_anchor = card.find('.shop').find('a')
        price_text = card.find('.price').text()
        deal_text = card.find('.deal-cnt').text()
        title_text = card.find('.title').text()
        product = dict(
            # 'data-src' is read rather than 'src'.
            image=card.find('.pic .img').attr('data-src'),
            shop_id=shop_anchor.attr('data-userid'),   # shop id
            data_id=shop_anchor.attr('data-nid'),      # product id
            link=card.find('.pic-box-inner').find('.pic').find('a').attr('href'),
            price=price_text[1:-3],                    # drop first char and last three
            deal=deal_text[:-3],                       # drop last three chars
            title=title_text.replace(' ', ''),
            shop=card.find('.shop').text(),
            location=card.find('.location').text(),
        )
        save_to_mysql(product)
# The string literal below is an earlier draft of main(), disabled by wrapping it
# in triple quotes. It is evaluated as a bare expression and has no effect at runtime.
# NOTE(review): it contains '\d' in a non-raw string, which newer Python versions
# report as an invalid escape sequence warning at compile time.
''''' 
def main(): 
 try: 
  # search() 
  total=search() # 此時 total = ‘共 100 頁，' 
  total=int(re.compile('(\d+)').search(total).group(1)) # 用正則表達式提取數(shù)字100 
  # print(total) 
  for i in range(2,total+1): 
   next_page(i) 
 except Exception: 
  print('出錯啦') 
 finally: # 不管有沒有異常，都要執(zhí)行此操作 
  browser.close() # 關(guān)瀏覽器 
''' 
 
def main():
    """Run the full scrape: search, parse page 1, then walk the remaining pages.

    Fixes over the previous version:
    * page 1 is parsed as well (``search()`` only returns the pager text, and
      the loop used to start at page 2, so the first page was never stored);
    * the browser is shut down when the run ends or fails;
    * the page-count regex is a raw string and a missing match is reported
      explicitly.

    Raises:
        ValueError: If the pager text returned by ``search()`` has no number.
    """
    try:
        total_text = search()
        # The first result page is already displayed after the search.
        get_products()
        match = re.search(r'(\d+)', total_text)
        if match is None:
            raise ValueError('no page count found in pager text: %r' % (total_text,))
        total = int(match.group(1))
        for i in range(2, total + 1):
            next_page(i)
            print('搞定%d' % i)  # progress: page i finished
    finally:
        # Always end the WebDriver session, even after an error.
        browser.quit()
 
def save_to_mysql(product):
    """Insert one product dict as a row of ``women_clothes_zonghe``.

    The connection is now closed on every path (it used to leak whenever
    ``execute`` raised), and success is reported only after the commit.

    Args:
        product: Mapping with the keys ``shop_id``, ``shop``, ``link``,
            ``data_id``, ``title``, ``price``, ``location``, ``deal`` and
            ``image``.

    Database errors (``pymysql.Error``) are printed and swallowed so that one
    bad row does not stop the scrape.
    """
    sql = """INSERT INTO women_clothes_zonghe VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s)"""
    # Column order must match the CREATE TABLE statement.
    row = (
        product['shop_id'], product['shop'], product['link'],
        product['data_id'], product['title'], product['price'],
        product['location'], product['deal'], product['image'],
    )
    conn = None
    try:
        conn = pymysql.connect(host='localhost', user='root', passwd=' ', db='test1', port=3306, charset='utf8')
        cur = conn.cursor()
        try:
            cur.execute(sql, row)
        finally:
            cur.close()
        conn.commit()
        print('- - - - - 數(shù)據(jù)保存成功 - - - - -')
    except pymysql.Error as e:
        print(e)
    finally:
        if conn is not None:
            conn.close()
 
if __name__=='__main__':
    # Recreate the target table, then start the scrape. The setup connection is
    # committed and closed before main() runs (it used to stay open for the
    # whole run); save_to_mysql() opens its own connection per row.
    conn = pymysql.connect(host='localhost', user='root', passwd=' ', db='test1', port=3306, charset="utf8")
    try:
        cur = conn.cursor()
        try:
            # Drop any table left over from a previous run.
            cur.execute("DROP TABLE IF EXISTS women_clothes_zonghe")
            sqlc = """CREATE TABLE women_clothes_zonghe(
  shop_id VARCHAR(500),
  shop VARCHAR(500),
  link VARCHAR(1000),
  data_id varchar(100),
  title VARCHAR(1000),
  price VARCHAR(500),
  location VARCHAR(500),
  deal VARCHAR(500),
  image VARCHAR(1000)
 )"""
            cur.execute(sqlc)
        finally:
            cur.close()
        conn.commit()
    finally:
        conn.close()
    main()

2、按銷量

import json
import re
import urllib
import urllib.parse

import bs4
import pymongo
import pymysql
import requests
from bs4 import BeautifulSoup
from pyquery import PyQuery as pq  # used to parse the full page source
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

from config import *  # makes every name defined in config available
 
# Scrape the product info from every result page (100 pages in total),
# sorted by sales volume.


browser = webdriver.Firefox()
# Shared explicit wait used by wait.until(...) below (timeout argument: 10).
wait = WebDriverWait(browser,10)
 
def get_url(keyword, pages=100, page_size=44):
    """Yield one ``{'url': ...}`` dict per sales-sorted search result page.

    The keyword is percent-encoded and substituted into the search URL; the
    trailing ``s=`` query parameter is the item offset of the page and grows
    by ``page_size`` per page.

    Args:
        keyword: Search term.
        pages: Number of result pages to generate URLs for (previously
            hard-coded to 100).
        page_size: Offset step between consecutive pages (previously
            hard-coded to 44).

    Yields:
        dict: ``{'url': <search URL for that page>}``.
    """
    encoded = urllib.parse.quote(keyword)
    template = (
        'https://s.taobao.com/search?q={}&imgfile=&commend=all&ssid=s5-e&search_type=item&sourceId=tb.index&spm='
        'a21bo.50862.201856-taobao-item.1&ie=utf8&initiative_id=tbindexz_20170808&sort=sale-desc&bcoffset=0&p4ppushleft=%2C44&s={}'
    )
    for page in range(pages):
        yield {'url': template.format(encoded, page * page_size)}
 
# 可行 
def get_products(url):
    """Load ``url`` in the browser, parse its product cards and store each one.

    Waits until the product cards are present, parses ``browser.page_source``
    with PyQuery, builds one dict per card and hands it to ``save_to_mysql``.
    """
    card_selector = '#mainsrp-itemlist .items .item'
    browser.get(url)
    # Block until the product cards are present in the DOM.
    wait.until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, card_selector)))
    page = pq(browser.page_source)
    for card in page(card_selector).items():
        # Anchor inside the shop cell; it carries both id attributes.
        shop_anchor = card.find('.shop').find('a')
        price_text = card.find('.price').text()
        deal_text = card.find('.deal-cnt').text()
        location_text = card.find('.location').text()
        product = dict(
            # Per the original author, reading 'src' returned None for some
            # products, so 'data-src' is used instead.
            image=card.find('.pic .img').attr('data-src'),
            price=price_text[1:-3],                    # drop first char and last three
            shop_id=shop_anchor.attr('data-userid'),   # shop id
            data_id=shop_anchor.attr('data-nid'),      # product id
            link=card.find('.pic-box-inner').find('.pic').find('a').attr('href'),
            deal=deal_text[:-3],                       # drop last three chars
            title=card.find('.title').text(),
            shop=card.find('.shop').text(),
            location=location_text.replace(' ', ''),
        )
        save_to_mysql(product)
 
def save_to_mysql(product):
    """Insert one product dict as a row of ``women_clothes_sales2``.

    The connection is now closed on every path (it used to leak whenever
    ``execute`` raised), and success is reported only after the commit.

    Args:
        product: Mapping with the keys ``shop_id``, ``shop``, ``link``,
            ``data_id``, ``title``, ``price``, ``location``, ``deal`` and
            ``image``.

    Database errors (``pymysql.Error``) are printed and swallowed so that one
    bad row does not stop the scrape.
    """
    sql = "insert into women_clothes_sales2 values (%s,%s,%s,%s,%s,%s,%s,%s,%s)"
    # Column order must match the CREATE TABLE statement.
    row = (
        product['shop_id'], product['shop'], product['link'],
        product['data_id'], product['title'], product['price'],
        product['location'], product['deal'], product['image'],
    )
    conn = None
    try:
        conn = pymysql.connect(host='localhost',user='root',passwd=' ',port=3306,db='test1',charset='utf8')
        cur = conn.cursor()
        try:
            cur.execute(sql, row)
        finally:
            cur.close()
        conn.commit()
        print('- - - 數(shù)據(jù)保存成功 - - - ')
    except pymysql.Error as e:
        print(e)
    finally:
        if conn is not None:
            conn.close()
 
def main():
    """Scrape every sales-sorted result page for the keyword '女裝'.

    Generates the page URLs with ``get_url`` and passes each one to
    ``get_products``. The browser session is now ended when the run finishes
    or fails; previously the browser was left running.
    """
    keyword = '女裝'
    try:
        # get_url yields dicts of the form {'url': ...}, one per result page.
        for link in get_url(keyword):
            get_products(link['url'])
    finally:
        browser.quit()
 
if __name__=='__main__':
    # Recreate the target table, then start the scrape. The setup connection is
    # closed even if one of the DDL statements fails.
    conn = pymysql.connect(host='localhost',user = 'root',passwd=' ',db='test1',port = 3306,charset='utf8')
    try:
        cur = conn.cursor()
        try:
            # Drop any table left over from a previous run.
            cur.execute('Drop table if exists women_clothes_sales2')
            sqlc = "create table women_clothes_sales2(shop_id varchar(100),shop varchar(500),link varchar(1000),data_id varchar(100),title varchar(500),price varchar(200),location varchar(100),deal varchar(100),image varchar(1000))"
            cur.execute(sqlc)
        finally:
            cur.close()
        conn.commit()
    finally:
        conn.close()
    main()

更多內容請參考專題《python爬取功能匯總》進行學習。

以上就是本文的全部內容，希望對大家的學習有所幫助，也希望大家多多支持腳本之家。

您可能感興趣的文章: