Scrapy項目實戰之爬取某社區用戶詳情
更新時間:2020年09月17日 11:01:16 作者:hankleo
這篇文章主要介紹了Scrapy項目實戰之爬取某社區用戶詳情,文中通過示例代碼介紹得非常詳細,對大家的學習或者工作具有一定的參考學習價值,需要的朋友們下面隨著小編來一起學習學習吧
本文介紹了Scrapy項目實戰之爬取某社區用戶詳情,分享給大家,具體如下:
get_cookies.py
from selenium import webdriver
from pymongo import MongoClient
from scrapy.crawler import overridden_settings
# from segmentfault import settings
import time
import settings
class GetCookies(object):
    """Log in to segmentfault.com with Selenium and store the session cookies in MongoDB.

    Reads ``USER_LIST``, ``MONGO_URI`` and ``MONGO_DB`` from the project's
    ``settings`` module. Each stored document is a flat ``{name: value}``
    mapping of one browser session's cookies.
    """

    def __init__(self):
        # Chrome options shared by every driver this object creates.
        self.opt = webdriver.ChromeOptions()
        # self.opt.add_argument("--headless")
        # (username, password) pairs used for logging in.
        self.user_list = settings.USER_LIST
        # MongoDB handles; cookies go into the "cookies" collection.
        self.client = MongoClient(settings.MONGO_URI)
        self.db = self.client[settings.MONGO_DB]
        self.collection = self.db["cookies"]

    def get_cookies(self, username, password):
        """Log in with the given account and return the browser's cookies.

        :param username: account name typed into the login form
        :param password: account password typed into the login form
        :return: the list of cookie dicts reported by ``driver.get_cookies()``
        """
        # NOTE(review): the chromedriver path is machine-specific.
        driver = webdriver.Chrome(executable_path="/Users/Hank/scrapy/segmentfault/segmentfault/chromedriver",options=self.opt)
        try:
            driver.get("https://segmentfault.com/user/login")
            driver.find_element_by_name("username").send_keys(username)
            driver.find_element_by_name("password").send_keys(password)
            driver.find_element_by_xpath("//button[@type='submit']").click()
            # Give the login request time to complete before navigating on.
            time.sleep(2)
            driver.get("https://segmentfault.com/u/luwangmeilun/users/following")
            # Cookies are read after visiting a page as the logged-in user.
            return driver.get_cookies()
        finally:
            # Always release the browser, even when a lookup or click fails.
            driver.quit()

    def format_cookies(self, cookies):
        """Reduce Selenium cookie dicts to a flat ``{name: value}`` mapping.

        ``driver.get_cookies()`` yields entries such as
        ``{'domain': 'segmentfault.com', 'httpOnly': False, 'name': 'PHPSESSID',
        'path': '/', 'secure': False, 'value': 'web2~5grm...'}``; only ``name``
        and ``value`` are kept.

        :param cookies: iterable of cookie dicts containing 'name' and 'value'
        :return: dict mapping each cookie name to its value
        """
        return {entry['name']: entry['value'] for entry in cookies}

    def save(self):
        """Log in once per configured account and insert each cookie set into MongoDB."""
        print("開始獲取Cookies....")
        for username, password in self.user_list:
            cookies = self.get_cookies(username, password)
            f_cookies = self.format_cookies(cookies)
            print("insert cookie:{}".format(f_cookies))
            # Store the flattened cookies as one document.
            self.collection.insert_one(f_cookies)
if __name__ == '__main__':
    # Run the full login-and-store pass 20 times; every pass inserts one
    # cookie document per configured account.
    cookies = GetCookies()
    for _ in range(20):
        cookies.save()
items.py
# -*- coding: utf-8 -*-
# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html
import scrapy


class SegmentfaultItem(scrapy.Item):
    """One segmentfault.com user profile as collected by the ``userinfo`` spider."""

    # --- Personal attributes ---
    # Display name
    name = scrapy.Field()
    # Reputation
    rank = scrapy.Field()
    # School
    school = scrapy.Field()
    # Major
    majors = scrapy.Field()
    # Company
    company = scrapy.Field()
    # Job title
    job = scrapy.Field()
    # Personal blog URL
    blog = scrapy.Field()

    # --- Social activity ---
    # Number of users followed
    following = scrapy.Field()
    # Number of fans
    fans = scrapy.Field()
    # Number of answers
    answers = scrapy.Field()
    # Number of questions asked
    questions = scrapy.Field()
    # Number of articles
    articles = scrapy.Field()
    # Number of lectures
    lives = scrapy.Field()
    # Number of badges
    badges = scrapy.Field()

    # --- Skills ---
    # Number of likes received
    like = scrapy.Field()
    # Skill tags
    skills = scrapy.Field()
    # Registration date
    register_date = scrapy.Field()

    # --- Q&A statistics ---
    # Highest score received by an answer
    answers_top_score = scrapy.Field()
    # Title of the question the top-scored answer belongs to
    answers_top_title = scrapy.Field()
    # Tags of the question the top-scored answer belongs to
    answers_top_tags = scrapy.Field()
    # Body of the question the top-scored answer belongs to
    answers_top_question = scrapy.Field()
    # Body of the top-scored answer itself
    answers_top_content = scrapy.Field()
pipelines.py
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
import pymongo
class SegmentfaultPipeline(object):
    """Item pipeline that stores every scraped item in a MongoDB collection."""

    # Name of the MongoDB collection items are written to.
    collection_name = 'userinfo'

    def __init__(self, mongo_uri, mongo_db):
        """Remember the connection parameters; the connection is opened in open_spider.

        :param mongo_uri: MongoDB connection URI / host string
        :param mongo_db: name of the database to write to
        """
        self.mongo_uri = mongo_uri
        self.mongo_db = mongo_db
        self.client = None
        self.db = None

    @classmethod
    def from_crawler(cls, crawler):
        """Build the pipeline from the crawler's settings.

        Reads ``MONGO_URI`` and ``MONGO_DB`` (defaulting to 'segmentfault').
        """
        return cls(
            mongo_uri=crawler.settings.get('MONGO_URI'),
            mongo_db=crawler.settings.get('MONGO_DB', 'segmentfault'),
        )

    def open_spider(self, spider):
        """Connect to MongoDB when the spider starts."""
        self.client = pymongo.MongoClient(self.mongo_uri)
        self.db = self.client[self.mongo_db]

    def close_spider(self, spider):
        """Close the MongoDB connection when the spider stops."""
        # Guard against close being called when open_spider never ran.
        if self.client is not None:
            self.client.close()

    def process_item(self, item, spider):
        """Insert a plain-dict copy of the item and hand the item on unchanged."""
        self.db[self.collection_name].insert_one(dict(item))
        return item
settings.py
# -*- coding: utf-8 -*-
# Scrapy settings for segmentfault project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# https://doc.scrapy.org/en/latest/topics/settings.html
# https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
# https://doc.scrapy.org/en/latest/topics/spider-middleware.html
# Project name and the package Scrapy searches for (and generates) spiders in.
BOT_NAME = 'segmentfault'
SPIDER_MODULES = ['segmentfault.spiders']
NEWSPIDER_MODULE = 'segmentfault.spiders'
# Crawl responsibly by identifying yourself (and your website) on the user-agent
# NOTE(review): DOWNLOADER_MIDDLEWARES below enables SegmentfaultUserAgentMiddleware,
# which sets the User-Agent header per request from USER_AGENT_LIST.
USER_AGENT = 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36'
# Obey robots.txt rules (disabled here)
ROBOTSTXT_OBEY = False
# Configure maximum concurrent requests performed by Scrapy (default: 16)
CONCURRENT_REQUESTS = 100
# Configure a delay for requests for the same website (default: 0)
# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
# DOWNLOAD_DELAY = 2
# The download delay setting will honor only one of:
# CONCURRENT_REQUESTS_PER_DOMAIN = 32
# CONCURRENT_REQUESTS_PER_IP = 32
# Disable cookies (enabled by default)
# COOKIES_ENABLED = False
# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False
# Retries and automatic redirect-following are switched off; the project's
# SegmentfaultCookiesMiddleware inspects 301/302 responses itself.
RETRY_ENABLED = False
REDIRECT_ENABLED = False
# Download timeout value passed to Scrapy.
DOWNLOAD_TIMEOUT = 5
# HTTPALLOW
# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
# 'Accept-Language': 'en',
#}
# Enable or disable spider middlewares
# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
# Spider middleware that normalizes the scraped 'register_date' field.
SPIDER_MIDDLEWARES = {
 'segmentfault.middlewares.SegmentfaultSpiderMiddleware': 543,
}
# Enable or disable downloader middlewares
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
# The project's user-agent and cookies middlewares are enabled, the proxy
# middleware is commented out, and Scrapy's built-in proxy and user-agent
# middlewares are disabled by mapping them to None.
# NOTE(review): Scrapy's built-in CookiesMiddleware is documented at priority
# 700; a middleware at 743 assigns request.cookies after it has run -- confirm
# the custom cookies are actually sent.
DOWNLOADER_MIDDLEWARES = {
 # 'segmentfault.middlewares.SegmentfaultHttpProxyMiddleware': 543,
 'segmentfault.middlewares.SegmentfaultUserAgentMiddleware':643,
 'segmentfault.middlewares.SegmentfaultCookiesMiddleware':743,
 'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware': None,
 'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None,
 # 'scrapy.downloadermiddlewares.cookies.CookiesMiddleware':None,
}
# Enable or disable extensions
# See https://doc.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
# 'scrapy.extensions.telnet.TelnetConsole': None,
#}
# Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
# Scraped items are passed to the MongoDB-writing pipeline.
ITEM_PIPELINES = {
 'segmentfault.pipelines.SegmentfaultPipeline': 300,
}
# Enable and configure the AutoThrottle extension (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/autothrottle.html
# AUTOTHROTTLE_ENABLED = True
# # The initial download delay
# AUTOTHROTTLE_START_DELAY = 5
# # The maximum download delay to be set in case of high latencies
# AUTOTHROTTLE_MAX_DELAY = 60
# # The average number of requests Scrapy should be sending in parallel to
# # each remote server
# AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# # Enable showing throttling stats for every response received:
# AUTOTHROTTLE_DEBUG = False
# Enable and configure HTTP caching (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
# MongoDB connection: host/port string and database name, read by the
# pipeline, the spider, the cookies middleware and get_cookies.py.
MONGO_URI = 'localhost:27017'
MONGO_DB = 'segmentfault'
# Accounts used by get_cookies.py to log in: (username, password) pairs.
# NOTE(review): these are plaintext credentials committed in source; consider
# loading them from the environment or an untracked file instead.
USER_LIST = [
 ("798549150@qq.com","guoqing1010"),
 ("learnscrapy@163.com","guoqing1010"),
]
# Proxy URLs; SegmentfaultHttpProxyMiddleware picks one at random per request.
PROXY_LIST = [
 'http://115.182.212.169:8080',
 'http://121.61.25.149:9999',
 'http://180.118.247.189:9000',
 'http://115.151.3.12:9999',
 'http://183.154.213.160:9000',
 'http://113.128.9.106:9999',
 'http://124.42.68.152:90',
 'http://49.70.48.50:9999',
 'http://113.128.11.172:9999',
 'http://111.177.177.40:9999',
 'http://59.62.83.253:9999',
 'http://39.107.84.185:8123',
 'http://124.94.195.107:9999',
 'http://111.177.160.132:9999',
 'http://120.25.203.182:7777'
]
# User-Agent strings; SegmentfaultUserAgentMiddleware picks one at random per request.
USER_AGENT_LIST = [
 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36 OPR/26.0.1656.60',
 'Opera/8.0 (Windows NT 5.1; U; en)',
 'Mozilla/5.0 (Windows NT 5.1; U; en; rv:1.8.1) Gecko/20061208 Firefox/2.0.0 Opera 9.50',
 'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; en) Opera 9.50',
 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:34.0) Gecko/20100101 Firefox/34.0',
 'Mozilla/5.0 (X11; U; Linux x86_64; zh-CN; rv:1.9.2.10) Gecko/20100922 Ubuntu/10.10 (maverick) Firefox/3.6.10',
 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.71 Safari/537.36',
 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11',
 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.16 (KHTML, like Gecko) Chrome/10.0.648.133 Safari/534.16',
 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER',
 'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E; LBBROWSER)',
 'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E)',
 'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.84 Safari/535.11 SE 2.X MetaSr 1.0',
 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SV1; QQDownload 732; .NET4.0C; .NET4.0E; SE 2.X MetaSr 1.0)',
 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Maxthon/4.4.3.4000 Chrome/30.0.1599.101 Safari/537.36',
 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122 UBrowser/4.0.3214.0 Safari/537.36'
]
userinfo.py
# -*- coding: utf-8 -*-
import scrapy
import time
from scrapy import Request
from pymongo import MongoClient
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider,Rule
from scrapy.http import FormRequest
from segmentfault.items import SegmentfaultItem
class UserinfoSpider(CrawlSpider):
    """Crawl segmentfault.com user profiles, following each user's "following" list.

    For every profile page an item is filled in three steps: the profile page
    itself (``parse_item``), the page of the user's top-scored answer
    (``parse_answer``) and the user's badge page (``parse_badge``).
    """

    name = 'userinfo'
    allowed_domains = ['segmentfault.com']
    start_urls = ['https://segmentfault.com/u/mybigbigcat/users/following']
    rules = (
        # User home pages: follow and parse.
        Rule(LinkExtractor(allow=r'/u/\w+$'), callback='parse_item', follow=True),
        # "Following" list pages: follow to discover more user home pages.
        Rule(LinkExtractor(allow=r'/users/following$'), follow=True),
    )

    def start_requests(self):
        """Issue the first request with one stored cookie set attached.

        :return: a one-element list holding the request for the site root
        :raises RuntimeError: when the ``cookies`` collection is empty
        """
        settings = self.crawler.settings
        client = MongoClient(settings['MONGO_URI'])
        try:
            cookies = client[settings['MONGO_DB']].cookies.find_one()
        finally:
            client.close()
        if cookies is None:
            raise RuntimeError("no cookies stored in MongoDB; run get_cookies.py first")
        # The MongoDB document id is not a cookie.
        cookies.pop('_id', None)
        # This cookie holds the current time as a 10-digit timestamp, so refresh it.
        cookies['Hm_lpvt_e23800c454aa573c0ccb16b52665ac26'] = str(int(time.time()))
        return [Request("https://segmentfault.com",
                        cookies=cookies,
                        meta={'cookiejar': 1},
                        callback=self.after_login)]

    def after_login(self, response):
        """Start the rule-driven crawl from every URL in ``start_urls``."""
        for url in self.start_urls:
            # No explicit callback: CrawlSpider's default parsing applies the rules.
            yield Request(url, dont_filter=True)

    @staticmethod
    def _split_pair(texts):
        """Return (first, stripped second) from a list of text nodes, '' where missing."""
        first = texts[0] if texts else ''
        second = texts[1].strip() if len(texts) > 1 else ''
        return first, second

    def _badge_request_or_item(self, item, badge_url):
        """Return the badge-page request for *item*, or the item itself when there is no badge URL."""
        if badge_url:
            return scrapy.Request(url=badge_url,
                                  meta={'item': item},
                                  callback=self.parse_badge)
        return item

    def parse_item(self, response):
        """Extract profile data from a user home page.

        :param response: the user home page
        :return: yields the follow-up request (answer page or badge page), or
                 the finished item when neither link is present
        """
        item = SegmentfaultItem()

        # --- Personal attributes ---
        profile_head = response.css('.profile__heading')
        item['name'] = profile_head.css('h2[class*=name]::text').re_first(r'\w+')
        item['rank'] = profile_head.css('.profile__rank-btn > span::text').extract_first()
        # School / major and company / job come as up to two text nodes each.
        item['school'], item['majors'] = self._split_pair(
            profile_head.css('.profile__school::text').extract())
        item['company'], item['job'] = self._split_pair(
            profile_head.css('.profile__company::text').extract())
        item['blog'] = profile_head.css('a[class*=other-item-link]::attr(href)').extract_first()

        # --- Statistics panel ---
        profile_active = response.xpath("//div[@class='col-md-2']")
        follow_counts = profile_active.css('div[class*=info] a > .h5::text').re(r'\d+')
        item['following'] = follow_counts[0] if follow_counts else None
        item['fans'] = follow_counts[1] if len(follow_counts) > 1 else None
        item['answers'] = profile_active.css('a[href*=answer] .count::text').re_first(r'\d+')
        item['questions'] = profile_active.css('a[href*=questions] .count::text').re_first(r'\d+')
        item['articles'] = profile_active.css('a[href*=articles] .count::text').re_first(r'\d+')
        item['lives'] = profile_active.css('a[href*=lives] .count::text').re_first(r'\d+')
        item['badges'] = profile_active.css('a[href*=badges] .count::text').re_first(r'\d+')
        badge_href = profile_active.css('a[href*=badges]::attr(href)').extract_first()

        # --- Skills panel ---
        profile_skill = response.xpath("//div[@class='col-md-3']")
        item['skills'] = profile_skill.css('.tag::text').re(r'\w+')
        # Accept both the Traditional and the Simplified wording of "received N likes".
        item['like'] = profile_skill.css('.authlist').re_first(r'(?:獲得|获得) (\d+) 次(?:點贊|点赞)')
        # Raw registration text; SegmentfaultSpiderMiddleware normalizes it later.
        item['register_date'] = profile_skill.css('.profile__skill--other p::text').extract_first()

        # --- Output panel ---
        profile_work = response.xpath("//div[@class='col-md-7']")
        item['answers_top_score'] = profile_work.css('#navAnswer .label::text').re_first(r'\d+')
        item['answers_top_title'] = profile_work.css('#navAnswer div[class*=title-warp] > a::text').extract_first()
        answer_href = profile_work.css('#navAnswer div[class*=title-warp] > a::attr(href)').extract_first()

        badge_url = response.urljoin(badge_href) if badge_href else None
        if answer_href:
            # Carry the item and the badge URL along to the answer page.
            yield scrapy.Request(
                url=response.urljoin(answer_href),
                meta={'item': item, 'badge_url': badge_url},
                callback=self.parse_answer)
        else:
            # No answers: urljoin(None) would just re-request this page, so
            # fill the answer fields with empty values and move on.
            item['answers_top_tags'] = []
            item['answers_top_question'] = ''
            item['answers_top_content'] = ''
            yield self._badge_request_or_item(item, badge_url)

    def parse_answer(self, response):
        """Add the top answer's question tags, question text and answer text to the item."""
        item = response.meta['item']
        badge_url = response.meta['badge_url']
        item['answers_top_tags'] = response.css('.question__title--tag .tag::text').re(r'\w+')
        # Collect the text fragments between tags and join them.
        question_content = response.css('.widget-question__item p').re(r'>(.*?)<')
        item['answers_top_question'] = ''.join(question_content)
        answer_content = response.css('.qa-answer > article .answer').re(r'>(.*?)<')
        item['answers_top_content'] = ''.join(answer_content)
        # Continue with the badge page, passing the updated item along.
        yield self._badge_request_or_item(item, badge_url)

    def parse_badge(self, response):
        """Replace ``item['badges']`` with a ``{badge name: count}`` mapping and emit the item."""
        item = response.meta['item']
        badge_name = response.css('span.badge span::text').extract()
        badge_count = response.css('span[class*=badges-count]::text').re(r'\d+')
        # zip stops at the shorter list, so a mismatch cannot raise IndexError.
        item['badges'] = dict(zip(badge_name, badge_count))
        yield item
middlewares.py
# -*- coding: utf-8 -*-
# Define here the models for your spider middleware
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/spider-middleware.html
import random
import re
import datetime
import scrapy
import logging
import time
from scrapy.conf import settings
from pymongo import MongoClient
from scrapy.downloadermiddlewares.httpproxy import HttpProxyMiddleware
import pymongo
logger = logging.getLogger(__name__)
class SegmentfaultSpiderMiddleware(object):
    """Normalize the ``register_date`` field of scraped items to ``YYYYMMDD``.

    The raw text comes in these forms (Traditional or Simplified script):
      1. registered on 2015年12月12日   (absolute date)
      2. registered 3 天前               (days ago)
      3. registered 5 小時前             (hours ago)
    """

    @staticmethod
    def _normalize_register_date(register_date, now):
        """Convert raw registration text to a ``YYYYMMDD`` string.

        :param register_date: raw text, or None when nothing was scraped
        :param now: ``datetime.datetime`` that relative forms are measured from
        :return: digits of the date, or '' when the text holds no number
        """
        groups = re.findall(r'\d+', register_date or '')
        if not groups:
            return ''
        # Absolute date: several number groups (or one long run of digits).
        # Month and day are zero-padded so '2015年1月2日' becomes '20150102'.
        if len(groups) > 1 or len(groups[0]) > 4:
            return groups[0] + ''.join(part.zfill(2) for part in groups[1:])
        amount = int(groups[0])
        if '時' in register_date or '时' in register_date:
            delta = datetime.timedelta(hours=amount)
        elif '分' in register_date:
            # "N minutes ago"
            delta = datetime.timedelta(minutes=amount)
        else:
            # Remaining form: "N days ago".
            delta = datetime.timedelta(days=amount)
        return (now - delta).strftime("%Y%m%d")

    def process_spider_output(self, response, result, spider):
        """Rewrite ``register_date`` on items; pass everything else through untouched.

        :param response: the response being processed
        :param result: iterable of items and requests produced by the spider
        :param spider: the running spider
        :return: yields every element of *result*, items with a normalized date
        """
        for element in result:
            if isinstance(element, scrapy.Item) and 'register_date' in element.fields:
                register_date = element.get('register_date')
                logger.info("獲取注冊日志格式為{}".format(register_date))
                element['register_date'] = self._normalize_register_date(
                    register_date, datetime.datetime.now())
            # Requests and other results must still reach the engine.
            yield element
class SegmentfaultHttpProxyMiddleware(object):
    """Downloader middleware that routes each request through a randomly chosen proxy."""

    def __init__(self, proxy_list=None):
        """
        :param proxy_list: proxy URLs to choose from; when omitted, the
                           module-level ``settings['PROXY_LIST']`` is used
        """
        if proxy_list is None:
            proxy_list = settings['PROXY_LIST']
        self.proxy_list = list(proxy_list)

    @classmethod
    def from_crawler(cls, crawler):
        """Build the middleware from the running crawler's ``PROXY_LIST`` setting."""
        return cls(proxy_list=crawler.settings.get('PROXY_LIST') or [])

    def process_request(self, request, spider):
        """Set ``request.meta['proxy']``; leave the request alone when no proxy is configured."""
        if not self.proxy_list:
            return
        proxy = random.choice(self.proxy_list)
        logger.info('使用代理:{}'.format(proxy))
        request.meta['proxy'] = proxy
class SegmentfaultUserAgentMiddleware(object):
    """Downloader middleware that sets a randomly chosen User-Agent header on each request."""

    def __init__(self, useragent_list=None):
        """
        :param useragent_list: User-Agent strings to choose from; when omitted,
                               the module-level ``settings['USER_AGENT_LIST']`` is used
        """
        if useragent_list is None:
            useragent_list = settings['USER_AGENT_LIST']
        self.useragent_list = list(useragent_list)

    @classmethod
    def from_crawler(cls, crawler):
        """Build the middleware from the running crawler's ``USER_AGENT_LIST`` setting."""
        return cls(useragent_list=crawler.settings.get('USER_AGENT_LIST') or [])

    def process_request(self, request, spider):
        """Overwrite the User-Agent header; keep the existing one when the list is empty."""
        if not self.useragent_list:
            return
        request.headers['User-Agent'] = random.choice(self.useragent_list)
class SegmentfaultCookiesMiddleware(object):
    """Downloader middleware that attaches stored login cookies to requests.

    Cookie sets are read from the MongoDB ``cookies`` collection. When a
    response redirects to the login page, the cookie set that was used is
    deleted from the collection and the request is re-queued with another one.
    """

    client = MongoClient(settings['MONGO_URI'])
    db = client[settings['MONGO_DB']]
    collection = db['cookies']

    # Cookie whose value is rewritten with the current time on every use, so
    # it can never be used to match a stored document.
    _VISIT_TIME_KEY = 'Hm_lpvt_e23800c454aa573c0ccb16b52665ac26'

    def get_cookies(self):
        """Return a randomly chosen stored cookie set, ready to attach to a request.

        :return: ``{name: value}`` dict, or an empty dict when none are stored
        """
        stored = list(self.collection.find())
        if not stored:
            logger.warning("No cookies available in MongoDB")
            return {}
        cookies = random.choice(stored)
        # Drop the entries that must not be sent; either may be absent.
        cookies.pop('_id', None)
        cookies.pop('_gat', None)
        # Fill the visit-time cookie with the current time.
        cookies[self._VISIT_TIME_KEY] = str(int(time.time()))
        return cookies

    def remove_cookies(self, cookies):
        """Delete the stored document(s) matching an expired cookie set.

        :param cookies: the ``{name: value}`` dict that was sent with the request
        """
        if not isinstance(cookies, dict):
            return
        # Match on every stable name/value pair. The caller's dict is left
        # untouched and the rewritten visit-time cookie is excluded.
        query = {key: value for key, value in cookies.items()
                 if key != self._VISIT_TIME_KEY}
        if not query:
            return
        logger.info("刪除cookies{}".format(cookies))
        try:
            self.collection.delete_many(query)
        except pymongo.errors.PyMongoError:
            logger.warning("No this cookies:{}".format(cookies), exc_info=True)

    def process_request(self, request, spider):
        """Attach a cookie set to the outgoing request (no-op when none are stored)."""
        cookies = self.get_cookies()
        if cookies:
            request.cookies = cookies

    def process_response(self, request, response, spider):
        """Re-queue the request with fresh cookies when it was redirected to the login page.

        :return: a replacement request when the login expired and another
                 cookie set exists; otherwise the response unchanged
        """
        if response.status in (301, 302):
            logger.info("Redirect response:{}".format(response))
            redirect_url = response.headers.get('location') or b''
            if b'/user/login' in redirect_url:
                logger.info("Cookies失效")
                # Delete the stale set first so it cannot be picked again.
                self.remove_cookies(request.cookies)
                new_cookie = self.get_cookies()
                if new_cookie:
                    logger.info("獲取新cookie:{}".format(new_cookie))
                    # dont_filter keeps the duplicate filter from dropping the retry.
                    return request.replace(cookies=new_cookie, dont_filter=True)
        return response
run.py
from scrapy import cmdline
# from segmentfault.get_cookies import GetCookies
from get_cookies import GetCookies
if __name__ == '__main__':
    # Refresh the stored login cookies, then launch the spider.
    cookies = GetCookies()
    cookies.save()
    name = 'userinfo'
    # Equivalent to running `scrapy crawl userinfo` from the shell.
    cmdline.execute(['scrapy', 'crawl', name])
到此這篇關于Scrapy項目實戰之爬取某社區用戶詳情的文章就介紹到這了,更多相關Scrapy 爬取某社區用戶內容請搜索腳本之家以前的文章或繼續瀏覽下面的相關文章,希望大家以后多多支持腳本之家!
相關文章
使用python將mdb數據庫文件導入postgresql數據庫示例
mdb格式文件可以通過mdbtools工具將內中包含的每張表導出到csv格式文件。由于access數據庫和postgresQL數據庫格式上會存在不通性,所以使用python的文件處理,將所得csv文件修改成正確、能識別的格式 2014-02-02
python grpc實現異步調用(不用grpc異步接口)
grpc同步調用更簡單,但是在處理復雜任務時,會導致請求阻塞,影響吞吐,本文主要介紹了python grpc實現異步調用,不用grpc異步接口,具有一定的參考價值,感興趣的可以了解一下 2024-04-04
如何優雅地處理Django中的favicon.ico圖標詳解
默認情況下,瀏覽器訪問一個網站的時候,同時還會向服務器請求"/favicon.ico"這個URL,目的是獲取網站的圖標,下面這篇文章主要給大家介紹了關于如何優雅地處理Django中favicon.ico圖標的相關資料,需要的朋友可以參考下 2018-07-07

