Using a User-Agent and IP proxy with scrapy-splash
Core idea
To set the User-Agent, prefer calling splash:set_user_agent("{ua}") inside the Lua script.
To set the IP proxy, pass it through SplashRequest's proxy argument (see the minimal sketch below).
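In other words, the UA string is interpolated into the Lua script before the request is sent, while the proxy travels in the SplashRequest args. A minimal sketch of just these two pieces (the URL, UA string, and proxy address are placeholders):

```python
from scrapy_splash import SplashRequest

LUA = """
function main(splash, args)
    splash:set_user_agent("{ua}")  -- UA is baked into the script
    assert(splash:go(args.url))
    return splash:html()
end""".format(ua="Mozilla/5.0 ...")  # any UA string works here

def build_request(callback):
    # 'proxy' is forwarded to Splash as a request argument.
    return SplashRequest(url="http://httpbin.org/get",
                         callback=callback,
                         endpoint='execute',
                         args={'lua_source': LUA,
                               'proxy': "http://1.2.3.4:8888"})
```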
Code
```
pip install fake-useragent
```
```python
import scrapy
from scrapy_splash import SplashRequest
from fake_useragent import UserAgent

from risk_control_info.utils import get_proxy_ip  # project helper that returns an "ip:port" proxy

ua = UserAgent()

# Disable image loading and set a random Chrome User-Agent before loading the page.
script = """
function main(splash, args)
    splash.images_enabled = false
    splash:set_user_agent("{ua}")
    assert(splash:go(args.url))
    assert(splash:wait(args.wait))
    return splash:html()
end""".format(ua=ua.chrome)


class AppQimaiHotSearchSpider(scrapy.Spider):
    name = 'app_qimai_hot_search'
    allowed_domains = ['qimai.cn']
    user_agent = ua.chrome
    custom_settings = {
        'DOWNLOADER_MIDDLEWARES': {
            'scrapy_splash.SplashCookiesMiddleware': 723,
            'scrapy_splash.SplashMiddleware': 725,
            'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware': 810,
        },
        'SPIDER_MIDDLEWARES': {
            'scrapy_splash.SplashDeduplicateArgsMiddleware': 100,
        },
    }

    def start_requests(self):
        url = "http://httpbin.org/get"
        yield SplashRequest(url=url,
                            callback=self.parse,
                            endpoint='execute',
                            args={
                                'lua_source': script,
                                # The proxy is passed to Splash through args, not via Scrapy middleware.
                                'proxy': "http://" + get_proxy_ip(url),
                                'wait': 3,
                            })

    def parse(self, response):
        print(response.body.decode())
```
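get_proxy_ip comes from the project's own risk_control_info.utils module and isn't shown in the post. A minimal sketch of what such a helper might look like, assuming a local proxy-pool HTTP API (the pool URL and JSON shape below are assumptions, not the author's code):

```python
import requests

def get_proxy_ip(target_url=None):
    """Hypothetical helper: return one "ip:port" string from a proxy pool.

    The pool address and response layout below are assumed; swap in
    whatever proxy source the project actually uses.
    """
    resp = requests.get("http://127.0.0.1:5010/get/", timeout=5)
    resp.raise_for_status()
    return resp.json()["proxy"]  # e.g. "1.2.3.4:8888"
```

Note that the spider's custom_settings only register the scrapy-splash middlewares; the project settings are assumed to also define SPLASH_URL (and, per the scrapy-splash README, DUPEFILTER_CLASS = 'scrapy_splash.SplashAwareDupeFilter') so that requests reach a running Splash instance.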
Result
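http://httpbin.org/get echoes the request back as JSON, so the printed response verifies both settings: headers.User-Agent should show the random Chrome UA from fake-useragent, and origin should show the proxy's IP rather than your own address.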
