在 Docker 中启动 Selenium Grid,通过 Chrome 扩展使用隧道代理,比如阿布云、多贝云、蘑菇代理。How to set a proxy with authentication in Selenium ChromeDriver (Python).
账号密码认证代理(proxy with authentication)需要通过 Chrome 扩展实现,而 Chrome headless 模式不支持加载扩展,因此无法使用;但是 Docker Selenium 或者 Selenium Grid 集群是支持的。
启动selenium docker
1
| docker run -d -p 4444:4444 --shm-size=2g -m 800M --memory-swap=800M --name=chrome --restart=always selenium/standalone-chrome
|
一、selenium使用隧道动态代理(会生成本地zip插件文件)
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114
import json
import os
import time
import zipfile

from scrapy.selector import Selector
from selenium import webdriver
# Tunnel-proxy endpoint and credentials. Fill in PROXY_USER / PROXY_PASS with
# the account issued by the proxy provider.
PROXY_HOST = 'http-dyn.abuyun.com'
PROXY_PORT = 9020
PROXY_USER = ''
PROXY_PASS = ''

# host:port of the remote Selenium server (the docker container started above).
REMOTE_SELENIUM = '111.22.111.11:4444'

# manifest.json of the generated Chrome extension (manifest version 2).
manifest_json = """
{
    "version": "1.0.0",
    "manifest_version": 2,
    "name": "Chrome Proxy",
    "permissions": [
        "proxy",
        "tabs",
        "unlimitedStorage",
        "storage",
        "<all_urls>",
        "webRequest",
        "webRequestBlocking"
    ],
    "background": {
        "scripts": ["background.js"]
    },
    "minimum_chrome_version":"22.0.0"
}
"""

# background.js of the generated extension: it points Chrome at the proxy and
# answers the proxy's authentication challenge with the configured credentials.
# Host, user and password are encoded with json.dumps so that quotes or
# backslashes in them cannot break out of the JavaScript string literals.
background_js = """
var config = {
    mode: "fixed_servers",
    rules: {
        singleProxy: {
            scheme: "http",
            host: %s,
            port: %d
        },
        bypassList: ["localhost"]
    }
};

chrome.proxy.settings.set({value: config, scope: "regular"}, function() {});

function callbackFn(details) {
    return {
        authCredentials: {
            username: %s,
            password: %s
        }
    };
}

chrome.webRequest.onAuthRequired.addListener(
    callbackFn,
    {urls: ["<all_urls>"]},
    ['blocking']
);
""" % (
    json.dumps(PROXY_HOST),
    int(PROXY_PORT),
    json.dumps(PROXY_USER),
    json.dumps(PROXY_PASS),
)
def get_chromedriver(use_proxy=False, user_agent=None, use_docker=True):
    """Build a Chrome WebDriver, optionally routed through the authenticated proxy.

    Args:
        use_proxy: When true, write ``proxy_auth_plugin.zip`` (containing
            ``manifest_json`` and ``background_js``) into the current working
            directory and load it into Chrome as an extension.
        user_agent: Optional value passed to Chrome's ``--user-agent`` switch.
        use_docker: When true, connect to the remote Selenium server at
            ``REMOTE_SELENIUM``; otherwise start a local Chrome using the
            chromedriver binary at ``/usr/local/bin/chromedriver``.

    Returns:
        The created WebDriver instance. The caller is responsible for
        calling ``quit()`` on it.
    """
    chrome_options = webdriver.ChromeOptions()

    if use_proxy:
        pluginfile = 'proxy_auth_plugin.zip'
        # The extension is regenerated on every call so that it always
        # reflects the current proxy settings.
        with zipfile.ZipFile(pluginfile, 'w') as zp:
            zp.writestr("manifest.json", manifest_json)
            zp.writestr("background.js", background_js)
        chrome_options.add_extension(pluginfile)

    if user_agent:
        chrome_options.add_argument('--user-agent=%s' % user_agent)

    if use_docker:
        return webdriver.Remote(
            command_executor="http://{}/wd/hub".format(REMOTE_SELENIUM),
            options=chrome_options,
        )

    # Use ``options=`` (as the Remote branch does) rather than the deprecated
    # ``chrome_options=`` keyword.
    return webdriver.Chrome(
        executable_path='/usr/local/bin/chromedriver',
        options=chrome_options,
    )
def main():
    """Load https://www.cip.cc through the proxy 11 times and print the reported IP.

    The same browser session is reused for every request and is always
    released with ``quit()``, even when a request fails, so the remote
    Selenium server is not left with an orphaned session.
    """
    driver = get_chromedriver(use_proxy=True, use_docker=True)
    print(driver)
    try:
        for _ in range(11):
            driver.get('https://www.cip.cc')
            # default='' keeps .strip() from failing when the page has no
            # <pre> element (e.g. an error page returned by the proxy).
            ip_text = Selector(text=driver.page_source).xpath(
                '//pre/text()').extract_first(default='').strip()
            print(ip_text)
            # Do not call driver.close() here: closing the only window ends
            # the session and the next driver.get() would fail.
            time.sleep(3)
    finally:
        driver.quit()


if __name__ == '__main__':
    main()
|
效果图

二、selenium 使用芝麻代理等常规HOST:PORT代理
1 2 3 4 5 6 7 8 9
from selenium import webdriver

# Plain HOST:PORT proxy without authentication.
PROXY = "88.157.149.250:8080"

chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument('--proxy-server=%s' % PROXY)

# ``options=`` replaces the deprecated ``chrome_options=`` keyword.
chrome = webdriver.Chrome(options=chrome_options)
try:
    chrome.get("http://www.cip.cc")
    print(chrome.page_source)
finally:
    # Always shut the browser down so no Chrome/chromedriver process is left behind.
    chrome.quit()
|