centos 8 docker 搭建 chrome or opera + python+selenium webdriver环境,实现网络数据爬虫

# Target system: CentOS 8. Bring the installed packages up to date first.
# -y answers yum's confirmation prompt, so the step also works without a TTY
# (for example inside a Dockerfile RUN instruction).
yum -y update

# wget and unzip are used by the download steps further down; install them
# here in case the base image does not ship them.
yum -y install wget unzip

# Install Python 3.8 and its package tool pip
yum -y install python38

#安装完成后 查看版本
[root@7c73e1180bfb ~]# python3.8 -V
Python 3.8.0

[root@7c73e1180bfb ~]# pip3.8 -V
pip 19.2.3 from /usr/lib/python3.8/site-packages/pip (python 3.8)

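# Package dependencies can differ between OS releases. If Python cannot be installed
# from the package repository, build it from source instead (see the guide "linux(centos)安装python").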
# Install the selenium package
pip3.8 install selenium
# OR, when downloads from PyPI are slow inside mainland China, run this instead
# to install from the Tsinghua mirror:
# pip3.8 install selenium -i https://pypi.tuna.tsinghua.edu.cn/simple --trusted-host pypi.tuna.tsinghua.edu.cn
# Download the Opera browser
# Release index: https://download4.operacdn.com/ftp/pub/opera/desktop/
# Download the rpm for the chosen version
wget https://download4.operacdn.com/ftp/pub/opera/desktop/70.0.3728.95/linux/opera-stable_70.0.3728.95_amd64.rpm

# Install the downloaded rpm
# (-y answers the confirmation prompt, so this also works without a TTY)
yum -y localinstall opera-stable_70.0.3728.95_amd64.rpm

#查看浏览器版本
[root@7c73e1180bfb ~]# opera -version
70.0.3728.95

# Install the browser driver; pick the driver release that matches the browser version.
# Release list: https://github.com/operasoftware/operachromiumdriver/releases

wget https://github.com/operasoftware/operachromiumdriver/releases/download/v.84.0.4147.89/operadriver_linux64.zip

# -j drops any directory stored in the archive so the operadriver binary always
# lands in the current directory; -o overwrites files left by an earlier run.
# NOTE(review): the release zip appears to nest the binary under
# operadriver_linux64/, which a plain `unzip` + `cp operadriver ...` would miss — confirm.
unzip -j -o operadriver_linux64.zip

# Copy the driver onto PATH and make sure it is executable
install -m 0755 operadriver /usr/bin/operadriver
# Download the Google Chrome browser
# https://www.chrome64bit.com/index.php/google-chrome-64-bit-for-linux
# Download the rpm. NOTE(review): the "_current_" file name presumably always
# points at the newest stable build, so check `google-chrome --version` after
# installing and choose the ChromeDriver release to match.
wget https://dl.google.com/linux/direct/google-chrome-stable_current_x86_64.rpm

# Install the downloaded rpm
# (-y answers the confirmation prompt, so this also works without a TTY)
yum -y install google-chrome-stable_current_x86_64.rpm

#查看浏览器版本
[root@d289bf70da9a ~]# google-chrome --version
Google Chrome 85.0.4183.83 

# Install the browser driver; pick the driver release that matches the browser version.
# Release index: https://chromedriver.storage.googleapis.com/index.html
# NOTE(review): the npm.taobao.org mirror this guide originally pointed at has been
# retired; its successor is https://registry.npmmirror.com/binary.html?path=chromedriver/
# — confirm which source is reachable from your network.

wget https://chromedriver.storage.googleapis.com/85.0.4183.87/chromedriver_linux64.zip

unzip chromedriver_linux64.zip

# Copy the driver onto PATH
cp chromedriver /usr/bin/chromedriver

#查看驱动版本
[root@d289bf70da9a ~]# chromedriver --version
ChromeDriver 85.0.4183.87 (cd6713ebf92fa1cacc0f1a598df280093af0c5d7-refs/branch-heads/4183@{#1689})
新建 webdriver.py  文件  格式与下面保持一致防止执行报错
import io
import sys
import time
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

# Re-wrap stdout so the scraped HTML is always written as UTF-8 instead of
# the interpreter's default output encoding.
sys.stdout = io.TextIOWrapper(sys.stdout.buffer,encoding='utf8')

# Page-load strategy 'eager': WebDriver waits until the initial HTML document
# has been loaded and parsed (the DOMContentLoaded event) but does not wait for
# stylesheets, images and subframes. The previous value 'none' made get()
# return immediately, so page_source could be read before the page existed.
options = Options()
options.page_load_strategy = 'eager'
# Chrome refuses to start as root with its sandbox enabled.
options.add_argument("--no-sandbox")
options.add_argument('--headless')
# Docker gives containers a small /dev/shm by default; keep Chrome's shared
# memory files in /tmp instead so large pages do not crash the browser.
options.add_argument('--disable-dev-shm-usage')
driver = webdriver.Chrome(options=options)

try:
    # Maximize the window
    driver.maximize_window()

    # Walk through result pages 1..13 and dump what the browser received
    for num in range(1,14):
        driver.get("https://www.amazon.com/s?k=keyboard&page=%s&qid=%s&ref=sr_pg_3" %(num,time.time()))
        html_source = driver.page_source
        print(html_source)
        print(driver.current_url)
        print(driver.get_cookies())
finally:
    # Scraping finished (or failed): quit() closes every window and shuts down
    # the chromedriver process, so nothing is left running on errors either.
    driver.quit()
# Run the script; it prints each page's HTML source, URL and cookies to the terminal
python3.8 webdriver.py