使用Selenium反爬(美团)
美团的反爬机制非常完善。在用 Selenium 登录美团的时候,发现美团能检测到自动化操作并弹出滑块,而且无论怎么滑动都无法通过。经过一番搜索后发现,很多网站对 Selenium 都有检测机制,例如检测浏览器中是否存在 Selenium 特有的标识。接下来我们简单分享一下如何使用代理访问美团进行数据采集。
示例如下:
# -*- coding:UTF-8 -*-
import re
import time
from datetime import date, timedelta

from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver import ActionChains
from selenium.webdriver.chrome.options import Options

# First page the browser opens.
TB_LOGIN_URL = 'https://meituan.com'
# Path to the chromedriver binary; the location differs between Windows and macOS.
CHROME_DRIVER = '/usr/local/bin/chromedriver'
class SessionException(Exception):
    """Session error raised when the slider check or the login submit fails."""

    def __init__(self, message):
        # Hand the text (not ``self``) to Exception so ``args`` is (message,).
        super().__init__(message)
        self.message = message

    def __str__(self):
        return self.message


class Crawler:
    """Log in through a Selenium-driven Chrome and run the crawl steps.

    The navigate / parse / save steps are placeholders in this example.

    NOTE(review): the code uses the Selenium 3 style API
    (``find_element_by_*``, ``executable_path=``) -- confirm the installed
    Selenium version still provides it.
    """

    def __init__(self):
        # Set by __init_browser(); None until start() is called.
        self.browser = None

    def start(self, username, password):
        """Run the whole flow: open the browser, log in, crawl, save.

        :param username: account name typed into the login form
        :param password: password typed into the login form
        :raises SessionException: if the slider check or the login fails
        """
        try:
            print("初始化浏览器")
            self.__init_browser()
            print("切换至密码输入框")
            self.__switch_to_password_mode()
            time.sleep(0.5)
            print("输入用户名")
            self.__write_username(username)
            time.sleep(2.5)
            print("输入密码")
            self.__write_password(password)
            time.sleep(3.5)
            print("程序模拟解锁")
            if self.__lock_exist():
                self.__unlock()
            print("开始发起登录请求")
            self.__submit()
            time.sleep(4.5)
            # Login succeeded; request the target page directly.
            print("登录成功,跳转至目标页面")
            self.__navigate_to_target_page()
            time.sleep(6.5)
            print("解析页面文本")
            crawler_list = self.__parse_page_content()
            # Connect to the database and store the data.
            print("保存数据到mysql数据库")
            self.__save_list_to_db(crawler_list)
        finally:
            # Always release the browser/driver process, even on failure.
            if self.browser is not None:
                self.browser.quit()
                self.browser = None

    def __is_element_exist(self, selector):
        """Return True if at least one element matches the CSS selector.

        :param selector: CSS selector to look up in the current page
        :return: bool
        """
        # find_elements_* returns an empty list instead of raising.
        return len(self.browser.find_elements_by_css_selector(selector)) > 0

    def __switch_to_password_mode(self):
        """Switch the login form from QR-code mode to password mode.

        :return: None
        """
        if self.browser.find_element_by_id('J_QRCodeLogin').is_displayed():
            self.browser.find_element_by_id('J_Quick2Static').click()

    def __write_username(self, username):
        """Type the account name into the username field.

        :param username: account name
        :return: None
        """
        username_input_element = self.browser.find_element_by_id('TPL_username_1')
        username_input_element.clear()
        username_input_element.send_keys(username)

    def __write_password(self, password):
        """Type the password into the password field.

        :param password: account password
        :return: None
        """
        password_input_element = self.browser.find_element_by_id("TPL_password_1")
        password_input_element.clear()
        password_input_element.send_keys(password)

    def __lock_exist(self):
        """Tell whether the slider verification is present and visible.

        :return: bool
        """
        return (self.__is_element_exist('#nc_1_wrapper')
                and self.browser.find_element_by_id('nc_1_wrapper').is_displayed())

    def __unlock(self):
        """Drag the slider to the right to pass the verification.

        :return: None
        :raises SessionException: if the page shows a slider error message
        """
        bar_element = self.browser.find_element_by_id('nc_1_n1z')
        ActionChains(self.browser).drag_and_drop_by_offset(bar_element, 800, 0).perform()
        time.sleep(1.5)
        # Screenshot is taken after every slide attempt, successful or not.
        self.browser.get_screenshot_as_file('error.png')
        if self.__is_element_exist('.errloading > span'):
            error_message_element = self.browser.find_element_by_css_selector('.errloading > span')
            error_message = error_message_element.text
            # Reset the captcha widget before reporting the failure.
            self.browser.execute_script('noCaptcha.reset(1)')
            raise SessionException('滑动验证失败, message = ' + error_message)

    def __submit(self):
        """Click the login button and check for an error message.

        :return: None
        :raises SessionException: if the page shows a login error message
        """
        self.browser.find_element_by_id('J_SubmitStatic').click()
        time.sleep(0.5)
        if self.__is_element_exist("#J_Message"):
            error_message_element = self.browser.find_element_by_css_selector('#J_Message > p')
            error_message = error_message_element.text
            raise SessionException('登录出错, message = ' + error_message)

    # Navigate to the target page (placeholder).
    def __navigate_to_target_page(self):
        pass

    # Parse the page data (placeholder).
    def __parse_page_content(self):
        pass

    # Save the data (placeholder).
    def __save_list_to_db(self, crawler_list):
        pass

    def __init_browser(self):
        """Create the Chrome browser, configure it and open the start page.

        :return: None
        """
        options = Options()
        # Uncomment to run without a visible browser window.
        # options.add_argument("--headless")
        prefs = {"profile.managed_default_content_settings.images": 1}
        options.add_experimental_option("prefs", prefs)
        # Send all browser traffic through the local proxy.
        options.add_argument('--proxy-server=http://127.0.0.1:9000')
        options.add_argument('disable-infobars')
        options.add_argument('--no-sandbox')
        self.browser = webdriver.Chrome(executable_path=CHROME_DRIVER, options=options)
        self.browser.implicitly_wait(3)
        self.browser.maximize_window()
        self.browser.get(TB_LOGIN_URL)


# Run from the command line.
if __name__ == "__main__":
    Crawler().start('username', 'password')

使用Selenium反爬(美团)
xiaotaomi
会员积分:7300
美团的反爬机制非常完善。在用 Selenium 登录美团的时候,发现美团能检测到自动化操作并弹出滑块,而且无论怎么滑动都无法通过。经过一番搜索后发现,很多网站对 Selenium 都有检测机制,例如检测浏览器中是否存在 Selenium 特有的标识。接下来我们简单分享一下如何使用代理访问美团进行数据采集。
示例如下:
# -*- coding:UTF-8 -*-
import re
import time
from datetime import date, timedelta

from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver import ActionChains
from selenium.webdriver.chrome.options import Options

# First page the browser opens.
TB_LOGIN_URL = 'https://meituan.com'
# Path to the chromedriver binary; the location differs between Windows and macOS.
CHROME_DRIVER = '/usr/local/bin/chromedriver'
class SessionException(Exception):
    """Session error raised when the slider check or the login submit fails."""

    def __init__(self, message):
        # Hand the text (not ``self``) to Exception so ``args`` is (message,).
        super().__init__(message)
        self.message = message

    def __str__(self):
        return self.message


class Crawler:
    """Log in through a Selenium-driven Chrome and run the crawl steps.

    The navigate / parse / save steps are placeholders in this example.

    NOTE(review): the code uses the Selenium 3 style API
    (``find_element_by_*``, ``executable_path=``) -- confirm the installed
    Selenium version still provides it.
    """

    def __init__(self):
        # Set by __init_browser(); None until start() is called.
        self.browser = None

    def start(self, username, password):
        """Run the whole flow: open the browser, log in, crawl, save.

        :param username: account name typed into the login form
        :param password: password typed into the login form
        :raises SessionException: if the slider check or the login fails
        """
        try:
            print("初始化浏览器")
            self.__init_browser()
            print("切换至密码输入框")
            self.__switch_to_password_mode()
            time.sleep(0.5)
            print("输入用户名")
            self.__write_username(username)
            time.sleep(2.5)
            print("输入密码")
            self.__write_password(password)
            time.sleep(3.5)
            print("程序模拟解锁")
            if self.__lock_exist():
                self.__unlock()
            print("开始发起登录请求")
            self.__submit()
            time.sleep(4.5)
            # Login succeeded; request the target page directly.
            print("登录成功,跳转至目标页面")
            self.__navigate_to_target_page()
            time.sleep(6.5)
            print("解析页面文本")
            crawler_list = self.__parse_page_content()
            # Connect to the database and store the data.
            print("保存数据到mysql数据库")
            self.__save_list_to_db(crawler_list)
        finally:
            # Always release the browser/driver process, even on failure.
            if self.browser is not None:
                self.browser.quit()
                self.browser = None

    def __is_element_exist(self, selector):
        """Return True if at least one element matches the CSS selector.

        :param selector: CSS selector to look up in the current page
        :return: bool
        """
        # find_elements_* returns an empty list instead of raising.
        return len(self.browser.find_elements_by_css_selector(selector)) > 0

    def __switch_to_password_mode(self):
        """Switch the login form from QR-code mode to password mode.

        :return: None
        """
        if self.browser.find_element_by_id('J_QRCodeLogin').is_displayed():
            self.browser.find_element_by_id('J_Quick2Static').click()

    def __write_username(self, username):
        """Type the account name into the username field.

        :param username: account name
        :return: None
        """
        username_input_element = self.browser.find_element_by_id('TPL_username_1')
        username_input_element.clear()
        username_input_element.send_keys(username)

    def __write_password(self, password):
        """Type the password into the password field.

        :param password: account password
        :return: None
        """
        password_input_element = self.browser.find_element_by_id("TPL_password_1")
        password_input_element.clear()
        password_input_element.send_keys(password)

    def __lock_exist(self):
        """Tell whether the slider verification is present and visible.

        :return: bool
        """
        return (self.__is_element_exist('#nc_1_wrapper')
                and self.browser.find_element_by_id('nc_1_wrapper').is_displayed())

    def __unlock(self):
        """Drag the slider to the right to pass the verification.

        :return: None
        :raises SessionException: if the page shows a slider error message
        """
        bar_element = self.browser.find_element_by_id('nc_1_n1z')
        ActionChains(self.browser).drag_and_drop_by_offset(bar_element, 800, 0).perform()
        time.sleep(1.5)
        # Screenshot is taken after every slide attempt, successful or not.
        self.browser.get_screenshot_as_file('error.png')
        if self.__is_element_exist('.errloading > span'):
            error_message_element = self.browser.find_element_by_css_selector('.errloading > span')
            error_message = error_message_element.text
            # Reset the captcha widget before reporting the failure.
            self.browser.execute_script('noCaptcha.reset(1)')
            raise SessionException('滑动验证失败, message = ' + error_message)

    def __submit(self):
        """Click the login button and check for an error message.

        :return: None
        :raises SessionException: if the page shows a login error message
        """
        self.browser.find_element_by_id('J_SubmitStatic').click()
        time.sleep(0.5)
        if self.__is_element_exist("#J_Message"):
            error_message_element = self.browser.find_element_by_css_selector('#J_Message > p')
            error_message = error_message_element.text
            raise SessionException('登录出错, message = ' + error_message)

    # Navigate to the target page (placeholder).
    def __navigate_to_target_page(self):
        pass

    # Parse the page data (placeholder).
    def __parse_page_content(self):
        pass

    # Save the data (placeholder).
    def __save_list_to_db(self, crawler_list):
        pass

    def __init_browser(self):
        """Create the Chrome browser, configure it and open the start page.

        :return: None
        """
        options = Options()
        # Uncomment to run without a visible browser window.
        # options.add_argument("--headless")
        prefs = {"profile.managed_default_content_settings.images": 1}
        options.add_experimental_option("prefs", prefs)
        # Send all browser traffic through the local proxy.
        options.add_argument('--proxy-server=http://127.0.0.1:9000')
        options.add_argument('disable-infobars')
        options.add_argument('--no-sandbox')
        self.browser = webdriver.Chrome(executable_path=CHROME_DRIVER, options=options)
        self.browser.implicitly_wait(3)
        self.browser.maximize_window()
        self.browser.get(TB_LOGIN_URL)


# Run from the command line.
if __name__ == "__main__":
    Crawler().start('username', 'password')
21-04-13 17:04

1919

0
回复
暂无评论