修复豆瓣根据排行榜和关键字搜索功能
This commit is contained in:
parent
97f3342fb5
commit
16e15236ae
|
@ -6,17 +6,6 @@
|
|||
这个项目源于大三的某门课程设计。平时经常需要搜索电影,却不知道哪些电影评分高且评价人数多。为了方便使用,就把原来的项目重新改写了一遍,也当作是对爬虫技术和可视化技术的一次实践。程序主要通过排行榜和影片关键词两种方式爬取电影数据。
|
||||
|
||||
|
||||
|
||||
## 配置说明
|
||||
|
||||
1. 打开[http://chromedriver.storage.googleapis.com/index.html][1],根据自己的操作系统下载对应的chromedriver
|
||||
2. 打开当前目录下的**<u>getMovieInRankingList.py</u>**,定位到第59行,将`executable_path=/Users/bird/Desktop/chromedriver.exe`修改成你自己的chromedriver路径
|
||||
3. 打开pycharm,依次安装以下包
|
||||
- pip install Pillow
|
||||
- pip install selenium
|
||||
|
||||
|
||||
|
||||
## 功能截图
|
||||
|
||||
![](example_rating.png)
|
||||
|
@ -24,6 +13,14 @@
|
|||
![](example_keyword.png)
|
||||
|
||||
|
||||
## 如何运行
|
||||
|
||||
1. 打开Chrome浏览器,在网址栏输入chrome://version/查询当前Chrome版本
|
||||
2. 打开[http://chromedriver.storage.googleapis.com/index.html][1],下载对应版本的chromedriver驱动,**下载完成后务必解压**
|
||||
3. 打开当前目录下的文件`getMovieInRankingList.py`,定位到第`59行`,将`executable_path=./chromedriver.exe`修改为你的chromedriver驱动路径
|
||||
4. 执行命令`pip install -r requirement.txt`安装程序所需的依赖包
|
||||
5. 执行命令`python main.py`运行程序
|
||||
|
||||
|
||||
## 包含功能
|
||||
|
||||
|
@ -38,7 +35,7 @@
|
|||
|
||||
## 存在问题
|
||||
|
||||
目前没有加入反爬虫策略,如果运行出现403 forbidden提示,则说明暂时被禁止,解决方式如下:
|
||||
目前没有加入反爬虫策略,如果运行出现403 forbidden提示,则说明暂时被禁止使用,解决方式如下:
|
||||
|
||||
- 加入cookies
|
||||
- 采用随机延时方式
|
||||
|
|
Binary file not shown.
|
@ -43,116 +43,141 @@ movieData = ' [' \
|
|||
' {"title":"黑色电影", "type":"31", "interval_id":"100:90"}' \
|
||||
']'
|
||||
|
||||
def get_url_data_in_ranking_list(typeId, movie_count, rating, vote_count):
|
||||
"""
|
||||
从排行榜中获取电影数据
|
||||
typeId 电影类型, movie_count 获取的该电影类型的数量, rating 电影的评分, vote_count 电影的评价人数
|
||||
|
||||
class getMovieInRankingList:
|
||||
:param typeId:
|
||||
:param movie_count:
|
||||
:param rating:
|
||||
:param vote_count:
|
||||
:return:
|
||||
"""
|
||||
|
||||
# typeId 电影类型, movie_count 欲获取的该电影类型的数量, rating 电影的评分, vote_count 电影的评价人数
|
||||
def __init__(self):
|
||||
chrome_options = Options()
|
||||
chrome_options.add_argument('--headless') # 设置为无头模式,即不显示浏览器
|
||||
chrome_options.add_argument(
|
||||
'user-agent="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36"') # 设置user=agent
|
||||
chrome_options.add_experimental_option('excludeSwitches',['enable-automation']) # 此步骤很重要,设置为开发者模式,防止被各大网站识别出来使用了Selenium
|
||||
chrome_options.add_experimental_option("prefs",{"profile.managed_default_content_settings.images": 2}) # 不加载图片,加快访问速度
|
||||
|
||||
try:
|
||||
self.browser = webdriver.Chrome(executable_path='/Users/bird/Desktop/chromedriver.exe',chrome_options=chrome_options) # 设置chromedriver路径
|
||||
self.wait = WebDriverWait(self.browser, 10) # 超时时长为10s
|
||||
except:
|
||||
print("chromedriver.exe出错,请检查是否与你的chrome浏览器版本相匹配\n缺失chromedriver.exe不会导致从排行榜搜索功能失效,但会导致从关键字搜索功能失效")
|
||||
|
||||
# 从排行榜中获取电影数据
|
||||
def get_url_data_in_ranking_list(self, typeId, movie_count, rating, vote_count):
|
||||
try:
|
||||
context = _create_unverified_context() # 屏蔽ssl证书
|
||||
headers = {
|
||||
'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36', }
|
||||
url = 'https://movie.douban.com/j/chart/top_list?type=' + str(
|
||||
typeId) + '&interval_id=100:90&action=unwatched&start=0&limit=' + str(movie_count)
|
||||
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36'}
|
||||
url = 'https://movie.douban.com/j/chart/top_list?type=' + str(typeId) + '&interval_id=100:90&action=unwatched&start=0&limit=' + str(movie_count)
|
||||
req = urllib.request.Request(url=url, headers=headers)
|
||||
f = urllib.request.urlopen(req, context=context)
|
||||
response = f.read()
|
||||
jsonData = loads(response) # 将json转为python对象
|
||||
|
||||
list = []
|
||||
res_list = []
|
||||
for subData in jsonData: # 依次对每部电影进行操作
|
||||
if ((float(subData['rating'][0]) >= float(rating)) and (float(subData['vote_count']) >= float(vote_count))):
|
||||
subList = []
|
||||
subList.append(subData['title'])
|
||||
subList.append(subData['rating'][0])
|
||||
subList.append(subData['rank'])
|
||||
subList.append(subData['vote_count'])
|
||||
list.append(subList)
|
||||
if (float(subData['rating'][0]) >= float(rating)) and (float(subData['vote_count']) >= float(vote_count)):
|
||||
sub_list= []
|
||||
sub_list.append(subData['title'])
|
||||
sub_list.append(subData['rating'][0])
|
||||
sub_list.append(subData['rank'])
|
||||
sub_list.append(subData['vote_count'])
|
||||
res_list.append(sub_list)
|
||||
|
||||
for data in list:
|
||||
for data in res_list:
|
||||
print(data)
|
||||
|
||||
return list, jsonData
|
||||
return [res_list, jsonData]
|
||||
|
||||
# 从关键字获取电影数据
|
||||
def get_url_data_in_keyWord(self, key_word):
|
||||
|
||||
# 浏览网页
|
||||
self.browser.get('https://movie.douban.com/subject_search?search_text=' + urllib.parse.quote(
|
||||
key_word) + '&cat=1002') # get方式获取返回数据
|
||||
|
||||
# js动态渲染的网页,必须等到搜索结果元素(DIV中class=root)出来后,才可以停止加载网页
|
||||
# 等待DIV中class=root的元素出现
|
||||
self.wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, '.root')))
|
||||
|
||||
dr = self.browser.find_elements_by_xpath("//div[@class='item-root']") # 获取class为item-root的DIV(因为有多个结果)
|
||||
jsonData = []
|
||||
list = []
|
||||
for son in dr:
|
||||
movieData = {'rating': ['', 'null'], 'cover_url': '', 'types': '', 'title': '', 'url': '',
|
||||
'release_date': '', 'vote_count': '', 'actors': ''}
|
||||
subList = ['', '', '', '']
|
||||
|
||||
url_element = son.find_elements_by_xpath(".//a") # 获取第一个a标签的url(因为有多个结果)
|
||||
if (url_element):
|
||||
movieData['url'] = (url_element[0].get_attribute("href"))
|
||||
|
||||
img_url_element = url_element[0].find_elements_by_xpath(".//img") # 获取影片海报图片地址
|
||||
if (img_url_element):
|
||||
movieData['cover_url'] = (img_url_element[0].get_attribute("src"))
|
||||
|
||||
title_element = son.find_elements_by_xpath(".//div[@class='title']") # 获取标题
|
||||
if (title_element):
|
||||
temp_title = (title_element[0].text)
|
||||
movieData['title'] = (temp_title.split('('))[0]
|
||||
movieData['release_date'] = temp_title[temp_title.find('(') + 1:temp_title.find(')')]
|
||||
subList[0] = movieData['title']
|
||||
|
||||
rating_element = son.find_elements_by_xpath(".//span[@class='rating_nums']") # 获取评分
|
||||
if (rating_element):
|
||||
movieData['rating'][0] = (rating_element[0].text)
|
||||
subList[1] = movieData['rating'][0]
|
||||
|
||||
vote_element = son.find_elements_by_xpath(".//span[@class='pl']") # 获取数量
|
||||
if (vote_element):
|
||||
movieData['vote_count'] = (vote_element[0].text).replace('(', '').replace(')', '').replace('人评价', '')
|
||||
subList[3] = movieData['vote_count']
|
||||
|
||||
type_element = son.find_elements_by_xpath(".//div[@class='meta abstract']") # 获取类型
|
||||
if (type_element):
|
||||
movieData['types'] = (type_element[0].text)
|
||||
subList[2] = movieData['types']
|
||||
|
||||
actors_element = son.find_elements_by_xpath(".//div[@class='meta abstract_2']") # 获取演员
|
||||
if (actors_element):
|
||||
movieData['actors'] = (actors_element[0].text)
|
||||
|
||||
jsonData.append(movieData)
|
||||
list.append(subList)
|
||||
|
||||
# 关闭浏览器
|
||||
self.browser.quit()
|
||||
|
||||
for data in list:
|
||||
print(data)
|
||||
|
||||
return list, jsonData
|
||||
except Exception as ex:
|
||||
err_str = "出现未知异常:{}".format(ex)
|
||||
return [err_str]
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
def get_url_data_in_keyWord(key_word):
    """
    Search Douban movies by keyword using a headless Chrome browser.

    The search result page is rendered by JavaScript, so the page is loaded
    through Selenium and parsed only after the ``.root`` element is present.

    :param key_word: keyword to search for; it is URL-quoted before use.
    :return: ``[res_list, jsonData]`` on success, where ``res_list`` holds one
        ``[title, rating, types, vote_count]`` row per result and ``jsonData``
        holds one dict per result (keys: rating, cover_url, types, title, url,
        release_date, vote_count, actors). On failure a single-element list
        ``[err_str]`` containing a human-readable error message is returned.
    """
    chrome_options = Options()
    chrome_options.add_argument('--headless')  # headless mode: no visible browser window
    chrome_options.add_argument('user-agent="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36"')  # set the user agent
    # Drop the "enable-automation" switch so sites are less likely to detect Selenium.
    chrome_options.add_experimental_option('excludeSwitches', ['enable-automation'])
    # Do not load images, to speed up page loading.
    chrome_options.add_experimental_option("prefs", {"profile.managed_default_content_settings.images": 2})

    browser = None
    try:
        browser = webdriver.Chrome(executable_path='./chromedriver', chrome_options=chrome_options)  # chromedriver path
        browser.set_page_load_timeout(10)  # page load timeout: 10s
        browser.set_script_timeout(10)  # script execution timeout: 10s
        wait = WebDriverWait(browser, 10)  # explicit-wait timeout: 10s
    except Exception as ex:
        # If Chrome started but a later setup call failed, do not leave the
        # browser process running.
        if browser is not None:
            browser.quit()
        err_str = "加载chromedriver驱动失败,请下载chromedriver驱动并填写正确的路径。\n\n异常信息:{}".format(ex)
        return [err_str]

    try:
        browser.get('https://movie.douban.com/subject_search?search_text=' + urllib.parse.quote(key_word) + '&cat=1002')
        # Wait until the element with class "root" appears, i.e. the
        # JavaScript-rendered search results exist in the DOM.
        wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, '.root')))

        # One div with class "item-root" per search result.
        items = browser.find_elements(by=By.XPATH, value="//div[@class='item-root']")
        jsonData = []
        res_list = []
        for son in items:
            # Named movie_info (not movieData) to avoid shadowing the
            # module-level movieData string.
            movie_info = {'rating': ['', 'null'], 'cover_url': '', 'types': '', 'title': '', 'url': '', 'release_date': '', 'vote_count': '', 'actors': ''}
            sub_list = ['', '', '', '']  # [title, rating, types, vote_count]

            # The first <a> of the item carries the detail URL and the poster image.
            url_element = son.find_elements(by=By.XPATH, value=".//a")
            if url_element:
                movie_info['url'] = url_element[0].get_attribute("href")

                img_url_element = url_element[0].find_elements(by=By.XPATH, value=".//img")
                if img_url_element:
                    movie_info['cover_url'] = img_url_element[0].get_attribute("src")

            # The title text is split at the first "(": the part before it is
            # the title, the parenthesised part is used as the release date.
            title_element = son.find_elements(by=By.XPATH, value=".//div[@class='title']")
            if title_element:
                temp_title = title_element[0].text
                movie_info['title'] = temp_title.split('(')[0]
                open_idx = temp_title.find('(')
                close_idx = temp_title.find(')', open_idx + 1)
                # Only slice when both parentheses exist; str.find returns -1
                # otherwise, which would produce a garbage slice.
                if open_idx != -1 and close_idx != -1:
                    movie_info['release_date'] = temp_title[open_idx + 1:close_idx]
                sub_list[0] = movie_info['title']

            rating_element = son.find_elements(by=By.XPATH, value=".//span[@class='rating_nums']")
            if rating_element:
                movie_info['rating'][0] = rating_element[0].text
                sub_list[1] = movie_info['rating'][0]

            # Vote count: strip the parentheses and the "人评价" suffix from the text.
            vote_element = son.find_elements(by=By.XPATH, value=".//span[@class='pl']")
            if vote_element:
                movie_info['vote_count'] = vote_element[0].text.replace('(', '').replace(')', '').replace('人评价', '')
                sub_list[3] = movie_info['vote_count']

            type_element = son.find_elements(by=By.XPATH, value=".//div[@class='meta abstract']")
            if type_element:
                movie_info['types'] = type_element[0].text
                sub_list[2] = movie_info['types']

            actors_element = son.find_elements(by=By.XPATH, value=".//div[@class='meta abstract_2']")
            if actors_element:
                movie_info['actors'] = actors_element[0].text

            jsonData.append(movie_info)
            res_list.append(sub_list)

        for data in res_list:
            print(data)

        return [res_list, jsonData]

    except Exception as ex:
        err_str = "chromedriver驱动加载成功,但是出现其他未知异常:{}".format(ex)
        return [err_str]

    finally:
        browser.quit()  # always close the browser, on success and on failure
|
||||
|
|
|
@ -0,0 +1,2 @@
|
|||
Pillow
|
||||
selenium
|
|
@ -1,7 +1,9 @@
|
|||
# -*- coding:utf-8 -*-
|
||||
|
||||
from PIL import Image, ImageTk
|
||||
from getMovieInRankingList import *
|
||||
from getMovieInRankingList import movieData
|
||||
from getMovieInRankingList import get_url_data_in_ranking_list
|
||||
from getMovieInRankingList import get_url_data_in_keyWord
|
||||
from tkinter import Tk
|
||||
from tkinter import ttk
|
||||
from tkinter import font
|
||||
|
@ -18,6 +20,7 @@ from tkinter import NS
|
|||
from tkinter import NW
|
||||
from tkinter import N
|
||||
from tkinter import Y
|
||||
from tkinter import messagebox
|
||||
from tkinter import DISABLED
|
||||
from tkinter import NORMAL
|
||||
from re import findall
|
||||
|
@ -25,6 +28,7 @@ from json import loads
|
|||
from threading import Thread
|
||||
from urllib.parse import quote
|
||||
from webbrowser import open
|
||||
import urllib
|
||||
import os
|
||||
import ssl
|
||||
ssl._create_default_https_context = ssl._create_unverified_context #关闭SSL证书验证
|
||||
|
@ -32,7 +36,6 @@ ssl._create_default_https_context = ssl._create_unverified_context #关闭SSL证
|
|||
|
||||
|
||||
|
||||
|
||||
def thread_it(func, *args):
|
||||
'''
|
||||
将函数打包进线程
|
||||
|
@ -301,7 +304,7 @@ class uiObject:
|
|||
self.clear_tree(self.treeview_bt_download)
|
||||
|
||||
item = self.treeview.selection()
|
||||
if(item):
|
||||
if item:
|
||||
item_text = self.treeview.item(item, "values")
|
||||
movieName = item_text[0] # 输出电影名
|
||||
for movie in self.jsonData:
|
||||
|
@ -402,12 +405,20 @@ class uiObject:
|
|||
jsonMovieData = loads(movieData)
|
||||
for subMovieData in jsonMovieData:
|
||||
if(subMovieData['title'] == self.C_type.get()):
|
||||
res_data = get_url_data_in_ranking_list(subMovieData['type'], self.T_count.get(), self.T_rating.get(), self.T_vote.get()) # 返回符合条件的电影信息
|
||||
if len(res_data) == 2:
|
||||
# 获取数据成功
|
||||
res_list = res_data[0]
|
||||
jsonData = res_data[1]
|
||||
|
||||
self.jsonData = jsonData
|
||||
self.add_tree(res_list, self.treeview) # 将数据添加到tree中
|
||||
|
||||
else:
|
||||
# 获取数据失败,出现异常
|
||||
err_str = res_data[0]
|
||||
messagebox.showinfo('提示', err_str[:1000])
|
||||
|
||||
movieObject = getMovieInRankingList() #创建对象
|
||||
list,jsonData = movieObject.get_url_data_in_ranking_list(subMovieData['type'], self.T_count.get(), self.T_rating.get(), self.T_vote.get()) # 返回符合条件的电影信息
|
||||
self.jsonData = jsonData
|
||||
self.add_tree(list, self.treeview) # 将数据添加到tree中
|
||||
break
|
||||
|
||||
# 按钮设置为正常状态
|
||||
self.B_0['state'] = NORMAL
|
||||
|
@ -423,6 +434,11 @@ class uiObject:
|
|||
|
||||
|
||||
def keyboard_T_vote_keyword(self, event):
|
||||
"""
|
||||
在搜索框中键入回车键后触发相应的事件
|
||||
:param event:
|
||||
:return:
|
||||
"""
|
||||
thread_it(self.searh_movie_in_keyword)
|
||||
|
||||
def searh_movie_in_keyword(self):
|
||||
|
@ -442,12 +458,19 @@ class uiObject:
|
|||
self.jsonData = ""
|
||||
|
||||
|
||||
res_data = get_url_data_in_keyWord(self.T_vote_keyword.get())
|
||||
if len(res_data) == 2:
|
||||
# 获取数据成功
|
||||
res_list = res_data[0]
|
||||
jsonData = res_data[1]
|
||||
|
||||
self.jsonData = jsonData
|
||||
self.add_tree(res_list, self.treeview) # 将数据添加到tree中
|
||||
else:
|
||||
# 获取数据失败,出现异常
|
||||
err_str = res_data[0]
|
||||
messagebox.showinfo('提示', err_str[:1000])
|
||||
|
||||
movieObject = getMovieInRankingList(); #创建对象
|
||||
list,jsonData = movieObject.get_url_data_in_keyWord(self.T_vote_keyword.get())
|
||||
self.jsonData = jsonData
|
||||
self.add_tree(list, self.treeview) # 将数据添加到tree中
|
||||
|
||||
|
||||
|
||||
|
@ -470,7 +493,7 @@ class uiObject:
|
|||
:return:
|
||||
"""
|
||||
item = self.treeview.selection()
|
||||
if(item):
|
||||
if item:
|
||||
item_text = self.treeview.item(item, "values")
|
||||
movieName = item_text[0]
|
||||
for movie in self.jsonData:
|
||||
|
@ -527,7 +550,7 @@ class uiObject:
|
|||
root = Tk()
|
||||
self.root = root
|
||||
# 设置窗口位置
|
||||
root.title("豆瓣电影小助手(可筛选、下载自定义电影)----吾爱破解论坛 www.52pojie.cn")
|
||||
root.title("豆瓣电影小助手(可筛选、下载自定义电影)")
|
||||
self.center_window(root, 1000, 565)
|
||||
root.resizable(0, 0) # 框体大小可调性,分别表示x,y方向的可变性
|
||||
|
||||
|
@ -565,7 +588,7 @@ class uiObject:
|
|||
# 文本框
|
||||
T_count = Entry(labelframe, width=5)
|
||||
T_count.delete(0, END)
|
||||
T_count.insert(0, '500')
|
||||
T_count.insert(0, '100')
|
||||
T_count.place(x=220, y=7)
|
||||
self.T_count = T_count
|
||||
|
||||
|
@ -906,7 +929,7 @@ class uiObject:
|
|||
|
||||
#项目的一些信息
|
||||
ft = font.Font(size=14, weight=font.BOLD)
|
||||
project_statement = Label(root, text="豆瓣电影小助手(可筛选、下载自定义电影)----吾爱破解论坛 www.52pojie.cn", fg='#FF0000', font=ft,anchor=NW)
|
||||
project_statement = Label(root, text="豆瓣电影小助手(可筛选、下载自定义电影)", fg='#FF0000', font=ft,anchor=NW)
|
||||
project_statement.place(x=5, y=540)
|
||||
self.project_statement = project_statement
|
||||
|
||||
|
|
Loading…
Reference in New Issue