修复imdb评分接口失效等问题

This commit is contained in:
sqzhang 2022-03-18 16:12:18 +08:00
parent 16e15236ae
commit eb39dd76ad
6 changed files with 110 additions and 141 deletions

View File

@ -10,14 +10,12 @@
![](example_rating.png)
![](example_keyword.png)
## 如何运行
1. 打开Chrome浏览器，在网址栏输入`chrome://version/`，查询当前Chrome版本
2. 打开[http://chromedriver.storage.googleapis.com/index.html][1]，下载对应版本的chromedriver驱动，**下载完成后务必解压**
3. 打开当前目录下的文件`getMovieInRankingList.py`,定位到第`59行`,将`executable_path=./chromedriver.exe`修改为你的chromedriver驱动路径
3. 打开当前目录下的文件`getMovieInRankingList.py`,定位到第`107行`,将`executable_path=./chromedriver.exe`修改为你的chromedriver驱动路径
4. 执行命令`pip install -r requirement.txt`安装程序所需的依赖包
5. 执行命令`python main.py`运行程序
@ -33,15 +31,6 @@
- [ ] 等待更新
## 存在问题
目前没有加入反爬虫策略，如果运行出现403 forbidden提示，则说明暂时被禁止使用，解决方式如下：
- 加入cookies
- 采用随机延时方式
- 采用IP代理池方式(较不稳定)
## 补充
项目持续更新,欢迎您[star本项目][5]

Binary file not shown.

Before

Width:  |  Height:  |  Size: 348 KiB

View File

@ -104,7 +104,7 @@ def get_url_data_in_keyWord(key_word):
browser = None
wait = None
try:
browser = webdriver.Chrome(executable_path='./chromedriver', chrome_options=chrome_options) # 设置chromedriver驱动路径
browser = webdriver.Chrome(executable_path='./chromedriver.exe', chrome_options=chrome_options) # 设置chromedriver驱动路径
browser.set_page_load_timeout(10) # 页面加载超时时间为10s
browser.set_script_timeout(10) # 页面js加载超时时间为10s

View File

@ -25,6 +25,7 @@ from tkinter import DISABLED
from tkinter import NORMAL
from re import findall
from json import loads
from ssl import _create_unverified_context
from threading import Thread
from urllib.parse import quote
from webbrowser import open
@ -105,12 +106,20 @@ def resize(w_box, h_box, pil_image):
return pil_image.resize((width, height), Image.ANTIALIAS)
def get_mid_str(content, startStr, endStr):
    """Return the text of ``content`` that lies between two marker strings.

    Args:
        content: The string to search.
        startStr: Marker that precedes the wanted text; its first
            occurrence in ``content`` is used.
        endStr: Marker that follows the wanted text; the first occurrence
            located after ``startStr`` is used.

    Returns:
        The substring between the two markers, or ``""`` when either
        marker cannot be found.
    """
    # str.find reports a missing marker as -1 (str.index would raise
    # ValueError), so an absent marker yields "" instead of an exception.
    startIndex = content.find(startStr)
    if startIndex < 0:
        return ""
    startIndex += len(startStr)

    # Look for the end marker only after the start marker, so an earlier
    # occurrence of endStr cannot produce an empty or reversed slice.
    endIndex = content.find(endStr, startIndex)
    if endIndex < 0:
        return ""
    return content[startIndex:endIndex]
class uiObject:
@ -152,133 +161,109 @@ class uiObject:
self.label_movie_rating_imdb.config(text='正在加载IMDB评分')
self.B_0_imdb['state'] = DISABLED
rating_imdb = '未知'
item = self.treeview.selection()
if(item):
if item:
item_text = self.treeview.item(item, "values")
movieName = item_text[0] # 输出电影名
movieName = item_text[0] # 输出电影名
for movie in self.jsonData:
if(movie['title'] == movieName):
f = urllib.request.urlopen(movie['url'])
response = (f.read()).decode()
url_imdb = get_mid_str(response, 'IMDb链接:</span> <a href=\"', '\" target=\"_blank\" rel=\"nofollow\">')
f = urllib.request.urlopen(url_imdb)
data_imdb = (f.read()).decode()
rating_imdb = get_mid_str(data_imdb, '<span class=\"rating">', '<span class=\"ofTen\">')
if movie['title'] == movieName:
context = _create_unverified_context() # 屏蔽ssl证书
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36'}
req = urllib.request.Request(url=movie['url'], headers=headers)
f = urllib.request.urlopen(req, context=context)
response = f.read().decode()
self.clear_tree(self.treeview_play_online)
s = response
name = findall(r"<a class=\"playBtn\" data-cn=\"(.*?)\" href=\"", s)
down_url = findall(r"data-cn=\".*?\" href=\"(.*?)\" target=", s)
real_movie_name = get_mid_str(s, "<title>", "</title>").replace(" ","").replace("\n","").replace("(豆瓣)","")
print(real_movie_name)
list = []
name = findall(r'<a class="playBtn" data-cn="(.*?)" data-impression-track', s)
down_url = findall(r'data-cn=".*?" href="(.*?)" target=', s)
res_list = []
for i in range(len(name)):
list.append([name[i], "限VIP免费", down_url[i]])
list.append(["4K屋", "免费", "http://www.4kwu.cc/?m=vod-search&wd=" + quote(real_movie_name)])
list.append(["91黑米", "免费", "http://www.91heimi.com/index.php/vod/search.html?wd=" + quote(real_movie_name)])
list.append(["AAQQS", "免费", "http://aaxxy.com/vod-search-pg-1-wd-" + quote(real_movie_name) + ".html"])
list.append(["Neets", "免费", "http://neets.cc/search?key=" + quote(real_movie_name)])
list.append(["Q2电影网", "免费", "http://www.q2002.com/search?wd=" + quote(real_movie_name)])
list.append(["霸气村", "免费", "http://www.baqicun.co/search.php?searchword=" + quote(real_movie_name)])
list.append(["魔力电影网", "免费", "http://www.magbt.net/search.php?searchword=" + quote(real_movie_name)])
list.append(["新论语", "免费", "http://www.honggujiu.net/index.php?m=vod-search&wd=" + quote(real_movie_name)])
list.append(["左手吃斋", "免费", "https://www.xiangkanju.com/index.php?m=vod-search&wd=" + quote(real_movie_name)])
self.add_tree(list, self.treeview_play_online)
res_list.append([name[i], "限VIP免费", down_url[i]])
self.add_tree(res_list, self.treeview_play_online)
self.clear_tree(self.treeview_save_cloud_disk)
list = []
list.append(["56网盘搜索", "有效", "https://www.56wangpan.com/search/o2kw" + quote(real_movie_name)])
list.append(["爱搜资源", "有效", "https://www.aisouziyuan.com/?name=" + quote(real_movie_name) + "&page=1"])
list.append(["盘多多", "有效", "http://www.panduoduo.net/s/comb/n-" + quote(real_movie_name) + "&f-f4"])
list.append(["小白盘", "有效", "https://www.xiaobaipan.com/list-" + quote(real_movie_name) + "-1.html" ])
list.append(["云盘精灵", "有效", "https://www.yunpanjingling.com/search/" + quote(real_movie_name) + "?sort=size.desc"])
self.add_tree(list, self.treeview_save_cloud_disk)
res_list = []
res_list.append(["56网盘搜索", "有效", "https://www.56wangpan.com/search/o2kw" + quote(movie['title'])])
res_list.append(["爱搜资源", "有效", "https://www.aisouziyuan.com/?name=" + quote(movie['title']) + "&page=1"])
res_list.append(["盘多多", "有效", "http://www.panduoduo.net/s/comb/n-" + quote(movie['title']) + "&f-f4"])
res_list.append(["小白盘", "有效", "https://www.xiaobaipan.com/list-" + quote(movie['title']) + "-1.html" ])
res_list.append(["云盘精灵", "有效", "https://www.yunpanjingling.com/search/" + quote(movie['title']) + "?sort=size.desc"])
self.add_tree(res_list, self.treeview_save_cloud_disk)
self.clear_tree(self.treeview_bt_download)
list = []
list.append( ['19影视', '有效', 'https://www.19kan.com/vodsearch.html?wd=' + quote(real_movie_name) ])
list.append( ['2TU影院', '有效', 'http://www.82tu.cc/search.php?submit=%E6%90%9C+%E7%B4%A2&searchword=' + quote(real_movie_name) ])
list.append( ['4K电影', '有效', 'https://www.dygc.org/?s=' + quote(real_movie_name) ])
list.append( ['52 Movie', '有效', 'http://www.52movieba.com/search.htm?keyword=' + quote(real_movie_name) ])
list.append( ['592美剧', '有效', 'http://www.592meiju.com/search/?wd=' + quote(real_movie_name) ])
list.append( ['97电影网', '有效', 'http://www.55xia.com/search?q=' + quote(real_movie_name) ])
list.append( ['98TVS', '有效', 'http://www.98tvs.com/?s=' + quote(real_movie_name) ])
list.append( ['9去这里', '有效', 'http://9qzl.com/index.php?s=/video/search/wd/' + quote(real_movie_name) ])
list.append( ['CK电影', '有效', 'http://www.ck180.net/search.html?q=' + quote(real_movie_name) ])
list.append( ['LOL电影', '有效', 'http://www.993dy.com/index.php?m=vod-search&wd=' + quote(real_movie_name) ])
list.append( ['MP4Vv', '有效', 'http://www.mp4pa.com/search.php?searchword=' + quote(real_movie_name) ])
list.append( ['MP4电影', '有效', 'http://www.domp4.com/search/' + quote(real_movie_name) + '-1.html'])
list.append( ['TL95', '有效', 'http://www.tl95.com/?s=' + quote(real_movie_name) ])
list.append( ['比特大雄', '有效', 'https://www.btdx8.com/?s=' + quote(real_movie_name) ])
list.append( ['比特影视', '有效', 'https://www.bteye.com/search/' + quote(real_movie_name) ])
list.append( ['春晓影视', '有效', 'http://search.chunxiao.tv/?keyword=' + quote(real_movie_name) ])
list.append( ['第一电影网', '有效', 'https://www.001d.com/?s=' + quote(real_movie_name) ])
list.append( ['电影日志', '有效', 'http://www.dyrizhi.com/search?s=' + quote(real_movie_name) ])
list.append( ['高清888', '有效', 'https://www.gaoqing888.com/search?kw=' + quote(real_movie_name) ])
list.append( ['高清MP4', '有效', 'http://www.mp4ba.com/index.php?m=vod-search&wd=' + quote(real_movie_name) ])
list.append( ['高清电台', '有效', 'https://gaoqing.fm/s.php?q=' + quote(real_movie_name) ])
list.append( ['高清控', '有效', 'http://www.gaoqingkong.com/?s=' + quote(real_movie_name) ])
list.append( ['界绍部', '有效', 'http://www.jsb456.com/?s=' + quote(real_movie_name) ])
list.append( ['看美剧', '有效', 'http://www.kanmeiju.net/index.php?s=/video/search/wd/' + quote(real_movie_name) ])
list.append( ['蓝光网', '有效', 'http://www.languang.co/?s=' + quote(real_movie_name) ])
list.append( ['老司机电影', '有效', 'http://www.lsjdyw.net/search/?s=' + quote(real_movie_name) ])
list.append( ["乐赏电影", '有效', 'http://www.gscq.me/search.htm?keyword=' + quote(real_movie_name) ])
list.append( ["美剧汇", '有效', 'http://www.meijuhui.net/search.php?q=' + quote(real_movie_name) ])
list.append( ['美剧鸟', '有效', 'http://www.meijuniao.com/index.php?s=vod-search-wd-' + quote(real_movie_name) ])
list.append( ['迷你MP4', '有效', 'http://www.minimp4.com/search?q=' + quote(real_movie_name) ])
list.append( ['泡饭影视', '有效', 'http://www.chapaofan.com/search/' + quote(real_movie_name) ])
list.append( ['片吧', '有效', 'http://so.pianbar.com/search.aspx?q=' + quote(real_movie_name) ])
list.append( ['片源网', '有效', 'http://pianyuan.net/search?q=' + quote(real_movie_name) ])
list.append( ['飘花资源网', '有效', 'https://www.piaohua.com/plus/search.php?kwtype=0&keyword=' + quote(real_movie_name) ])
list.append( ['趣味源', '有效', 'http://quweiyuan.cc/?s=' + quote(real_movie_name) ])
list.append( ['人生05', '有效', 'http://www.rs05.com/search.php?s=' + quote(real_movie_name) ])
list.append( ['贪玩影视', '有效', 'http://www.tanwanyingshi.com/movie/search?keyword=' + quote(real_movie_name) ])
list.append( ['新片网', '有效', 'http://www.91xinpian.com/index.php?m=vod-search&wd=' + quote(real_movie_name) ])
list.append( ['迅雷影天堂', '有效', 'https://www.xl720.com/?s=' + quote(real_movie_name) ])
list.append( ['迅影网', '有效', 'http://www.xunyingwang.com/search?q=' + quote(real_movie_name) ])
list.append( ['一只大榴莲', '有效', 'http://www.llduang.com/?s=' + quote(real_movie_name) ])
list.append( ['音范丝', '有效', 'http://www.yinfans.com/?s=' + quote(real_movie_name) ])
list.append( ['影海', '有效', 'http://www.yinghub.com/search/list.html?keyword=' + quote(real_movie_name) ])
list.append( ['影视看看', '有效', 'http://www.yskk.tv/index.php?m=vod-search&wd=' + quote(real_movie_name) ])
list.append( ['云播网', '有效', 'http://www.yunbowang.cn/index.php?m=vod-search&wd=' + quote(real_movie_name) ])
list.append( ['中国高清网', '有效', 'http://gaoqing.la/?s=' + quote(real_movie_name) ])
list.append( ['最新影视站', '有效', 'http://www.zxysz.com/?s=' + quote(real_movie_name) ])
self.add_tree(list, self.treeview_bt_download)
res_list = []
res_list.append(['19影视', '有效', 'https://www.19kan.com/vodsearch.html?wd=' + quote(movie['title'])])
res_list.append(['2TU影院', '有效', 'http://www.82tu.cc/search.php?submit=%E6%90%9C+%E7%B4%A2&searchword=' + quote(movie['title'])])
res_list.append(['4K电影', '有效', 'https://www.dygc.org/?s=' + quote(movie['title'])])
res_list.append(['52 Movie', '有效', 'http://www.52movieba.com/search.htm?keyword=' + quote(movie['title'])])
res_list.append(['592美剧', '有效', 'http://www.592meiju.com/search/?wd=' + quote(movie['title'])])
res_list.append(['97电影网', '有效', 'http://www.55xia.com/search?q=' + quote(movie['title'])])
res_list.append(['98TVS', '有效', 'http://www.98tvs.com/?s=' + quote(movie['title'])])
res_list.append(['9去这里', '有效', 'http://9qzl.com/index.php?s=/video/search/wd/' + quote(movie['title'])])
res_list.append(['CK电影', '有效', 'http://www.ck180.net/search.html?q=' + quote(movie['title'])])
res_list.append(['LOL电影', '有效', 'http://www.993dy.com/index.php?m=vod-search&wd=' + quote(movie['title'])])
res_list.append(['MP4Vv', '有效', 'http://www.mp4pa.com/search.php?searchword=' + quote(movie['title'])])
res_list.append(['MP4电影', '有效', 'http://www.domp4.com/search/' + quote(movie['title']) + '-1.html'])
res_list.append(['TL95', '有效', 'http://www.tl95.com/?s=' + quote(movie['title'])])
res_list.append(['比特大雄', '有效', 'https://www.btdx8.com/?s=' + quote(movie['title'])])
res_list.append(['比特影视', '有效', 'https://www.bteye.com/search/' + quote(movie['title'])])
res_list.append(['春晓影视', '有效', 'http://search.chunxiao.tv/?keyword=' + quote(movie['title'])])
res_list.append(['第一电影网', '有效', 'https://www.001d.com/?s=' + quote(movie['title'])])
res_list.append(['电影日志', '有效', 'http://www.dyrizhi.com/search?s=' + quote(movie['title'])])
res_list.append(['高清888', '有效', 'https://www.gaoqing888.com/search?kw=' + quote(movie['title'])])
res_list.append(['高清MP4', '有效', 'http://www.mp4ba.com/index.php?m=vod-search&wd=' + quote(movie['title'])])
res_list.append(['高清电台', '有效', 'https://gaoqing.fm/s.php?q=' + quote(movie['title'])])
res_list.append(['高清控', '有效', 'http://www.gaoqingkong.com/?s=' + quote(movie['title'])])
res_list.append(['界绍部', '有效', 'http://www.jsb456.com/?s=' + quote(movie['title'])])
res_list.append(['看美剧', '有效', 'http://www.kanmeiju.net/index.php?s=/video/search/wd/' + quote(movie['title'])])
res_list.append(['蓝光网', '有效', 'http://www.languang.co/?s=' + quote(movie['title'])])
res_list.append(['老司机电影', '有效', 'http://www.lsjdyw.net/search/?s=' + quote(movie['title'])])
res_list.append(["乐赏电影", '有效', 'http://www.gscq.me/search.htm?keyword=' + quote(movie['title'])])
res_list.append(["美剧汇", '有效', 'http://www.meijuhui.net/search.php?q=' + quote(movie['title'])])
res_list.append(['美剧鸟', '有效', 'http://www.meijuniao.com/index.php?s=vod-search-wd-' + quote(movie['title'])])
res_list.append(['迷你MP4', '有效', 'http://www.minimp4.com/search?q=' + quote(movie['title'])])
res_list.append(['泡饭影视', '有效', 'http://www.chapaofan.com/search/' + quote(movie['title'])])
res_list.append(['片吧', '有效', 'http://so.pianbar.com/search.aspx?q=' + quote(movie['title'])])
res_list.append(['片源网', '有效', 'http://pianyuan.net/search?q=' + quote(movie['title'])])
res_list.append(['飘花资源网', '有效', 'https://www.piaohua.com/plus/search.php?kwtype=0&keyword=' + quote(movie['title'])])
res_list.append(['趣味源', '有效', 'http://quweiyuan.cc/?s=' + quote(movie['title'])])
res_list.append(['人生05', '有效', 'http://www.rs05.com/search.php?s=' + quote(movie['title'])])
res_list.append(['贪玩影视', '有效', 'http://www.tanwanyingshi.com/movie/search?keyword=' + quote(movie['title'])])
res_list.append(['新片网', '有效', 'http://www.91xinpian.com/index.php?m=vod-search&wd=' + quote(movie['title'])])
res_list.append(['迅雷影天堂', '有效', 'https://www.xl720.com/?s=' + quote(movie['title'])])
res_list.append(['迅影网', '有效', 'http://www.xunyingwang.com/search?q=' + quote(movie['title'])])
res_list.append(['一只大榴莲', '有效', 'http://www.llduang.com/?s=' + quote(movie['title'])])
res_list.append(['音范丝', '有效', 'http://www.yinfans.com/?s=' + quote(movie['title'])])
res_list.append(['影海', '有效', 'http://www.yinghub.com/search/list.html?keyword=' + quote(movie['title'])])
res_list.append(['影视看看', '有效', 'http://www.yskk.tv/index.php?m=vod-search&wd=' + quote(movie['title'])])
res_list.append(['云播网', '有效', 'http://www.yunbowang.cn/index.php?m=vod-search&wd=' + quote(movie['title'])])
res_list.append(['中国高清网', '有效', 'http://gaoqing.la/?s=' + quote(movie['title'])])
res_list.append(['最新影视站', '有效', 'http://www.zxysz.com/?s=' + quote(movie['title'])])
self.add_tree(res_list, self.treeview_bt_download)
imdb_num = get_mid_str(response, 'IMDb:</span>', '<br>').strip()
imdb_url = "https://www.imdb.com/title/{}/".format(imdb_num)
print("电影名:{}, IMDb:{}".format(movie['title'], imdb_num))
break;
f = urllib.request.urlopen(imdb_url)
data_imdb = f.read().decode()
rating_imdb = get_mid_str(data_imdb, '{"@type":"AggregateRating"', '}')
rating_imdb = rating_imdb.split(":")[-1]
self.label_movie_rating_imdb.config(text='IMDB评分:' + rating_imdb + '')
self.label_movie_rating_imdb.config(text='IMDB评分:' + rating_imdb + '')
self.B_0_imdb['state'] = NORMAL
def project_statement_show(self, event):
open("https://github.com/shengqiangzhang/examples-of-web-crawlers")
@ -929,7 +914,7 @@ class uiObject:
#项目的一些信息
ft = font.Font(size=14, weight=font.BOLD)
project_statement = Label(root, text="豆瓣电影小助手(可筛选、下载自定义电影)", fg='#FF0000', font=ft,anchor=NW)
project_statement = Label(root, text="1.鼠标双击可打开相应的链接, 2.点击初始化按钮后将显示完整信息", fg='#FF0000', font=ft,anchor=NW)
project_statement.place(x=5, y=540)
self.project_statement = project_statement

View File

@ -223,24 +223,26 @@ from contextlib import closing
## [6.爬取豆瓣排行榜电影数据(含GUI界面版)][getMovieInRankingList]
### 简介
### 项目简介
这个项目源于大三某课程设计。平常经常需要搜索一些电影，但是不知道哪些电影评分高且评价人数多。为了方便使用，就将原来的项目重新改写了，也当做是对爬虫技术、可视化技术的一次实践。主要通过排行榜和影片关键词两种方式爬取电影数据。
### 使用教程
1. 打开[http://chromedriver.storage.googleapis.com/index.html][9]，根据自己的操作系统下载对应的chromedriver
2. 打开当前目录下的**<u>getMovieInRankingList.py</u>**，定位到第59行，将`executable_path=/Users/bird/Desktop/chromedriver.exe`修改成你自己的chromedriver路径
3. 打开pycharm，依次安装以下包：
- pip install Pillow
- pip install selenium
### 演示图片
### 功能截图
![](6.爬取豆瓣排行榜电影数据(含GUI界面版)/example_rating.png)
![](6.爬取豆瓣排行榜电影数据(含GUI界面版)/example_keyword.png)
### 包含功能
## 如何运行
1. 打开Chrome浏览器，在网址栏输入`chrome://version/`，查询当前Chrome版本
2. 打开[http://chromedriver.storage.googleapis.com/index.html][1]，下载对应版本的chromedriver驱动，**下载完成后务必解压**
3. 打开当前目录下的文件`getMovieInRankingList.py`,定位到第`107行`,将`executable_path=./chromedriver.exe`修改为你的chromedriver驱动路径
4. 执行命令`pip install -r requirement.txt`安装程序所需的依赖包
5. 执行命令`python main.py`运行程序
## 包含功能
- [x] 根据关键字搜索电影
- [x] 根据排行榜(TOP250)搜索电影
@ -251,13 +253,6 @@ from contextlib import closing
- [ ] 等待更新
### 存在问题
目前没有加入反爬虫策略，如果运行出现403 forbidden提示，则说明暂时被禁止，解决方式如下：
- 加入cookies
- 采用随机延时方式
- 采用IP代理池方式(较不稳定)