import requests
import pandas as pd
from bs4 import BeautifulSoup
import json
import re
# JSON endpoint for the "豆瓣高分" (Douban high-score) tag, sorted by rank, 500 items per request
list_url = 'https://movie.douban.com/j/search_subjects?type=movie&tag=%E8%B1%86%E7%93%A3%E9%AB%98%E5%88%86&sort=rank&page_limit=500&page_start=0'
# Sample detail page kept for ad-hoc testing of get_movie_info; not used by the pipeline below
url = 'https://movie.douban.com/subject/1292052/?tag=%E8%B1%86%E7%93%A3%E9%AB%98%E5%88%86&from=gaia_video'
def get_movie_list(url):
    # Fetch the tag listing as JSON and flatten the 'subjects' array into a DataFrame
    res = requests.get(url)
    df = pd.DataFrame(json.loads(res.text)['subjects'])
    return df
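# Usage sketch (assumption: each entry in 'subjects' carries at least an 'id' field,
# which is all the rest of the script relies on):
#
#     movies = get_movie_list(list_url)
#     print(len(movies), movies.columns.tolist())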
def get_movie_info(url):
    # Browser-like headers; without a User-Agent Douban tends to reject the request
    header = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.75 Safari/537.36',
        'Connection': 'keep-alive',
        'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3',
        'Referer': 'https://www.douban.com/'
    }
    res = requests.get(url=url, headers=header)
    soup = BeautifulSoup(res.text, 'html.parser')
    content = soup.select('#info span')

    movie_info = {}
    movie_info['director'] = soup.select('#info [rel="v:directedBy"]')[0].text
    # The scriptwriter has no dedicated attribute, so it is taken by position
    # (6th <span> inside #info); this is fragile if Douban changes the page layout
    movie_info['scriptwriter'] = content[5].text.strip().split('/')

    movie_info['actor'] = []
    for i in soup.select('#info [rel="v:starring"]'):
        movie_info['actor'].append(i.text)

    movie_info['type'] = []
    for i in soup.select('#info [property="v:genre"]'):
        movie_info['type'].append(i.text)

    movie_info['runtime'] = soup.select('#info [property="v:runtime"]')[0].text
    # '无' ("none") is the placeholder written when a field is missing from the page
    movie_info['launch_time'] = soup.select('#info [property="v:initialReleaseDate"]')[0].text if soup.select('#info [property="v:initialReleaseDate"]') else '无'

    # Country and language have no machine-readable markup, so pull them out of the raw HTML
    country_pattern = re.compile(r'<span class="pl">制片国家/地区:</span>(.*)<br/>')
    movie_info['country'] = re.findall(country_pattern, str(soup))[0].strip()
    language_pattern = re.compile(r'<span class="pl">语言:</span>(.*)<br/>')
    movie_info['language'] = re.findall(language_pattern, str(soup))[0].strip()

    movie_info['summary'] = soup.select('[property="v:summary"]')[0].text.strip() if soup.select('[property="v:summary"]') else '无'
    return movie_info
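# Optional sanity check (a sketch, not part of the pipeline): calling get_movie_info on
# the sample detail page above should return one dict with the keys extracted here.
#
#     sample = get_movie_info(url)
#     print(sample['director'], sample['type'], sample['runtime'])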
def merge_movie_info(df):
    data_list = []
    count = 1
    for i in df['id']:
        data = {}
        url = 'https://movie.douban.com/subject/' + str(i)
        info = get_movie_info(url)
        data['id'] = i
        data['director'] = info['director']
        data['scriptwriter'] = info['scriptwriter']
        data['actor'] = info['actor']
        data['type'] = info['type']
        data['runtime'] = info['runtime']
        data['launch_time'] = info['launch_time']
        data['country'] = info['country']
        data['language'] = info['language']
        data['summary'] = info['summary']
        data_list.append(data)
        print('Record ' + str(count) + ' loaded, ' + str(count * 100 / len(df)) + '% of the data downloaded: ' + url)
        count += 1
    # Join the detail fields back onto the listing DataFrame (merges on the shared 'id' column)
    final_data = pd.merge(df, pd.DataFrame(data_list))
    return final_data
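# Note on robustness (not in the original flow): the loop above issues one request per
# movie with no delay and no error handling, so a single malformed page or a temporary
# block from Douban aborts the whole run. A minimal hardened variant is sketched below;
# the retry count, sleep interval and '无' fallbacks are assumptions, not measured values.
#
#     import time
#
#     def safe_get_movie_info(url, retries=2):
#         fields = ['director', 'scriptwriter', 'actor', 'type', 'runtime',
#                   'launch_time', 'country', 'language', 'summary']
#         for attempt in range(retries + 1):
#             try:
#                 return get_movie_info(url)
#             except (IndexError, requests.RequestException):
#                 time.sleep(2)  # back off briefly before retrying
#         return dict.fromkeys(fields, '无')  # keep the row, mark every field as missing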
# Fetch the listing, enrich every movie with its detail-page fields, and dump to CSV
df = get_movie_list(list_url)
info_data = merge_movie_info(df)
info_data_df = pd.DataFrame(info_data)
info_data_df.to_csv('douban_movie_top500.csv')
print('Data loading finished')
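# Reading the CSV back (a sketch; the file name above is the only detail carried over):
# list-valued columns such as 'actor' and 'type' are written out as their Python string
# representation, so they need to be re-parsed, e.g. with ast.literal_eval:
#
#     import ast
#     saved = pd.read_csv('douban_movie_top500.csv', index_col=0)
#     saved['actor'] = saved['actor'].apply(ast.literal_eval)
#     saved['type'] = saved['type'].apply(ast.literal_eval)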