1. Scraping the Maoyan Movies TOP100 List with the requests Library and Regular Expressions
(1) The Maoyan movie board spans 10 pages, paginated by an offset query parameter:
maoyan.com/board/4?offset=0
maoyan.com/board/4?offset=10
…
maoyan.com/board/4?offset=90
import re
import json
import time
import requests
from requests.exceptions import RequestException
#from fake_useragent import UserAgent
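Since each page simply advances the offset parameter by 10, all ten URLs can be enumerated up front. A minimal sketch (the list comprehension is my own; the URL pattern is taken from the code below):

# The ten TOP100 pages: offset = 0, 10, ..., 90
urls = [f'http://maoyan.com/board/4?offset={i * 10}' for i in range(10)]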
(2) Define a get_one_page(url) method that fetches the source code of the given page.
def get_one_page(url):
    """
    Send the request and return the response body.
    :param url: page URL to fetch
    :return: HTML text on success, None otherwise
    """
    try:
        headers = {
            'User-Agent': 'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)'
        }
        response = requests.get(url, timeout=30, headers=headers)
        if response.status_code == 200:
            return response.text
        return None
    except RequestException:
        return None
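A quick sanity check of the fetcher (the two print lines are illustrative, not part of the article's code):

html = get_one_page('http://maoyan.com/board/4?offset=0')
print(html is not None)  # True if the request succeeded
print(len(html or ''))   # rough size of the returned page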
(3) Define a parse_one_page(html) method that parses the source code and extracts each movie entry.
def parse_one_page(html):
    """
    Extract the movie entries from the response with a regular expression
    and turn them into structured data.
    :param html: page source
    :return: generator of movie dicts
    """
    pattern = re.compile(
        '<dd>'
        '.*?board-index.*?>(.*?)</i>'      # ranking index
        '.*?src="(.*?)"'                   # poster image URL
        '.*?name.*?a.*?>(.*?)</a>'         # title
        '.*?star.*?>(.*?)</p>'             # leading actors
        '.*?releasetime.*?>(.*?)</p>'      # release time
        '.*?integer.*?>(.*?)</i>'          # score, integer part
        '.*?fraction.*?>(.*?)</i>'         # score, fractional part
        '.*?</dd>', re.S)
    items = re.findall(pattern, str(html))
    for item in items:
        yield {
            'index': item[0],
            'image': item[1],
            'title': item[2].strip(),
            # drop the 3-character "主演:" prefix
            'actor': item[3].strip()[3:] if len(item[3]) > 3 else '',
            # drop the 5-character "上映时间:" prefix
            'time': item[4].strip()[5:] if len(item[4]) > 5 else '',
            # join integer and fractional parts, e.g. '9.' + '5' -> '9.5'
            'score': item[5].strip() + item[6].strip()
        }
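To see how the capture groups map onto the yielded fields, run the parser against a minimal fragment modeled on Maoyan's markup (the sample HTML below is illustrative, not copied from the live page):

sample = '''<dd>
<i class="board-index board-index-1">1</i>
<img src="http://example.com/poster.jpg" alt="">
<p class="name"><a href="/films/1203">霸王别姬</a></p>
<p class="star">主演:张国荣,张丰毅,巩俐</p>
<p class="releasetime">上映时间:1993-01-01</p>
<p class="score"><i class="integer">9.</i><i class="fraction">5</i></p>
</dd>'''

for movie in parse_one_page(sample):
    print(movie)
# {'index': '1', 'image': 'http://example.com/poster.jpg', 'title': '霸王别姬',
#  'actor': '张国荣,张丰毅,巩俐', 'time': '1993-01-01', 'score': '9.5'}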
(4) Define a write_to_file(content) method that writes each movie entry to a text file.
def write_to_file(content):
    """
    Store the data: serialize each dict with json.dumps()
    and append it to a text file.
    :param content: one movie dict
    :return:
    """
    with open('result.txt', 'a', encoding='utf-8') as f:
        f.write(json.dumps(content, ensure_ascii=False) + ',\n')
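Because every line ends with a trailing comma, result.txt as a whole is not valid JSON. A minimal sketch for loading the entries back (the helper name read_results is my own):

import json

def read_results(path='result.txt'):
    # Each line holds one JSON object followed by ',': strip it before parsing
    with open(path, encoding='utf-8') as f:
        return [json.loads(line.rstrip().rstrip(',')) for line in f if line.strip()]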
(5) Define a main(offset) method that ties all the methods together.
def main(offset):
    """
    Crawl the ten pages of the TOP100 list by varying
    the offset parameter in the URL.
    :param offset: pagination offset (0, 10, ..., 90)
    :return:
    """
    url = "http://maoyan.com/board/4?offset=" + str(offset)
    html = get_one_page(url)
    for item in parse_one_page(html):
        print(item)
        write_to_file(item)
(6) Use a for loop to iterate over all the page URLs.
if __name__ == '__main__':
    for i in range(10):      # 10 pages in total, offset 0 through 90
        main(offset=i * 10)
        time.sleep(5)        # pause between pages to avoid being blocked
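A fixed five-second pause works, but a randomized delay makes the request pattern look less mechanical. A small variation on the loop above, using the standard random module (this tweak is my own suggestion, not part of the original script):

import random

if __name__ == '__main__':
    for i in range(10):
        main(offset=i * 10)
        time.sleep(random.uniform(3, 6))  # sleep 3-6 seconds between pages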
2. A Variant Using xlwings to Write the Results to an Excel File
The script below repeats the crawl, but collects all rows into a list and writes them to an Excel sheet in one step.
import re
import time
import requests
from requests.exceptions import RequestException
import xlwings as xw
#from fake_useragent import UserAgent
def getHTML(url):
    try:
        headers = {
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36'
        }
        response = requests.get(url, timeout=30, headers=headers)
        response.encoding = response.apparent_encoding
        if response.status_code == 200:
            return response.text
        return None
    except RequestException:
        return None
def findMaoyan(html):
    global mlist
    pattern = re.compile(
        '<dd>'
        '.*?board-index.*?>(.*?)</i>'      # ranking index
        '.*?src="(.*?)"'                   # poster image URL
        '.*?name.*?a.*?>(.*?)</a>'         # title
        '.*?star.*?>(.*?)</p>'             # leading actors
        '.*?releasetime.*?>(.*?)</p>'      # release time
        '.*?integer.*?>(.*?)</i>'          # score, integer part
        '.*?fraction.*?>(.*?)</i>'         # score, fractional part
        '.*?</dd>', re.S)
    items = re.findall(pattern, str(html))
    for item in items:
        mlist.append([item[0],                                          # index
                      item[1],                                          # image
                      item[2].strip(),                                  # title
                      item[3].strip()[3:] if len(item[3]) > 3 else '',  # actors
                      item[4].strip()[5:] if len(item[4]) > 5 else '',  # release time
                      item[5].strip() + item[6].strip()])               # score
    #print(mlist)
    return mlist

def main():
    global mlist
    mlist = [['index', 'image', 'title', 'actor', 'time', 'score']]  # header row
    for i in range(10):
        url = "http://maoyan.com/board/4?offset=" + str(i * 10)
        html = getHTML(url)
        findMaoyan(html)
        time.sleep(1)
    # Write the collected rows into an Excel sheet
    wb = xw.Book()
    sht = wb.sheets['Sheet1']
    sht.range('a1').value = mlist  # write all rows starting at cell A1

if __name__ == '__main__':
    main()
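Note that xw.Book() opens the workbook in a live Excel instance and nothing is saved to disk. If you want to persist the result, xlwings can save and close the book afterwards; a minimal sketch (the output file name is an example, not from the original script):

import xlwings as xw

wb = xw.books.active           # the workbook that main() created
wb.save('maoyan_top100.xlsx')  # example output path
wb.app.quit()                  # close the Excel instance when done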