本文已参与「新人创作礼」活动,一起开启创作之路

1. 运用 requests 库和正则表达式抓取猫眼电影 TOP100 的相关内容

(1)猫眼电影网址如下,共10页。

Python 抓取猫眼电影TOP100数据
maoyan.com/board/4?offset=0 maoyan.com/board/4?offset=10 … maoyan.com/board/4?offset=90

import re
import json
import time
import requests
from requests.exceptions import RequestException
#from fake_useragent import UserAgent

(2)定义get_one_page(url)方法,获取指定网页的源代码。

def get_one_page(url):
    """Send a GET request and return the page source.

    :param url: address of the page to download.
    :return: the response body as text when the server answers with
        HTTP 200; ``None`` for any other status code or when the request
        raises a ``RequestException``.
    """
    headers = {
        # Well-formed browser identification: the parenthesised platform
        # section must be closed, otherwise the header value is truncated.
        'User-Agent': 'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)'
    }
    try:
        response = requests.get(url, timeout=30, headers=headers)
        if response.status_code == 200:
            return response.text
        return None
    except RequestException:
        # Best effort: any requests-level failure is reported as "no page".
        return None

(3)定义parse_one_page(html)方法,解析源代码,获取每条电影信息。

Python 抓取猫眼电影TOP100数据

def parse_one_page(html):
    """Extract the movie entries from one page of the board.

    Every ``<dd>...</dd>`` element is matched with a regular expression and
    converted into a dictionary.

    :param html: page source; it is passed through ``str()`` before
        matching, so ``None`` simply produces no items.
    :return: generator of dicts with the keys ``index``, ``image``,
        ``title``, ``actor``, ``time`` and ``score`` (all strings).
    """
    pattern = re.compile(
        r'<dd>'
        r'.*?board-index.*?>(.*?)</i>'    # ranking index
        # First src="..." (this also matches data-src="...") after the index.
        # No URL prefix may be hard-coded here, otherwise no <dd> matches.
        r'.*?src="(.*?)"'                 # image URL
        r'.*?name.*?a.*?>(.*?)</a>'       # title
        r'.*?star.*?>(.*?)</p>'           # cast line
        r'.*?releasetime.*?>(.*?)</p>'    # release-time line
        r'.*?integer.*?>(.*?)</i>'        # score, integer part
        r'.*?fraction.*?>(.*?)</i>'       # score, fractional part
        r'.*?</dd>', re.S)                # re.S: an entry spans several lines
    for item in pattern.findall(str(html)):
        yield {
            'index': item[0],
            'image': item[1],
            'title': item[2].strip(),
            # Drops the first 3 characters (presumably the cast label).
            'actor': item[3].strip()[3:] if len(item[3]) > 3 else '',
            # Drops the first 5 characters (presumably the release-time label).
            'time': item[4].strip()[5:] if len(item[4]) > 5 else '',
            'score': item[5].strip() + item[6].strip()
        }

(4)定义write_to_file(content)方法,将电影信息写入文本文件result.txt中。

def write_to_file(content):
    """Append one record to ``result.txt`` as a JSON line.

    The record is serialised with ``json.dumps`` (non-ASCII characters are
    kept as-is) and written followed by a comma and a newline.

    :param content: JSON-serialisable object, e.g. one movie dict.
    :return: None
    """
    with open('result.txt', mode='a', encoding='utf-8') as out:
        serialized = json.dumps(content, ensure_ascii=False)
        out.write(serialized + ',\n')

(5)定义main(offset)方法,整合所有方法。

def main(offset):
    """Scrape one page of the board and store every movie found on it.

    The page URL is built by appending ``offset`` to the board address; each
    parsed movie is printed and handed to ``write_to_file``.

    :param offset: value of the ``offset`` query parameter.
    :return: None
    """
    page_url = "http://maoyan.com/board/4?offset=" + str(offset)
    page_source = get_one_page(page_url)
    movies = parse_one_page(page_source)
    for movie in movies:
        print(movie)
        write_to_file(movie)

(6)使用for循环遍历所有网址。

if __name__ == '__main__':
    # The TOP100 board has 10 pages of 10 movies: offsets 0, 10, ..., 90.
    for i in range(10):
        main(offset=i * 10)
        # Pause between pages to avoid hammering the server.
        time.sleep(5)
import re
import json
import time
import requests
from requests.exceptions import RequestException
#from fake_useragent import UserAgent
def get_one_page(url):
    """Send a GET request and return the page source.

    :param url: address of the page to download.
    :return: the response body as text when the server answers with
        HTTP 200; ``None`` for any other status code or when the request
        raises a ``RequestException``.
    """
    headers = {
        # Well-formed browser identification: the parenthesised platform
        # section must be closed, otherwise the header value is truncated.
        'User-Agent': 'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)'
    }
    try:
        response = requests.get(url, timeout=30, headers=headers)
        if response.status_code == 200:
            return response.text
        return None
    except RequestException:
        # Best effort: any requests-level failure is reported as "no page".
        return None
def parse_one_page(html):
    """Extract the movie entries from one page of the board.

    Every ``<dd>...</dd>`` element is matched with a regular expression and
    converted into a dictionary.

    :param html: page source; it is passed through ``str()`` before
        matching, so ``None`` simply produces no items.
    :return: generator of dicts with the keys ``index``, ``image``,
        ``title``, ``actor``, ``time`` and ``score`` (all strings).
    """
    # Capture groups, in order: index, image URL, title, cast line,
    # release-time line, integer part of the score, fractional part.
    # The image group takes the first src="..." (this also matches
    # data-src="...") after the index; no URL prefix may be hard-coded
    # there, otherwise no <dd> matches. re.S lets '.' cross line breaks.
    pattern = re.compile(r'<dd>.*?board-index.*?>(.*?)'
                         r'</i>.*?src="(.*?)".*?name.*?a.*?>(.*?)'
                         r'</a>.*?star.*?>(.*?)'
                         r'</p>.*?releasetime.*?>(.*?)'
                         r'</p>.*?integer.*?>(.*?)'
                         r'</i>.*?fraction.*?>(.*?)'
                         r'</i>.*?</dd>', re.S)
    for item in pattern.findall(str(html)):
        yield {
            'index': item[0],
            'image': item[1],
            'title': item[2].strip(),
            # Drops the first 3 characters (presumably the cast label).
            'actor': item[3].strip()[3:] if len(item[3]) > 3 else '',
            # Drops the first 5 characters (presumably the release-time label).
            'time': item[4].strip()[5:] if len(item[4]) > 5 else '',
            'score': item[5].strip() + item[6].strip()
        }
def write_to_file(content):
    """Append one record to ``result.txt`` as a JSON line.

    The record is serialised with ``json.dumps`` (non-ASCII characters are
    kept as-is) and written followed by a comma and a newline.

    :param content: JSON-serialisable object, e.g. one movie dict.
    :return: None
    """
    with open('result.txt', mode='a', encoding='utf-8') as out:
        serialized = json.dumps(content, ensure_ascii=False)
        out.write(serialized + ',\n')
def main(offset):
    """Scrape one page of the board and store every movie found on it.

    The page URL is built by appending ``offset`` to the board address; each
    parsed movie is printed and handed to ``write_to_file``.

    :param offset: value of the ``offset`` query parameter.
    :return: None
    """
    page_url = "http://maoyan.com/board/4?offset=" + str(offset)
    page_source = get_one_page(page_url)
    movies = parse_one_page(page_source)
    for movie in movies:
        print(movie)
        write_to_file(movie)
if __name__ == '__main__':
    # The TOP100 board has 10 pages of 10 movies: offsets 0, 10, ..., 90.
    for i in range(10):
        main(offset=i * 10)
        # Pause between pages to avoid hammering the server.
        time.sleep(5)
import re
import time
import requests
from requests.exceptions import RequestException
import xlwings as xw
#from fake_useragent import UserAgent
def getHTML(url):
    """Download a page and return its text.

    The response encoding is set from ``apparent_encoding`` before the text
    is read.

    :param url: address of the page to download.
    :return: the page text for an HTTP 200 response; ``None`` for any other
        status code or when a ``RequestException`` is raised.
    """
    request_headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36'
    }
    try:
        resp = requests.get(url, timeout=30, headers=request_headers)
        resp.encoding = resp.apparent_encoding
        if resp.status_code != 200:
            return None
        return resp.text
    except RequestException:
        return None
def findMaoyan(html):
    """Parse one board page and append its movies to the global ``mlist``.

    Each ``<dd>...</dd>`` entry becomes one row of the form
    ``[index, image, title, cast, release time, score]``.

    :param html: page source; it is passed through ``str()`` before
        matching, so ``None`` adds no rows.
    :return: the global ``mlist`` after the rows have been appended.
    """
    global mlist
    pattern = re.compile(
        r'<dd>'
        r'.*?board-index.*?>(.*?)</i>'    # ranking index
        # First src="..." (this also matches data-src="...") after the index.
        # No URL prefix may be hard-coded here, otherwise no <dd> matches.
        r'.*?src="(.*?)"'                 # image URL
        r'.*?name.*?a.*?>(.*?)</a>'       # title
        r'.*?star.*?>(.*?)</p>'           # cast line
        r'.*?releasetime.*?>(.*?)</p>'    # release-time line
        r'.*?integer.*?>(.*?)</i>'        # score, integer part
        r'.*?fraction.*?>(.*?)</i>'       # score, fractional part
        r'.*?</dd>', re.S)                # re.S: an entry spans several lines
    for item in pattern.findall(str(html)):
        mlist.append([
            item[0],                                               # index
            item[1],                                               # image
            item[2].strip(),                                       # title
            # Drops the first 3 characters (presumably the cast label).
            item[3].strip()[3:] if len(item[3]) > 3 else '',
            # Drops the first 5 characters (presumably the release-time label).
            item[4].strip()[5:] if len(item[4]) > 5 else '',
            item[5].strip() + item[6].strip(),                     # score
        ])
    return mlist
def main():
    """Scrape the ten board pages and put the collected rows into Excel.

    The global ``mlist`` is reset to a single header row, filled by
    ``findMaoyan`` for offsets 0, 10, ..., 90 (with a one-second pause after
    each page), and finally assigned to a range starting at cell ``a1`` of
    ``Sheet1`` in a new xlwings workbook.
    """
    global mlist
    header = ['index', 'image', 'title', '主演', '上映时刻', '评分']
    mlist = [header]
    for offset in range(0, 100, 10):
        page_html = getHTML("http://maoyan.com/board/4?offset=" + str(offset))
        findMaoyan(page_html)
        time.sleep(1)
    # Write the collected rows to an Excel workbook.
    workbook = xw.Book()
    sheet = workbook.sheets('Sheet1')
    sheet.range('a1').value = mlist  # header row followed by the movie rows
# Script entry point: run the scraper only when executed directly.
if __name__ == "__main__":
    main()

Python 抓取猫眼电影TOP100数据
Python 抓取猫眼电影TOP100数据