Python Weibo Data Scraping: Code Share (Part 2)

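Part two of the Weibo scraping series. The script below searches Weibo by keyword, extracts each post's text, author, MID, and UID from the search-result pages, then collects comments through two endpoints (the inline Ajax_Comment preview on s.weibo.com and the ajax/statuses/buildComments API on weibo.com), appending every row to a CSV file. Paste in a logged-in cookie when prompted.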
# encoding:utf-8
import requests
import httpx
import json
from bs4 import BeautifulSoup
import time
import csv
import random

session = requests.Session()  # reuse one Session so connections and cookies persist across requests

cookitext = input('Enter your Weibo COOKIE: ').replace('\n', '')

headers = {
    'accept': 'application/json, text/plain, */*',
    'accept-encoding': 'gzip, deflate, br',
    'accept-language': 'zh-CN,zh;q=0.9',
    'cookie': cookitext,
    'referer': 'https://weibo.com/2750621294/KAf1AFVPD',
    'sec-ch-ua': '"Chromium";v="94", "Google Chrome";v="94", ";Not A Brand";v="99"',
    'sec-ch-ua-mobile': '?0',
    'sec-ch-ua-platform': '"Windows"',
    'sec-fetch-dest': 'empty',
    'sec-fetch-mode': 'cors',
    'sec-fetch-site': 'same-origin',
    'traceparent': '00-5b5f81f871c6ff6846bf3a92f1d5efed-1ab32a39ad75711a-00',
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.81 Safari/537.36',
    'x-requested-with': 'XMLHttpRequest',
    'x-xsrf-token': '7yIZGS_IPx7EteZ6TT86YYAZ',
}
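# NOTE: the header values above (x-xsrf-token, traceparent, sec-ch-ua, the referer)
# were captured from one particular browser session; if requests start failing,
# replace them with fresh values copied from your own logged-in session.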


def getWeiboCommentinfo(url):
    """
    主要是获取微博的信息,内容以及这个微博 MID UID,
    :param url:
    :return:
    """
    headers = {
        'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
        'accept-encoding': 'gzip, deflate, br',
        'accept-language': 'zh-CN,zh;q=0.9',
        'cache-control': 'max-age=0',
        'cookie': cookitext,
        'referer': 'https://www.baidu.com/link?url=79KIn7lPAsM1SqpiE6ub8unuDW2xwxX-4CyvQvA8HLS&wd=&eqid=dfbc01160004d2dc00000004619e5581',
        'sec-ch-ua': '"Chromium";v="94", "Google Chrome";v="94", ";Not A Brand";v="99"',
        'sec-ch-ua-mobile': '?0',
        'sec-ch-ua-platform': '"Windows"',
        'sec-fetch-dest': 'document',
        'sec-fetch-mode': 'navigate',
        'sec-fetch-site': 'same-origin',
        'sec-fetch-user': '?1',
        'upgrade-insecure-requests': '1',
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.81 Safari/537.36',
    }

    response = session.get(url, headers=headers)
    html = BeautifulSoup(response.content, 'lxml')
    cards = html.find_all('div', class_="card-wrap")  # note: BeautifulSoup needs the trailing underscore in class_
    for ct in cards:
        user_info = ct.find_all('a', class_="name")
        if user_info:
            try:
                mid = ct['mid']  # the post's MID
            except KeyError:
                continue  # skip cards without a mid (ads, topic blocks, etc.)

            user_name = user_info[0].text  # author name
            uid = str(ct.find('div', class_="avator").find('a')['href']).split('/')[-1].split("?")[0]  # author UID ("avator" is the page's own class name)
            user_index = "https:" + user_info[0]['href']  # author profile URL
            user_from = str(ct.find('p', class_="from").text).replace(' ', '').replace('\n', '')  # post time and client name
            weibo_content = str(ct.find('p', class_="txt").text).replace(' ', '').replace('\n', '')  # post text

            data = [weibo_content, user_name, user_from, user_index, mid, uid]

            max_id = 0
            htmlComment(data)
            getCommentLevel1(data, max_id)


def getCommentLevel1(data, max_id):
    """
    一级评论

    :return:
    """
    mid = data[-2]
    uid = data[-1]

    url = "https://weibo.com/ajax/statuses/buildComments?"

    par = {
        'id': mid,
        'is_show_bulletin': '2',
        'is_mix': '0',
        'max_id': max_id,
        'count': '20',
        'uid': uid,
    }
    client = httpx.Client(http2=True, verify=False)  # the endpoint is served over HTTP/2; requires httpx[http2]
    while True:
        response = client.get(url, params=par, headers=headers)
        jsondata = json.loads(response.text)
        max_id = jsondata['max_id']  # cursor for the next page; 0 means this is the last page
        for ct in jsondata['data']:
            created_at = ct['created_at']  # raw comment time, e.g. 'Tue Nov 30 10:00:00 +0800 2021'
            struct_time = time.strptime(created_at, '%a %b %d %H:%M:%S %z %Y')
            time_array = time.strftime("%Y-%m-%d %H:%M:%S", struct_time)  # normalized comment time
            text = ct['text_raw']  # comment text
            screen_name = ct['user']['screen_name']  # commenter name
            weibo_comment_data = data + [text, time_array, screen_name]
            saveCsv("微博信息_评论", weibo_comment_data)
        if max_id == 0:
            break
        par['max_id'] = max_id  # advance the cursor instead of recursing
    client.close()


def htmlComment(data):
    """
    Fetch the inline comment preview on the search-result page via the
    s.weibo.com Ajax_Comment endpoint.
    """
    mid = data[-2]
    uid = data[-1]
    url = 'https://s.weibo.com/Ajax_Comment/small?'
    par = {
        'act': 'list',
        'mid': mid,
        'uid': uid,
        'smartFlag': 'false',
        'smartCardComment': '',
        'isMain': 'true',
        'pageid': 'weibo',
        '_t': '0',
    }
    client = httpx.Client(http2=True, verify=False)
    response = client.get(url, params=par, headers=headers)
    client.close()
    jsondata = json.loads(response.text)['data']['html']  # the endpoint returns rendered HTML inside JSON
    html = BeautifulSoup(jsondata, 'lxml')
    comment_content = html.find_all('div', class_="content")
    for cc in comment_content:
        # each entry reads "user:comment"; split on the first fullwidth colon only,
        # so colons inside the comment text survive
        comment_info = str(cc.find('div', class_='txt').text).replace('\n', '').replace(' ', '').split(':', 1)
        comment_user = comment_info[0]
        comment_text = comment_info[-1]
        comment_time = cc.find('p', class_="from").text
        weibo_comment_data = data + [comment_text, comment_time, comment_user]
        saveCsv("微博信息_评论", weibo_comment_data)


def runx():
    keytext = input('Enter a search keyword: ')
    for x in range(1, 51):  # Weibo search results are capped at 50 pages
        url = f"https://s.weibo.com/weibo?q={keytext}&page={x}"

        t = random.randint(2, 5)  # random delay between pages to avoid rate limiting

        print(f"Fetching next page in {t} seconds")
        time.sleep(t)
        getWeiboCommentinfo(url)



def saveCsv(filename, content):
    # append one row; utf-8-sig writes a BOM so Excel opens the file with correct encoding
    with open(f"{filename}.csv", 'a+', encoding='utf-8-sig', newline='') as fp:
        csv.writer(fp).writerow(content)
    print(f"Row written: {content}")


if __name__ == '__main__':
    runx()
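The CSV is written without a header row. Based on how `data` is assembled and the three comment fields appended to it, the columns line up as in the minimal sketch below for reading the output back (the column labels here are my own, not part of the original script):

import csv

# Hypothetical labels matching the row layout: data = [weibo_content, user_name,
# user_from, user_index, mid, uid] plus [comment_text, comment_time, comment_user].
columns = ['weibo_content', 'user_name', 'user_from', 'user_index',
           'mid', 'uid', 'comment_text', 'comment_time', 'comment_user']

with open('微博信息_评论.csv', encoding='utf-8-sig', newline='') as fp:
    for row in csv.DictReader(fp, fieldnames=columns):
        print(row['user_name'], row['comment_text'])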

If this helped you, remember to bookmark it!
